diff --git a/system/system_monitor/CHANGELOG.rst b/system/system_monitor/CHANGELOG.rst new file mode 100644 index 0000000000000..ce6769d12e186 --- /dev/null +++ b/system/system_monitor/CHANGELOG.rst @@ -0,0 +1,8 @@ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Changelog for package system_monitor +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +1.1x.0 (2020-xx-xx) +------------------- +* Initial commit +* Contributors: Fumihito Ito diff --git a/system/system_monitor/CMakeLists.txt b/system/system_monitor/CMakeLists.txt new file mode 100644 index 0000000000000..03ca85213b40b --- /dev/null +++ b/system/system_monitor/CMakeLists.txt @@ -0,0 +1,315 @@ +cmake_minimum_required(VERSION 3.5) +project(system_monitor) + +## Compile as C++14, supported in ROS Melodic and newer +if(NOT CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 14) + set(CMAKE_CXX_STANDARD_REQUIRED ON) + set(CMAKE_CXX_EXTENSIONS OFF) +endif() +if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") + add_compile_options(-Wall -Wextra -Wpedantic -Werror) +endif() + +find_package(ament_cmake_auto REQUIRED) +ament_auto_find_build_dependencies() + +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") +find_package(NVML) # optional; located through cmake/FindNVML.cmake on CMAKE_MODULE_PATH +find_package(fmt REQUIRED) +set(LIBRARIES fmt::fmt) # imported target: carries fmt's include directories and link requirements + +########### +## Build ## +########### + +## Specify additional locations of header files + +find_path(LIBNL3_INCLUDE_DIRS + NAMES netlink/netlink.h + PATH_SUFFIXES libnl3 +) + +if(NVML_FOUND) + include_directories( + include + ${LIBNL3_INCLUDE_DIRS} + ${NVML_INCLUDE_DIRS} + ) +else() + include_directories( + include + ${LIBNL3_INCLUDE_DIRS} + ) +endif() + +## Declare a C++ executable + +if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + set(CMAKE_CPU_PLATFORM "intel") + add_definitions(-D_CPU_INTEL_) +elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "arm") + if(CMAKE_HOST_SYSTEM_VERSION MATCHES ".*raspi.*") + set(CMAKE_CPU_PLATFORM "raspi") + add_definitions(-D_CPU_RASPI_) + 
elseif(CMAKE_HOST_SYSTEM_VERSION MATCHES ".*tegra.*") + set(CMAKE_CPU_PLATFORM "tegra") + add_definitions(-D_CPU_TEGRA_) + else() + set(CMAKE_CPU_PLATFORM "arm") + add_definitions(-D_CPU_ARM_) + endif() +else() + set(CMAKE_CPU_PLATFORM "unknown") # no _CPU_*_ macro is defined for an unrecognized processor +endif() + +if(NVML_FOUND) + set(CMAKE_GPU_PLATFORM "nvml") + add_definitions(-D_GPU_NVML_) + set(GPU_LIBRARY ${NVML_LIBRARIES}) +else() + if(CMAKE_CPU_PLATFORM STREQUAL "tegra") + set(CMAKE_GPU_PLATFORM "tegra") + add_definitions(-D_GPU_TEGRA_) + else() + set(CMAKE_GPU_PLATFORM "unknown") + endif() +endif() + +message(STATUS "HOST_SYSTEM_VERSION: " ${CMAKE_HOST_SYSTEM_VERSION}) +message(STATUS "SYSTEM_PROCESSOR: " ${CMAKE_SYSTEM_PROCESSOR}) +message(STATUS "CPU PLATFORM: " ${CMAKE_CPU_PLATFORM}) +message(STATUS "GPU PLATFORM: " ${CMAKE_GPU_PLATFORM}) + +set(CPU_MONITOR_SOURCE + src/cpu_monitor/cpu_monitor_base.cpp + src/cpu_monitor/${CMAKE_CPU_PLATFORM}_cpu_monitor.cpp +) + +ament_auto_add_library(cpu_monitor_lib SHARED + ${CPU_MONITOR_SOURCE} +) + +ament_auto_add_library(hdd_monitor_lib SHARED + src/hdd_monitor/hdd_monitor.cpp +) + +ament_auto_add_library(mem_monitor_lib SHARED + src/mem_monitor/mem_monitor.cpp +) + +ament_auto_add_library(net_monitor_lib SHARED + src/net_monitor/net_monitor.cpp + src/net_monitor/nl80211.cpp +) + +ament_auto_add_library(ntp_monitor_lib SHARED + src/ntp_monitor/ntp_monitor.cpp +) + +ament_auto_add_library(process_monitor_lib SHARED + src/process_monitor/process_monitor.cpp +) + +set(GPU_MONITOR_SOURCE + src/gpu_monitor/gpu_monitor_base.cpp + src/gpu_monitor/${CMAKE_GPU_PLATFORM}_gpu_monitor.cpp +) +ament_auto_add_library(gpu_monitor_lib SHARED + ${GPU_MONITOR_SOURCE} +) + +ament_auto_add_executable(msr_reader + reader/msr_reader/msr_reader.cpp +) + +ament_auto_add_executable(hdd_reader + reader/hdd_reader/hdd_reader.cpp +) + +find_library(NL3 nl-3 REQUIRED) # NOTE: the REQUIRED keyword is only honored by CMake >= 3.18 +find_library(NLGENL3 nl-genl-3 REQUIRED) +list(APPEND NL_LIBS ${NL3} ${NLGENL3}) + +find_package(Boost REQUIRED COMPONENTS + 
serialization + thread + filesystem + regex +) + +## Specify libraries to link a library or executable target against +target_link_libraries(cpu_monitor_lib ${Boost_LIBRARIES} ${LIBRARIES}) +target_link_libraries(hdd_monitor_lib ${Boost_LIBRARIES} ${LIBRARIES}) +target_link_libraries(mem_monitor_lib ${LIBRARIES}) +target_link_libraries(net_monitor_lib ${NL_LIBS} ${LIBRARIES}) +target_link_libraries(ntp_monitor_lib ${Boost_LIBRARIES} ${LIBRARIES}) +target_link_libraries(process_monitor_lib ${LIBRARIES}) +target_link_libraries(gpu_monitor_lib ${GPU_LIBRARY} ${Boost_LIBRARIES} ${LIBRARIES}) +target_link_libraries(msr_reader ${Boost_LIBRARIES} ${LIBRARIES}) +target_link_libraries(hdd_reader ${Boost_LIBRARIES} ${LIBRARIES}) + +rclcpp_components_register_node(cpu_monitor_lib + PLUGIN "CPUMonitor" + EXECUTABLE cpu_monitor +) + +rclcpp_components_register_node(hdd_monitor_lib + PLUGIN "HDDMonitor" + EXECUTABLE hdd_monitor +) + +rclcpp_components_register_node(mem_monitor_lib + PLUGIN "MemMonitor" + EXECUTABLE mem_monitor +) + +rclcpp_components_register_node(net_monitor_lib + PLUGIN "NetMonitor" + EXECUTABLE net_monitor +) + +rclcpp_components_register_node(ntp_monitor_lib + PLUGIN "NTPMonitor" + EXECUTABLE ntp_monitor +) + +rclcpp_components_register_node(process_monitor_lib + PLUGIN "ProcessMonitor" + EXECUTABLE process_monitor +) + +rclcpp_components_register_node(gpu_monitor_lib + PLUGIN "GPUMonitor" + EXECUTABLE gpu_monitor +) + +# TODO(yunus.caliskan): Port the tests to ROS2, robustify the tests. 
+if(BUILD_TESTING) + find_package(ament_lint_auto REQUIRED) + ament_lint_auto_find_test_dependencies() + + # ament_add_gtest(test_cpu_monitor + # test/src/cpu_monitor/test_${CMAKE_CPU_PLATFORM}_cpu_monitor.cpp + # ${CPU_MONITOR_SOURCE} + # ) + + # ament_target_dependencies(test_cpu_monitor + # "rclcpp" + # "diagnostic_msgs" + # ) + + # target_include_directories(test_cpu_monitor + # PRIVATE "include" + # ) + + # target_link_libraries(test_cpu_monitor ${Boost_LIBRARIES} ${LIBRARIES}) + + # ament_add_gtest(test_hdd_monitor + # test/src/hdd_monitor/test_hdd_monitor.cpp + # src/hdd_monitor/hdd_monitor.cpp + # ) + + # ament_target_dependencies(test_hdd_monitor + # "rclcpp" + # "diagnostic_msgs" + # ) + + # target_include_directories(test_hdd_monitor + # PRIVATE "include" + # ) + + # target_link_libraries(test_hdd_monitor ${Boost_LIBRARIES} ${LIBRARIES} + # ) + + # ament_add_gtest(test_mem_monitor + # test/src/mem_monitor/test_mem_monitor.cpp + # src/mem_monitor/mem_monitor.cpp + # ) + + # ament_target_dependencies(test_mem_monitor + # "rclcpp" + # "diagnostic_msgs" + # ) + + # target_include_directories(test_mem_monitor + # PRIVATE "include" + # ) + + # target_link_libraries(test_mem_monitor ${Boost_LIBRARIES} ${LIBRARIES}) + + # ament_add_gtest(test_net_monitor + # test/src/net_monitor/test_net_monitor.cpp + # src/net_monitor/net_monitor.cpp + # src/net_monitor/nl80211.cpp + # ) + + # ament_target_dependencies(test_net_monitor + # "rclcpp" + # "diagnostic_msgs" + # ) + + # target_include_directories(test_net_monitor + # PRIVATE "include" + # ) + + # target_link_libraries(test_net_monitor ${Boost_LIBRARIES} ${NL_LIBS} ${LIBRARIES}) + + # ament_add_gtest(test_ntp_monitor + # test/src/ntp_monitor/test_ntp_monitor.cpp + # src/ntp_monitor/ntp_monitor.cpp + # ) + + # ament_target_dependencies(test_ntp_monitor + # "rclcpp" + # "diagnostic_msgs" + # ) + + # target_include_directories(test_ntp_monitor + # PRIVATE "include" + # ) + + # target_link_libraries(test_ntp_monitor 
${Boost_LIBRARIES} ${LIBRARIES}) + + # ament_add_gtest(test_process_monitor + # test/src/process_monitor/test_process_monitor.cpp + # src/process_monitor/process_monitor.cpp + # ) + + # ament_target_dependencies(test_process_monitor + # "rclcpp" + # "diagnostic_msgs" + # ) + + # target_include_directories(test_process_monitor + # PRIVATE "include" + # ) + + # target_link_libraries(test_process_monitor ${Boost_LIBRARIES} ${LIBRARIES}) + + # ament_add_gtest(test_gpu_monitor + # test/src/gpu_monitor/test_${CMAKE_GPU_PLATFORM}_gpu_monitor.cpp + # ${GPU_MONITOR_SOURCE} + # ) + + # ament_target_dependencies(test_gpu_monitor + # "rclcpp" + # "diagnostic_msgs" + # ) + + # target_include_directories(test_gpu_monitor + # PRIVATE "include" + # ) + + # target_link_libraries(test_gpu_monitor ${GPU_LIBRARY} ${LIBRARIES}) + +endif() + +############# +## Install ## +############# + +ament_auto_package(INSTALL_TO_SHARE + launch + config +) diff --git a/system/system_monitor/README.md b/system/system_monitor/README.md new file mode 100644 index 0000000000000..8b4dd9873f7ba --- /dev/null +++ b/system/system_monitor/README.md @@ -0,0 +1,179 @@ +# System Monitor for Autoware + +**Further improvement of system monitor functionality for Autoware.** + +## Description + +This package provides the following nodes for monitoring system: + +- CPU Monitor +- HDD Monitor +- Memory Monitor +- Network Monitor +- NTP Monitor +- Process Monitor +- GPU Monitor + +### Supported architecture + +- x86_64 +- arm64v8/aarch64 + +### Operation confirmed platform + +- PC system intel core i7 +- NVIDIA Jetson AGX Xavier +- Raspberry Pi4 Model B + +## How to use + +Use colcon build and launch in the same way as other packages. + +```sh +colcon build +source install/setup.bash +ros2 launch system_monitor system_monitor.launch.xml +``` + +CPU and GPU monitoring method differs depending on platform.
+CMake automatically chooses source to be built according to build environment.
+If you build this package on intel platform, CPU monitor and GPU monitor which run on intel platform are built. + +## ROS topics published by system monitor + +Every topic is published in 1 minute interval. + +- [CPU Monitor](docs/topics_cpu_monitor.md) +- [HDD Monitor](docs/topics_hdd_monitor.md) +- [Mem Monitor](docs/topics_mem_monitor.md) +- [Net Monitor](docs/topics_net_monitor.md) +- [NTP Monitor](docs/topics_ntp_monitor.md) +- [Process Monitor](docs/topics_process_monitor.md) +- [GPU Monitor](docs/topics_gpu_monitor.md) + +[Usage] ✓:Supported, -:Not supported + +| Node | Message | Intel | arm64(tegra) | arm64(raspi) | Notes | +| --------------- | ---------------------- | :---: | :----------: | :----------: | ------------------------------------------------------------- | +| CPU Monitor | CPU Temperature | ✓ | ✓ | ✓ | | +| | CPU Usage | ✓ | ✓ | ✓ | | +| | CPU Load Average | ✓ | ✓ | ✓ | | +| | CPU Thermal Throttling | ✓ | - | ✓ | | +| | CPU Frequency | ✓ | ✓ | ✓ | Notification of frequency only, normally error not generated. | +| HDD Monitor | HDD Temperature | ✓ | ✓ | ✓ | | +| | HDD Usage | ✓ | ✓ | ✓ | | +| Memory Monitor | Memory Usage | ✓ | ✓ | ✓ | | +| Net Monitor | Network Usage | ✓ | ✓ | ✓ | | +| NTP Monitor | NTP Offset | ✓ | ✓ | ✓ | | +| Process Monitor | Tasks Summary | ✓ | ✓ | ✓ | | +| | High-load Proc[0-9] | ✓ | ✓ | ✓ | | +| | High-mem Proc[0-9] | ✓ | ✓ | ✓ | | +| GPU Monitor | GPU Temperature | ✓ | ✓ | - | | +| | GPU Usage | ✓ | ✓ | - | | +| | GPU Memory Usage | ✓ | - | - | | +| | GPU Thermal Throttling | ✓ | - | - | | +| | GPU Frequency | - | ✓ | - | | + +## ROS parameters + +See [ROS parameters](docs/ros_parameters.md). + +## Notes + +### CPU monitor for intel platform + +Thermal throttling event can be monitored by reading contents of MSR(Model Specific Register), and accessing MSR is only allowed for root by default, so this package provides the following approach to minimize security risks as much as possible:
+ +- Provide a small program named 'msr_reader' which accesses MSR and sends thermal throttling status to CPU monitor by using socket programming. +- Run 'msr_reader' as a specific user instead of root. +- CPU monitor is able to know the status as an unprivileged user since thermal throttling status is sent by socket communication. + +### Instructions before starting + +1. Create a user to run 'msr_reader'. + + ```sh + sudo adduser <username> + ``` + +2. Load kernel module 'msr' into your target system.
+ The path '/dev/cpu/CPUNUM/msr' appears. + + ```sh + sudo modprobe msr + ``` + +3. Allow user to access MSR with read-only privilege using the Access Control List (ACL). + + ```sh + sudo setfacl -m u:<username>:r /dev/cpu/*/msr + ``` + +4. Assign capability to 'msr_reader' since msr kernel module requires rawio capability. + + ```sh + sudo setcap cap_sys_rawio=ep install/system_monitor/lib/system_monitor/msr_reader + ``` + +5. Run 'msr_reader' as the user you created, and run system_monitor as a generic user. + + ```sh + su <username> + install/system_monitor/lib/system_monitor/msr_reader + ``` + +### See also + +[msr_reader](docs/msr_reader.md) + +## HDD Monitor + +Generally, S.M.A.R.T. information is used to monitor HDD temperature, and normally accessing disk device node is allowed for root user or disk group.
+As with the CPU monitor, this package provides an approach to minimize security risks as much as possible:
+ +- Provide a small program named 'hdd_reader' which accesses S.M.A.R.T. information and sends HDD temperature to HDD monitor by using socket programming. +- Run 'hdd_reader' as a specific user. +- HDD monitor is able to know HDD temperature as an unprivileged user since HDD temperature is sent by socket communication. + +### Instructions before starting + +1. Create a user to run 'hdd_reader'. + + ```sh + sudo adduser <username> + ``` + +2. Add user to the disk group. + + ```sh + sudo usermod -a -G disk <username> + ``` + +3. Assign capabilities to 'hdd_reader' since SCSI kernel module requires rawio capability to send ATA PASS-THROUGH (12) command and NVMe kernel module requires admin capability to send Admin Command. + + ```sh + sudo setcap 'cap_sys_rawio=ep cap_sys_admin=ep' install/system_monitor/lib/system_monitor/hdd_reader + ``` + +4. Run 'hdd_reader' as the user you created, and run system_monitor as a generic user. + + ```sh + su <username> + install/system_monitor/lib/system_monitor/hdd_reader + ``` + +### See also + +[hdd_reader](docs/hdd_reader.md) + +## GPU Monitor for intel platform + +Currently GPU monitor for intel platform only supports NVIDIA GPU whose information can be accessed by NVML API. + +Also you need to install CUDA libraries. +For installation instructions for CUDA 10.0, see [NVIDIA CUDA Installation Guide for Linux](https://docs.nvidia.com/cuda/archive/10.0/cuda-installation-guide-linux/index.html). + +## UML diagrams + +See [Class diagrams](docs/class_diagrams.md). +See [Sequence diagrams](docs/seq_diagrams.md). diff --git a/system/system_monitor/cmake/FindNVML.cmake b/system/system_monitor/cmake/FindNVML.cmake new file mode 100644 index 0000000000000..0a79009ab98d2 --- /dev/null +++ b/system/system_monitor/cmake/FindNVML.cmake @@ -0,0 +1,34 @@ +# Copyright 2020 Tier IV, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# FindNVML: locate the NVML (NVIDIA Management Library) header and library. +# Result variables set by this module: +# +# NVML_INCLUDE_DIRS - directory that contains nvml.h. +# NVML_LIBRARIES - path of the nvidia-ml library. +# NVML_FOUND - true when both of the above were located. + +if(NOT NVML_INCLUDE_DIRS) + find_path(NVML_INCLUDE_DIRS NAMES nvml.h PATHS /usr/local/cuda/include) +endif() + +if(NOT NVML_LIBRARIES) + find_library(NVML_LIBRARIES nvidia-ml) +endif() + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(NVML REQUIRED_VARS NVML_LIBRARIES NVML_INCLUDE_DIRS) + +message(STATUS "NVML include dir: ${NVML_INCLUDE_DIRS}") +message(STATUS "NVML library : ${NVML_LIBRARIES}") diff --git a/system/system_monitor/config/cpu_monitor.param.yaml b/system/system_monitor/config/cpu_monitor.param.yaml new file mode 100644 index 0000000000000..e9f91cb1708e4 --- /dev/null +++ b/system/system_monitor/config/cpu_monitor.param.yaml @@ -0,0 +1,8 @@ +/**: + ros__parameters: + temp_warn: 90.0 + temp_error: 95.0 + usage_warn: 0.90 + usage_error: 1.00 + usage_avg: true + msr_reader_port: 7634 diff --git a/system/system_monitor/config/gpu_monitor.param.yaml b/system/system_monitor/config/gpu_monitor.param.yaml new file mode 100644 index 0000000000000..d96b9f24640b2 --- /dev/null +++ b/system/system_monitor/config/gpu_monitor.param.yaml @@ -0,0 +1,8 @@ +/**: + ros__parameters: + temp_warn: 90.0 + temp_error: 95.0 + gpu_usage_warn: 0.90 + gpu_usage_error: 1.00 + memory_usage_warn: 0.95 + memory_usage_error: 0.99 diff --git a/system/system_monitor/config/hdd_monitor.param.yaml 
b/system/system_monitor/config/hdd_monitor.param.yaml new file mode 100644 index 0000000000000..bf687ae848059 --- /dev/null +++ b/system/system_monitor/config/hdd_monitor.param.yaml @@ -0,0 +1,11 @@ +/**: + ros__parameters: + hdd_reader_port: 7635 + num_disks: 1 + disks: # Until multi type lists are allowed, name N the disks as disk0...disk{N-1} + disk0: + name: /dev/sda + temp_warn: 55.0 + temp_error: 70.0 + usage_warn: 0.95 + usage_error: 0.99 diff --git a/system/system_monitor/config/mem_monitor.param.yaml b/system/system_monitor/config/mem_monitor.param.yaml new file mode 100644 index 0000000000000..93688d608a9ef --- /dev/null +++ b/system/system_monitor/config/mem_monitor.param.yaml @@ -0,0 +1,4 @@ +/**: + ros__parameters: + usage_warn: 0.95 + usage_error: 0.99 diff --git a/system/system_monitor/config/net_monitor.param.yaml b/system/system_monitor/config/net_monitor.param.yaml new file mode 100644 index 0000000000000..d0707ddba399f --- /dev/null +++ b/system/system_monitor/config/net_monitor.param.yaml @@ -0,0 +1,4 @@ +/**: + ros__parameters: + devices: ["*"] + usage_warn: 0.95 diff --git a/system/system_monitor/config/ntp_monitor.param.yaml b/system/system_monitor/config/ntp_monitor.param.yaml new file mode 100644 index 0000000000000..db54f70d1ce59 --- /dev/null +++ b/system/system_monitor/config/ntp_monitor.param.yaml @@ -0,0 +1,5 @@ +/**: + ros__parameters: + server: ntp.nict.jp + offset_warn: 0.1 + offset_error: 5.0 diff --git a/system/system_monitor/config/process_monitor.param.yaml b/system/system_monitor/config/process_monitor.param.yaml new file mode 100644 index 0000000000000..3d6d82fae5ce2 --- /dev/null +++ b/system/system_monitor/config/process_monitor.param.yaml @@ -0,0 +1,3 @@ +/**: + ros__parameters: + num_of_procs: 5 diff --git a/system/system_monitor/docs/class_diagrams.md b/system/system_monitor/docs/class_diagrams.md new file mode 100644 index 0000000000000..662e6a26195a7 --- /dev/null +++ b/system/system_monitor/docs/class_diagrams.md @@ 
-0,0 +1,29 @@ +# Class diagrams + +## CPU Monitor + +![CPU Monitor](images/class_cpu_monitor.png) + +## HDD Monitor + +![HDD Monitor](images/class_hdd_monitor.png) + +## Memory Monitor + +![Memory Monitor](images/class_mem_monitor.png) + +## Net Monitor + +![Net Monitor](images/class_net_monitor.png) + +## NTP Monitor + +![NTP Monitor](images/class_ntp_monitor.png) + +## Process Monitor + +![Process Monitor](images/class_process_monitor.png) + +## GPU Monitor + +![GPU Monitor](images/class_gpu_monitor.png) diff --git a/system/system_monitor/docs/hdd_reader.md b/system/system_monitor/docs/hdd_reader.md new file mode 100644 index 0000000000000..f1092286df0e2 --- /dev/null +++ b/system/system_monitor/docs/hdd_reader.md @@ -0,0 +1,58 @@ +# hdd_reader + +## Name + +hdd_reader - Read S.M.A.R.T. information for monitoring HDD temperature + +## Synopsis + +hdd_reader [OPTION] + +## Description + +Read S.M.A.R.T. information for monitoring HDD temperature.
+This runs as a daemon process and listens to a TCP/IP port (7635 by default). + +**Options:**
+_-h, --help_
+    Display help
+_-p, --port #_
+    Port number to listen to + +**Exit status:**
+Returns 0 if OK; non-zero otherwise. + +## Notes + +The 'hdd_reader' accesses minimal data enough to get Model number, Serial number, and HDD temperature.
+This is an approach to limit its functionality, however, the functionality can be expanded for further improvements and considerations in the future.

+ +### [ATA] + +| Purpose | Name | Length | +| --------------------------- | -------------------- | -------------------- | +| Model number, Serial number | IDENTIFY DEVICE data | 256 words(512 bytes) | +| HDD temperature | SMART READ DATA | 256 words(512 bytes) | + +For details please see the documents below.
+ +- [ATA Command Set - 4 (ACS-4)](https://www.t13.org/system/files/Project%20Drafts/2017/di529r20-ATA/ATAPI%20Command%20Set%20-%204_2.pdf) +- [ATA/ATAPI Command Set - 3 (ACS-3)](https://www.t13.org/system/files/Standards/2013/d2161r5-ATA/ATAPI%20Command%20Set%20-%203.pdf) +- [SMART Attribute Overview](https://www.t13.org/system/files/Documents/2005/e05171r0-SMART%20Attributes%20Overview_1.pdf) +- [SMART Attribute Annex](https://www.t13.org/system/files/Documents/2005/e05148r0-ACS-SMART%20Attributes%20Annex_1.pdf) + +### [NVMe] + +| Purpose | Name | Length | +| --------------------------- | ---------------------------------- | ---------------- | +| Model number, Serial number | Identify Controller data structure | 4096 bytes | +| HDD temperature | SMART / Health Information | 1 Dword(4 bytes) | + +For details please see the documents below.
+ +- [NVM Express 1.2b](https://www.nvmexpress.org/wp-content/uploads/NVM_Express_1_2b_Gold_20160603.pdf) + +## Operation confirmed drives + +- SAMSUNG MZVLB1T0HALR (SSD) +- Western Digital My Passport (Portable HDD) diff --git a/system/system_monitor/docs/images/class_cpu_monitor.png b/system/system_monitor/docs/images/class_cpu_monitor.png new file mode 100644 index 0000000000000..3fb87810b0209 Binary files /dev/null and b/system/system_monitor/docs/images/class_cpu_monitor.png differ diff --git a/system/system_monitor/docs/images/class_gpu_monitor.png b/system/system_monitor/docs/images/class_gpu_monitor.png new file mode 100644 index 0000000000000..857f3bc8b0ff1 Binary files /dev/null and b/system/system_monitor/docs/images/class_gpu_monitor.png differ diff --git a/system/system_monitor/docs/images/class_hdd_monitor.png b/system/system_monitor/docs/images/class_hdd_monitor.png new file mode 100644 index 0000000000000..3ae7f9bf0dccf Binary files /dev/null and b/system/system_monitor/docs/images/class_hdd_monitor.png differ diff --git a/system/system_monitor/docs/images/class_mem_monitor.png b/system/system_monitor/docs/images/class_mem_monitor.png new file mode 100644 index 0000000000000..a0261a4a9b0b2 Binary files /dev/null and b/system/system_monitor/docs/images/class_mem_monitor.png differ diff --git a/system/system_monitor/docs/images/class_net_monitor.png b/system/system_monitor/docs/images/class_net_monitor.png new file mode 100644 index 0000000000000..126547faeef46 Binary files /dev/null and b/system/system_monitor/docs/images/class_net_monitor.png differ diff --git a/system/system_monitor/docs/images/class_ntp_monitor.png b/system/system_monitor/docs/images/class_ntp_monitor.png new file mode 100644 index 0000000000000..71660ea938af4 Binary files /dev/null and b/system/system_monitor/docs/images/class_ntp_monitor.png differ diff --git a/system/system_monitor/docs/images/class_process_monitor.png 
b/system/system_monitor/docs/images/class_process_monitor.png new file mode 100644 index 0000000000000..8cecef0c3e161 Binary files /dev/null and b/system/system_monitor/docs/images/class_process_monitor.png differ diff --git a/system/system_monitor/docs/images/seq_cpu_monitor.png b/system/system_monitor/docs/images/seq_cpu_monitor.png new file mode 100644 index 0000000000000..23499d053bc36 Binary files /dev/null and b/system/system_monitor/docs/images/seq_cpu_monitor.png differ diff --git a/system/system_monitor/docs/images/seq_gpu_monitor.png b/system/system_monitor/docs/images/seq_gpu_monitor.png new file mode 100644 index 0000000000000..f0b145a40cb99 Binary files /dev/null and b/system/system_monitor/docs/images/seq_gpu_monitor.png differ diff --git a/system/system_monitor/docs/images/seq_hdd_monitor.png b/system/system_monitor/docs/images/seq_hdd_monitor.png new file mode 100644 index 0000000000000..73538f8cf2370 Binary files /dev/null and b/system/system_monitor/docs/images/seq_hdd_monitor.png differ diff --git a/system/system_monitor/docs/images/seq_mem_monitor.png b/system/system_monitor/docs/images/seq_mem_monitor.png new file mode 100644 index 0000000000000..969ff8cc9285b Binary files /dev/null and b/system/system_monitor/docs/images/seq_mem_monitor.png differ diff --git a/system/system_monitor/docs/images/seq_net_monitor.png b/system/system_monitor/docs/images/seq_net_monitor.png new file mode 100644 index 0000000000000..2d2b26fe87499 Binary files /dev/null and b/system/system_monitor/docs/images/seq_net_monitor.png differ diff --git a/system/system_monitor/docs/images/seq_ntp_monitor.png b/system/system_monitor/docs/images/seq_ntp_monitor.png new file mode 100644 index 0000000000000..53e97d2ced661 Binary files /dev/null and b/system/system_monitor/docs/images/seq_ntp_monitor.png differ diff --git a/system/system_monitor/docs/images/seq_process_monitor.png b/system/system_monitor/docs/images/seq_process_monitor.png new file mode 100644 index 
0000000000000..b759fcc5cca4b Binary files /dev/null and b/system/system_monitor/docs/images/seq_process_monitor.png differ diff --git a/system/system_monitor/docs/msr_reader.md b/system/system_monitor/docs/msr_reader.md new file mode 100644 index 0000000000000..c8594be011625 --- /dev/null +++ b/system/system_monitor/docs/msr_reader.md @@ -0,0 +1,40 @@ +# msr_reader + +## Name + +msr_reader - Read MSR register for monitoring thermal throttling event + +## Synopsis + +msr_reader [OPTION] + +## Description + +Read MSR register for monitoring thermal throttling event.
+This runs as a daemon process and listens to a TCP/IP port (7634 by default). + +**Options:**
+_-h, --help_
+    Display help
+_-p, --port #_
+    Port number to listen to + +**Exit status:**
+Returns 0 if OK; non-zero otherwise. + +## Notes + +The 'msr_reader' accesses minimal data enough to get thermal throttling event.
+This is an approach to limit its functionality, however, the functionality can be expanded for further improvements and considerations in the future. + +| Register Address | Name | Length | +| ---------------- | ------------------------- | ------ | +| 1B1H | IA32_PACKAGE_THERM_STATUS | 64bit | + +For details please see the documents below.
+ +- [Intel® 64 and IA-32 Architectures Software Developer’s Manual](https://software.intel.com/sites/default/files/managed/39/c5/325462-sdm-vol-1-2abcd-3abcd.pdf) + +## Operation confirmed platform + +- PC system intel core i7 diff --git a/system/system_monitor/docs/ros_parameters.md b/system/system_monitor/docs/ros_parameters.md new file mode 100644 index 0000000000000..e42f843c78992 --- /dev/null +++ b/system/system_monitor/docs/ros_parameters.md @@ -0,0 +1,84 @@ +# ROS parameters + +## CPU Monitor + +cpu_monitor: + +| Name | Type | Unit | Default | Notes | +| :-------------- | :---: | :-----: | :-----: | :---------------------------------------------------------------------------- | +| temp_warn | float | DegC | 90.0 | Generates warning when CPU temperature reaches a specified value or higher. | +| temp_error | float | DegC | 95.0 | Generates error when CPU temperature reaches a specified value or higher. | +| usage_warn | float | %(1e-2) | 0.90 | Generates warning when CPU usage reaches a specified value or higher. | +| usage_error | float | %(1e-2) | 1.00 | Generates error when CPU usage reaches a specified value or higher. | +| load1_warn | float | %(1e-2) | 0.90 | Generates warning when load average 1min reaches a specified value or higher. | +| load5_warn | float | %(1e-2) | 0.80 | Generates warning when load average 5min reaches a specified value or higher. | +| msr_reader_port | int | n/a | 7634 | Port number to connect to msr_reader. | + +## HDD Monitor + +hdd_monitor: + +  disks: + +| Name | Type | Unit | Default | Notes | +| :--------- | :----: | :--: | :-----: | :-------------------------------------------------------------------------- | +| name | string | n/a | none | The disk name to monitor temperature. (e.g. /dev/sda) | +| temp_warn | float | DegC | 55.0 | Generates warning when HDD temperature reaches a specified value or higher. | +| temp_error | float | DegC | 70.0 | Generates error when HDD temperature reaches a specified value or higher. 
| + +hdd_monitor: + +| Name | Type | Unit | Default | Notes | +| :-------------- | :---: | :-----: | :-----: | :--------------------------------------------------------------------- | +| hdd_reader_port | int | n/a | 7635 | Port number to connect to hdd_reader. | +| usage_warn | float | %(1e-2) | 0.95 | Generates warning when disk usage reaches a specified value or higher. | +| usage_error | float | %(1e-2) | 0.99 | Generates error when disk usage reaches a specified value or higher. | + +## Memory Monitor + +mem_monitor: + +| Name | Type | Unit | Default | Notes | +| :---------- | :---: | :-----: | :-----: | :-------------------------------------------------------------------------------- | +| usage_warn | float | %(1e-2) | 0.95 | Generates warning when physical memory usage reaches a specified value or higher. | +| usage_error | float | %(1e-2) | 0.99 | Generates error when physical memory usage reaches a specified value or higher. | + +## Net Monitor + +net_monitor: + +| Name | Type | Unit | Default | Notes | +| :--------- | :----------: | :-----: | :-----: | :----------------------------------------------------------------------------------- | +| devices | list[string] | n/a | none | The name of network interface to monitor. (e.g. eth0, \* for all network interfaces) | +| usage_warn | float | %(1e-2) | 0.95 | Generates warning when network usage reaches a specified value or higher. | + +## NTP Monitor + +ntp_monitor: + +| Name | Type | Unit | Default | Notes | +| :----------- | :----: | :--: | :------------: | :---------------------------------------------------------------------------------------- | +| server | string | n/a | ntp.ubuntu.com | The name of NTP server to synchronize date and time. (e.g. ntp.nict.jp for Japan) | +| offset_warn | float | sec | 0.1 | Generates warning when NTP offset reaches a specified value or higher. (default is 100ms) | +| offset_error | float | sec | 5.0 | Generates error when NTP offset reaches a specified value or higher. 
(default is 5sec) | + +## Process Monitor + +process_monitor: + +| Name | Type | Unit | Default | Notes | +| :----------- | :--: | :--: | :-----: | :------------------------------------------------------------------------------ | +| num_of_procs | int | n/a | 5 | The number of processes to generate High-load Proc[0-9] and High-mem Proc[0-9]. | + +## GPU Monitor + +gpu_monitor: + +| Name | Type | Unit | Default | Notes | +| :----------------- | :---: | :-----: | :-----: | :--------------------------------------------------------------------------- | +| temp_warn | float | DegC | 90.0 | Generates warning when GPU temperature reaches a specified value or higher. | +| temp_error | float | DegC | 95.0 | Generates error when GPU temperature reaches a specified value or higher. | +| gpu_usage_warn | float | %(1e-2) | 0.90 | Generates warning when GPU usage reaches a specified value or higher. | +| gpu_usage_error | float | %(1e-2) | 1.00 | Generates error when GPU usage reaches a specified value or higher. | +| memory_usage_warn | float | %(1e-2) | 0.90 | Generates warning when GPU memory usage reaches a specified value or higher. | +| memory_usage_error | float | %(1e-2) | 1.00 | Generates error when GPU memory usage reaches a specified value or higher. 
| diff --git a/system/system_monitor/docs/seq_diagrams.md b/system/system_monitor/docs/seq_diagrams.md new file mode 100644 index 0000000000000..8fadedb6416a1 --- /dev/null +++ b/system/system_monitor/docs/seq_diagrams.md @@ -0,0 +1,29 @@ +# Sequence diagrams + +## CPU Monitor + +![CPU Monitor](images/seq_cpu_monitor.png) + +## HDD Monitor + +![HDD Monitor](images/seq_hdd_monitor.png) + +## Memory Monitor + +![Memory Monitor](images/seq_mem_monitor.png) + +## Net Monitor + +![Net Monitor](images/seq_net_monitor.png) + +## NTP Monitor + +![NTP Monitor](images/seq_ntp_monitor.png) + +## Process Monitor + +![Process Monitor](images/seq_process_monitor.png) + +## GPU Monitor + +![GPU Monitor](images/seq_gpu_monitor.png) diff --git a/system/system_monitor/docs/topics_cpu_monitor.md b/system/system_monitor/docs/topics_cpu_monitor.md new file mode 100644 index 0000000000000..6243ea0cdda56 --- /dev/null +++ b/system/system_monitor/docs/topics_cpu_monitor.md @@ -0,0 +1,104 @@ +# ROS topics: CPU Monitor + +## CPU Temperature + +/diagnostics/cpu_monitor: CPU Temperature + +[summary] + +| level | message | +| ----- | ------- | +| OK | OK | +| WARN | warm | +| ERROR | hot | + +[values] + +| key (example) | value (example) | +| ------------------------------------------- | --------------- | +| Package id 0, Core [0-9], thermal_zone[0-9] | 50.0 DegC | + +\*key: thermal_zone[0-9] for ARM architecture. 
+ +## CPU Usage + +/diagnostics/cpu_monitor: CPU Usage + +[summary] + +| level | message | +| ----- | --------------- | +| OK | OK | +| WARN | high load | +| ERROR | very high load | + +[values] + +| key | value (example) | +| --------------------- | ------------------------------- | +| CPU [all,0-9]: status | OK / high load / very high load | +| CPU [all,0-9]: usr | 2.00% | +| CPU [all,0-9]: nice | 0.00% | +| CPU [all,0-9]: sys | 1.00% | +| CPU [all,0-9]: idle | 97.00% | + +## CPU Load Average + +/diagnostics/cpu_monitor: CPU Load Average + +[summary] + +| level | message | +| ----- | --------- | +| OK | OK | +| WARN | high load | + +[values] + +| key | value (example) | +| ----- | --------------- | +| 1min | 14.50% | +| 5min | 14.55% | +| 15min | 9.67% | + +## CPU Thermal Throttling + +> Intel and raspi platform only.
+> Tegra platform not supported. + +/diagnostics/cpu_monitor: CPU Thermal Throttling + +[summary] + +| level | message | +| ----- | ---------- | +| OK | OK | +| ERROR | throttling | + +[values for intel platform] + +| key | value (example) | +| ----------------------------- | --------------- | +| CPU [0-9]: Pkg Thermal Status | OK / throttling | + +[values for raspi platform] + +| key | value (example) | +| ------ | --------------------------------------------------------------- | +| status | All clear / Currently throttled / Soft temperature limit active | + +## CPU Frequency + +/diagnostics/cpu_monitor: CPU Frequency + +[summary] + +| level | message | +| ----- | ------- | +| OK | OK | + +[values] + +| key | value (example) | +| ---------------- | --------------- | +| CPU [0-9]: clock | 2879MHz | diff --git a/system/system_monitor/docs/topics_gpu_monitor.md b/system/system_monitor/docs/topics_gpu_monitor.md new file mode 100644 index 0000000000000..42ce63fab2e64 --- /dev/null +++ b/system/system_monitor/docs/topics_gpu_monitor.md @@ -0,0 +1,113 @@ +# ROS topics: GPU Monitor + +> Intel and tegra platform only.
+> Raspi platform not supported. + +## GPU Temperature + +/diagnostics/gpu_monitor: GPU Temperature + +[summary] + +| level | message | +| ----- | ------- | +| OK | OK | +| WARN | warm | +| ERROR | hot | + +[values] + +| key (example) | value (example) | +| ----------------------------------- | --------------- | +| GeForce GTX 1650, thermal_zone[0-9] | 46.0 DegC | + +\*key: thermal_zone[0-9] for ARM architecture. + +## GPU Usage + +/diagnostics/gpu_monitor: GPU Usage + +[summary] + +| level | message | +| ----- | -------------- | +| OK | OK | +| WARN | high load | +| ERROR | very high load | + +[values] + +| key | value (example) | +| ----------------- | ------------------------------- | +| GPU [0-9]: status | OK / high load / very high load | +| GPU [0-9]: name | GeForce GTX 1650, gpu.[0-9] | +| GPU [0-9]: usage | 19.0% | + +\*key: gpu.[0-9] for ARM architecture. + +## GPU Memory Usage + +> Intel platform only.
+> There is no separate gpu memory in tegra. Both cpu and gpu use cpu memory. + +/diagnostics/gpu_monitor: GPU Memory Usage + +[summary] + +| level | message | +| ----- | -------------- | +| OK | OK | +| WARN | high load | +| ERROR | very high load | + +[values] + +| key | value (example) | +| ----------------- | ------------------------------- | +| GPU [0-9]: status | OK / high load / very high load | +| GPU [0-9]: name | GeForce GTX 1650 | +| GPU [0-9]: usage | 13.0% | +| GPU [0-9]: total | 3G | +| GPU [0-9]: used | 1G | +| GPU [0-9]: free | 2G | + +## GPU Thermal Throttling + +> Intel platform only.
+> Tegra platform not supported. + +/diagnostics/gpu_monitor: GPU Thermal Throttling + +[summary] + +| level | message | +| ----- | ---------- | +| OK | OK | +| ERROR | throttling | + +[values] + +| key | value (example) | +| ------------------------- | -------------------------------- | +| GPU [0-9]: status | OK / throttling | +| GPU [0-9]: name | GeForce GTX 1650 | +| GPU [0-9]: graphics clock | 1020 MHz | +| GPU [0-9]: reasons | GpuIdle / SwThermalSlowdown etc. | + +## GPU Frequency + +> Tegra platform only. + +/diagnostics/gpu_monitor: GPU Frequency + +[summary] + +| level | message | +| ----- | ------- | +| OK | OK | + +[values] + +| key (example) | value (example) | +| ------------------------- | --------------- | +| GPU 17000000.gv11b: clock | 318 MHz | diff --git a/system/system_monitor/docs/topics_hdd_monitor.md b/system/system_monitor/docs/topics_hdd_monitor.md new file mode 100644 index 0000000000000..dbc9820edd24c --- /dev/null +++ b/system/system_monitor/docs/topics_hdd_monitor.md @@ -0,0 +1,47 @@ +# ROS topics: HDD Monitor + +## HDD Temperature + +/diagnostics/hdd_monitor: HDD Temperature + +[summary] + +| level | message | +| ----- | ------------ | +| OK | OK | +| WARN | hot | +| ERROR | critical hot | + +[values] + +| key | value (example) | +| ---------------------- | -------------------------- | +| HDD [0-9]: status | OK / hot / critical hot | +| HDD [0-9]: name | /dev/nvme0 | +| HDD [0-9]: model | SAMSUNG MZVLB1T0HBLR-000L7 | +| HDD [0-9]: serial | S4EMNF0M820682 | +| HDD [0-9]: temperature | 37.0 DegC | + +## HDD Usage + +/diagnostics/hdd_monitor: HDD Usage + +[summary] + +| level | message | +| ----- | ------------------- | +| OK | OK | +| WARN | low disk space | +| ERROR | very low disk space | + +[values] + +| key | value (example) | +| --------------------- | ----------------------------------------- | +| HDD [0-9]: status | OK / low disk space / very low disk space | +| HDD [0-9]: filesystem | /dev/nvme0n1p4 | +| HDD [0-9]: size | 264G | +| 
HDD [0-9]: used | 172G | +| HDD [0-9]: avail | 749G | +| HDD [0-9]: use | 69% | +| HDD [0-9]: mounted on | / | diff --git a/system/system_monitor/docs/topics_mem_monitor.md b/system/system_monitor/docs/topics_mem_monitor.md new file mode 100644 index 0000000000000..ae55c2aff0246 --- /dev/null +++ b/system/system_monitor/docs/topics_mem_monitor.md @@ -0,0 +1,28 @@ +# ROS topics: Memory Monitor + +## Memory Usage + +/diagnostics/mem_monitor: Memory Usage + +[summary] + +| level | message | +| ----- | -------------- | +| OK | OK | +| WARN | high load | +| ERROR | very high load | + +[values] + +| key | value (example) | +| ------------ | --------------- | +| Mem: usage | 18.99% | +| Mem: total | 31G | +| Mem: used | 5.9G | +| Mem: free | 15G | +| Swap: total | 2.0G | +| Swap: used | 0B | +| Swap: free | 2.0G | +| Total: total | 33G | +| Total: used | 5.9G | +| Total: free | 17G | diff --git a/system/system_monitor/docs/topics_net_monitor.md b/system/system_monitor/docs/topics_net_monitor.md new file mode 100644 index 0000000000000..1e0eefe12063c --- /dev/null +++ b/system/system_monitor/docs/topics_net_monitor.md @@ -0,0 +1,31 @@ +# ROS topics: Net Monitor + +## Network Usage + +/diagnostics/net_monitor: Network Usage + +[summary] + +| level | message | +| ----- | --------- | +| OK | OK | +| WARN | high load | +| ERROR | down | + +[values] + +| key | value (example) | +| ----------------------------- | --------------------- | +| Network [0-9]: status | OK / high load / down | +| Network [0-9]: interface name | wlp82s0 | +| Network [0-9]: rx_usage | 0.00% | +| Network [0-9]: tx_usage | 0.00% | +| Network [0-9]: rx_traffic | 0.00 MB/s | +| Network [0-9]: tx_traffic | 0.00 MB/s | +| Network [0-9]: capacity | 400.0 MB/s | +| Network [0-9]: mtu | 1500 | +| Network [0-9]: rx_bytes | 58455228 | +| Network [0-9]: rx_errors | 0 | +| Network [0-9]: tx_bytes | 11069136 | +| Network [0-9]: tx_errors | 0 | +| Network [0-9]: collisions | 0 | diff --git 
a/system/system_monitor/docs/topics_ntp_monitor.md b/system/system_monitor/docs/topics_ntp_monitor.md new file mode 100644 index 0000000000000..398c40026707a --- /dev/null +++ b/system/system_monitor/docs/topics_ntp_monitor.md @@ -0,0 +1,20 @@ +# ROS topics: NTP Monitor + +## NTP Offset + +/diagnostics/ntp_monitor: NTP Offset + +[summary] + +| level | message | +| ----- | -------- | +| OK | OK | +| WARN | high | +| ERROR | too high | + +[values] + +| key | value (example) | +| ---------- | --------------- | +| NTP Offset | -0.013181 sec | +| NTP Delay | 0.053880 sec | diff --git a/system/system_monitor/docs/topics_process_monitor.md b/system/system_monitor/docs/topics_process_monitor.md new file mode 100644 index 0000000000000..97899172881fe --- /dev/null +++ b/system/system_monitor/docs/topics_process_monitor.md @@ -0,0 +1,75 @@ +# ROS topics: Process Monitor + +## Tasks Summary + +/diagnostics/process_monitor: Tasks Summary + +[summary] + +| level | message | +| ----- | ------- | +| OK | OK | + +[values] + +| key | value (example) | +| -------- | --------------- | +| total | 409 | +| running | 2 | +| sleeping | 321 | +| stopped | 0 | +| zombie | 0 | + +## High-load Proc[0-9] + +/diagnostics/process_monitor: High-load Proc[0-9] + +[summary] + +| level | message | +| ----- | ------- | +| OK | OK | + +[values] + +| key | value (example) | +| ------- | ------------------------ | +| COMMAND | /usr/lib/firefox/firefox | +| %CPU | 37.5 | +| %MEM | 2.1 | +| PID | 14062 | +| USER | autoware | +| PR | 20 | +| NI | 0 | +| VIRT | 3461152 | +| RES | 669052 | +| SHR | 481208 | +| S | S | +| TIME+ | 23:57.49 | + +## High-mem Proc[0-9] + +/diagnostics/process_monitor: High-mem Proc[0-9] + +[summary] + +| level | message | +| ----- | ------- | +| OK | OK | + +[values] + +| key | value (example) | +| ------- | ----------------------------------------------- | +| COMMAND | /snap/multipass/1784/usr/bin/qemu-system-x86_64 | +| %CPU | 0 | +| %MEM | 2.5 | +| PID | 1565 | +| USER | root 
| +| PR | 20 | +| NI | 0 | +| VIRT | 3722320 | +| RES | 812432 | +| SHR | 20340 | +| S | S | +| TIME+ | 0:22.84 | diff --git a/system/system_monitor/include/hdd_reader/hdd_reader.hpp b/system/system_monitor/include/hdd_reader/hdd_reader.hpp new file mode 100644 index 0000000000000..e06bd6766dccc --- /dev/null +++ b/system/system_monitor/include/hdd_reader/hdd_reader.hpp @@ -0,0 +1,64 @@ +// Copyright 2020 Tier IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** + * @file hdd_reader.h + * @brief HDD reader definitions + */ + +#ifndef HDD_READER__HDD_READER_HPP_ +#define HDD_READER__HDD_READER_HPP_ + +#include +#include +#include + +#include +#include + +/** + * @brief HDD information + */ +struct HDDInfo +{ + int error_code_; //!< @brief error code, 0 on success, otherwise error + std::string model_; //!< @brief Model number + std::string serial_; //!< @brief Serial number + uint8_t temp_; //!< @brief temperature(DegC) + // Lowest byte of the raw value contains the exact temperature value (Celsius degrees) + // in S.M.A.R.T. information. + + /** + * @brief Load or save data members. + * @param [inout] ar archive reference to load or save the serialized data members + * @param [in] version version for the archive + * @note NOLINT syntax is needed since this is an interface to serialization and + * used inside boost serialization. 
+ */ + template + void serialize(archive & ar, const unsigned /*version*/) // NOLINT(runtime/references) + { + ar & error_code_; + ar & model_; + ar & serial_; + ar & temp_; + } +}; + +/** + * @brief HDD information list + */ +typedef std::map HDDInfoList; + +#endif // HDD_READER__HDD_READER_HPP_ diff --git a/system/system_monitor/include/msr_reader/msr_reader.hpp b/system/system_monitor/include/msr_reader/msr_reader.hpp new file mode 100644 index 0000000000000..650932891cc2f --- /dev/null +++ b/system/system_monitor/include/msr_reader/msr_reader.hpp @@ -0,0 +1,51 @@ +// Copyright 2020 Tier IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** + * @file msr_reader.h + * @brief MSR reader definitions + */ + +#ifndef MSR_READER__MSR_READER_HPP_ +#define MSR_READER__MSR_READER_HPP_ + +#include +#include + +#include + +/** + * @brief MSR information + */ +struct MSRInfo +{ + int error_code_; //!< @brief error code, 0 on success, otherwise error + std::vector pkg_thermal_status_; //!< @brief Pkg Thermal Status + + /** + * @brief Load or save data members. + * @param [inout] ar archive reference to load or save the serialized data members + * @param [in] version version for the archive + * @note NOLINT syntax is needed since this is an interface to serialization and + * used inside boost serialization. 
+ */ + template + void serialize(archive & ar, const unsigned /*version*/) // NOLINT(runtime/references) + { + ar & error_code_; + ar & pkg_thermal_status_; + } +}; + +#endif // MSR_READER__MSR_READER_HPP_ diff --git a/system/system_monitor/include/system_monitor/cpu_monitor/arm_cpu_monitor.hpp b/system/system_monitor/include/system_monitor/cpu_monitor/arm_cpu_monitor.hpp new file mode 100644 index 0000000000000..44b828d0bf38c --- /dev/null +++ b/system/system_monitor/include/system_monitor/cpu_monitor/arm_cpu_monitor.hpp @@ -0,0 +1,53 @@ +// Copyright 2020 Tier IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** + * @file arm_cpu_monitor.hpp + * @brief ARM CPU monitor class + */ + +#ifndef SYSTEM_MONITOR__CPU_MONITOR__ARM_CPU_MONITOR_HPP_ +#define SYSTEM_MONITOR__CPU_MONITOR__ARM_CPU_MONITOR_HPP_ + +#include "system_monitor/cpu_monitor/cpu_monitor_base.hpp" + +#include + +class CPUMonitor : public CPUMonitorBase +{ +public: + /** + * @brief constructor + * @param [in] options Options associated with this node.
+ */ + explicit CPUMonitor(const rclcpp::NodeOptions & options); + + /** + * @brief get names for core temperature files + */ + void getTempNames() override; + +protected: + /** + * @brief check CPU thermal throttling + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference + * to pass diagnostic message updated in this function to diagnostic publish calls. + */ + void checkThrottling( + diagnostic_updater::DiagnosticStatusWrapper & stat) override; // NOLINT(runtime/references) +}; + +#endif // SYSTEM_MONITOR__CPU_MONITOR__ARM_CPU_MONITOR_HPP_ diff --git a/system/system_monitor/include/system_monitor/cpu_monitor/cpu_monitor_base.hpp b/system/system_monitor/include/system_monitor/cpu_monitor/cpu_monitor_base.hpp new file mode 100644 index 0000000000000..5f5d8d2fb6fd9 --- /dev/null +++ b/system/system_monitor/include/system_monitor/cpu_monitor/cpu_monitor_base.hpp @@ -0,0 +1,158 @@ +// Copyright 2020 Tier IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +/** + * @file cpu_monitor_base.h + * @brief CPU monitor base class + */ + +#ifndef SYSTEM_MONITOR__CPU_MONITOR__CPU_MONITOR_BASE_HPP_ +#define SYSTEM_MONITOR__CPU_MONITOR__CPU_MONITOR_BASE_HPP_ + +#include + +#include +#include +#include +#include + +/** + * @brief CPU temperature information + */ +typedef struct cpu_temp_info +{ + std::string label_; //!< @brief cpu label + std::string path_; //!< @brief sysfs path to cpu temperature + + cpu_temp_info() : label_(), path_() {} + cpu_temp_info(const std::string & label, const std::string & path) : label_(label), path_(path) {} +} cpu_temp_info; + +/** + * @brief CPU frequency information + */ +typedef struct cpu_freq_info +{ + int index_; //!< @brief cpu index + std::string path_; //!< @brief sysfs path to cpu frequency + + cpu_freq_info() : index_(0), path_() {} + cpu_freq_info(int index, const std::string & path) : index_(index), path_(path) {} +} cpu_freq_info; + +class CPUMonitorBase : public rclcpp::Node +{ +public: + /** + * @brief Update the diagnostic state. + */ + void update(); + + /** + * @brief get names for core temperature files + */ + virtual void getTempNames(); + + /** + * @brief get names for cpu frequency files + */ + virtual void getFreqNames(); + +protected: + using DiagStatus = diagnostic_msgs::msg::DiagnosticStatus; + + /** + * @brief constructor + * @param [in] node_name Name of the node. + * @param [in] options Options associated with this node. + */ + CPUMonitorBase(const std::string & node_name, const rclcpp::NodeOptions & options); + + /** + * @brief check CPU temperature + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference + * to pass diagnostic message updated in this function to diagnostic publish calls. 
+ */ + virtual void checkTemp( + diagnostic_updater::DiagnosticStatusWrapper & stat); // NOLINT(runtime/references) + + /** + * @brief check CPU usage + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference + * to pass diagnostic message updated in this function to diagnostic publish calls. + */ + virtual void checkUsage( + diagnostic_updater::DiagnosticStatusWrapper & stat); // NOLINT(runtime/references) + + /** + * @brief check CPU load average + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference + * to pass diagnostic message updated in this function to diagnostic publish calls. + */ + virtual void checkLoad( + diagnostic_updater::DiagnosticStatusWrapper & stat); // NOLINT(runtime/references) + + /** + * @brief check CPU thermal throttling + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + */ + virtual void checkThrottling( + diagnostic_updater::DiagnosticStatusWrapper & stat); // NOLINT(runtime/references) + + /** + * @brief check CPU frequency + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference + * to pass diagnostic message updated in this function to diagnostic publish calls. 
+ */ + virtual void checkFrequency( + diagnostic_updater::DiagnosticStatusWrapper & stat); // NOLINT(runtime/references) + + diagnostic_updater::Updater updater_; //!< @brief Updater class which advertises to /diagnostics + + char hostname_[HOST_NAME_MAX + 1]; //!< @brief host name + int num_cores_; //!< @brief number of cores + std::vector temps_; //!< @brief CPU list for temperature + std::vector freqs_; //!< @brief CPU list for frequency + bool mpstat_exists_; //!< @brief flag if mpstat exists + + float temp_warn_; //!< @brief CPU temperature(DegC) to generate warning + float temp_error_; //!< @brief CPU temperature(DegC) to generate error + float usage_warn_; //!< @brief CPU usage(%) to generate warning + float usage_error_; //!< @brief CPU usage(%) to generate error + bool usage_avg_; //!< @brief Check CPU usage calculated as averages among all processors + + /** + * @brief CPU temperature status messages + */ + const std::map temp_dict_ = { + {DiagStatus::OK, "OK"}, {DiagStatus::WARN, "warm"}, {DiagStatus::ERROR, "hot"}}; + + /** + * @brief CPU usage status messages + */ + const std::map load_dict_ = { + {DiagStatus::OK, "OK"}, {DiagStatus::WARN, "high load"}, {DiagStatus::ERROR, "very high load"}}; + + /** + * @brief CPU thermal throttling status messages + */ + const std::map thermal_dict_ = { + {DiagStatus::OK, "OK"}, {DiagStatus::WARN, "unused"}, {DiagStatus::ERROR, "throttling"}}; +}; + +#endif // SYSTEM_MONITOR__CPU_MONITOR__CPU_MONITOR_BASE_HPP_ diff --git a/system/system_monitor/include/system_monitor/cpu_monitor/intel_cpu_monitor.hpp b/system/system_monitor/include/system_monitor/cpu_monitor/intel_cpu_monitor.hpp new file mode 100644 index 0000000000000..a4891656b7659 --- /dev/null +++ b/system/system_monitor/include/system_monitor/cpu_monitor/intel_cpu_monitor.hpp @@ -0,0 +1,59 @@ +// Copyright 2020 Tier IV, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** + * @file intel_cpu_monitor.h + * @brief CPU monitor class + */ + +#ifndef SYSTEM_MONITOR__CPU_MONITOR__INTEL_CPU_MONITOR_HPP_ +#define SYSTEM_MONITOR__CPU_MONITOR__INTEL_CPU_MONITOR_HPP_ + +#include "system_monitor/cpu_monitor/cpu_monitor_base.hpp" + +#include + +class CPUMonitor : public CPUMonitorBase +{ +public: + /** + * @brief constructor + * @param [in] options Options associated with this node. + */ + explicit CPUMonitor(const rclcpp::NodeOptions & options); + +protected: + /** + * @brief check CPU thermal throttling + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference + * to pass diagnostic message updated in this function to diagnostic publish calls. 
+ */ + void checkThrottling( + diagnostic_updater::DiagnosticStatusWrapper & stat) override; // NOLINT(runtime/references) + + /** + * @brief get names for core temperature files + */ + void getTempNames() override; + + /** + * @brief Add a loadable kernel module msr + */ + void modprobeMSR(); + + int msr_reader_port_; //!< @brief port number to connect to msr_reader +}; + +#endif // SYSTEM_MONITOR__CPU_MONITOR__INTEL_CPU_MONITOR_HPP_ diff --git a/system/system_monitor/include/system_monitor/cpu_monitor/raspi_cpu_monitor.hpp b/system/system_monitor/include/system_monitor/cpu_monitor/raspi_cpu_monitor.hpp new file mode 100644 index 0000000000000..b0456cf34963e --- /dev/null +++ b/system/system_monitor/include/system_monitor/cpu_monitor/raspi_cpu_monitor.hpp @@ -0,0 +1,75 @@ +// Copyright 2020 Tier IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +/** + * @file raspi_cpu_monitor.hpp + * @brief Raspberry Pi CPU monitor class + */ + +#ifndef SYSTEM_MONITOR__CPU_MONITOR__RASPI_CPU_MONITOR_HPP_ +#define SYSTEM_MONITOR__CPU_MONITOR__RASPI_CPU_MONITOR_HPP_ + +#include "system_monitor/cpu_monitor/cpu_monitor_base.hpp" + +#include + +#define raspiUnderVoltageDetected (1 << 0) // 0x00001 +#define raspiArmFrequencyCapped (1 << 1) // 0x00002 +#define raspiCurrentlyThrottled (1 << 2) // 0x00004 +#define raspiSoftTemperatureLimitActive (1 << 3) // 0x00008 +#define raspiUnderVoltageHasOccurred (1 << 16) // 0x10000 +#define raspiArmFrequencyCappedHasOccurred (1 << 17) // 0x20000 +#define raspiThrottlingHasOccurred (1 << 18) // 0x40000 +#define raspiSoftTemperatureLimitHasOccurred (1 << 19) // 0x80000 + +#define raspiThermalThrottlingMask (raspiCurrentlyThrottled | raspiSoftTemperatureLimitActive) + +#define throttledToString(X) \ + (((X)&raspiUnderVoltageDetected) ? "Under-voltage detected" \ + : ((X)&raspiArmFrequencyCapped) ? "Arm frequency capped" \ + : ((X)&raspiCurrentlyThrottled) ? "Currently throttled" \ + : ((X)&raspiSoftTemperatureLimitActive) ? "Soft temperature limit active" \ + : ((X)&raspiUnderVoltageHasOccurred) ? "Under-voltage has occurred" \ + : ((X)&raspiArmFrequencyCappedHasOccurred) ? "Arm frequency capped has occurred" \ + : ((X)&raspiThrottlingHasOccurred) ? "Throttling has occurred" \ + : ((X)&raspiSoftTemperatureLimitHasOccurred) ? "Soft temperature limit has occurred" \ + : "UNKNOWN") + +class CPUMonitor : public CPUMonitorBase +{ +public: + /** + * @brief constructor + * @param [in] options Options associated with this node.
+ */ + explicit CPUMonitor(const rclcpp::NodeOptions & options); + + /** + * @brief get names for core temperature files + */ + void getTempNames() override; + +protected: + /** + * @brief check CPU thermal throttling + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference + * to pass diagnostic message updated in this function to diagnostic publish calls. + */ + void checkThrottling( + diagnostic_updater::DiagnosticStatusWrapper & stat) override; // NOLINT(runtime/references) +}; + +#endif // SYSTEM_MONITOR__CPU_MONITOR__RASPI_CPU_MONITOR_HPP_ diff --git a/system/system_monitor/include/system_monitor/cpu_monitor/tegra_cpu_monitor.hpp b/system/system_monitor/include/system_monitor/cpu_monitor/tegra_cpu_monitor.hpp new file mode 100644 index 0000000000000..e86e21470e4dc --- /dev/null +++ b/system/system_monitor/include/system_monitor/cpu_monitor/tegra_cpu_monitor.hpp @@ -0,0 +1,53 @@ +// Copyright 2020 Tier IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +/** + * @file tegra_cpu_monitor.hpp + * @brief TEGRA CPU monitor class + */ + +#ifndef SYSTEM_MONITOR__CPU_MONITOR__TEGRA_CPU_MONITOR_HPP_ +#define SYSTEM_MONITOR__CPU_MONITOR__TEGRA_CPU_MONITOR_HPP_ + +#include "system_monitor/cpu_monitor/cpu_monitor_base.hpp" + +#include + +class CPUMonitor : public CPUMonitorBase +{ +public: + /** + * @brief constructor + * @param [in] options Options associated with this node. + */ + explicit CPUMonitor(const rclcpp::NodeOptions & options); + + /** + * @brief get names for core temperature files + */ + void getTempNames() override; + +protected: + /** + * @brief check CPU thermal throttling + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference + * to pass diagnostic message updated in this function to diagnostic publish calls. + */ + void checkThrottling( + diagnostic_updater::DiagnosticStatusWrapper & stat) override; // NOLINT(runtime/references) +}; + +#endif // SYSTEM_MONITOR__CPU_MONITOR__TEGRA_CPU_MONITOR_HPP_ diff --git a/system/system_monitor/include/system_monitor/cpu_monitor/unknown_cpu_monitor.hpp b/system/system_monitor/include/system_monitor/cpu_monitor/unknown_cpu_monitor.hpp new file mode 100644 index 0000000000000..926011a3ed285 --- /dev/null +++ b/system/system_monitor/include/system_monitor/cpu_monitor/unknown_cpu_monitor.hpp @@ -0,0 +1,38 @@ +// Copyright 2020 Tier IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. + +/** + * @file unknown_cpu_monitor.hpp + * @brief Unknown CPU monitor class + */ + +#ifndef SYSTEM_MONITOR__CPU_MONITOR__UNKNOWN_CPU_MONITOR_HPP_ +#define SYSTEM_MONITOR__CPU_MONITOR__UNKNOWN_CPU_MONITOR_HPP_ + +#include "system_monitor/cpu_monitor/cpu_monitor_base.hpp" + +#include + +class CPUMonitor : public CPUMonitorBase +{ +public: + /** + * @brief constructor + * @param [in] options Options associated with this node. + */ + explicit CPUMonitor(const rclcpp::NodeOptions & options); +}; + +#endif // SYSTEM_MONITOR__CPU_MONITOR__UNKNOWN_CPU_MONITOR_HPP_ diff --git a/system/system_monitor/include/system_monitor/gpu_monitor/gpu_monitor_base.hpp b/system/system_monitor/include/system_monitor/gpu_monitor/gpu_monitor_base.hpp new file mode 100644 index 0000000000000..9c4f6c7510c94 --- /dev/null +++ b/system/system_monitor/include/system_monitor/gpu_monitor/gpu_monitor_base.hpp @@ -0,0 +1,129 @@ +// Copyright 2020 Tier IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** + * @file gpu_monitor_base.hpp + * @brief GPU monitor base class + */ + +#ifndef SYSTEM_MONITOR__GPU_MONITOR__GPU_MONITOR_BASE_HPP_ +#define SYSTEM_MONITOR__GPU_MONITOR__GPU_MONITOR_BASE_HPP_ + +#include + +#include +#include +#include + +class GPUMonitorBase : public rclcpp::Node +{ +public: + /** + * @brief Update the diagnostic state.
+ */ + virtual void update(); + + /** + * @brief Terminate the node, log final statements. An independent function is preferred to allow + * an explicit way to operate actions that require a valid rclcpp context. By default this method + * does nothing. + */ + virtual void shut_down(); + +protected: + using DiagStatus = diagnostic_msgs::msg::DiagnosticStatus; + + /** + * @brief constructor + * @param [in] node_name Name of the node. + * @param [in] options Options associated with this node. + */ + GPUMonitorBase(const std::string & node_name, const rclcpp::NodeOptions & options); + + /** + * @brief check GPU temperature + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference + * to pass diagnostic message updated in this function to diagnostic publish calls. + */ + virtual void checkTemp( + diagnostic_updater::DiagnosticStatusWrapper & stat); // NOLINT(runtime/references) + + /** + * @brief check GPU usage + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference + * to pass diagnostic message updated in this function to diagnostic publish calls. + */ + virtual void checkUsage( + diagnostic_updater::DiagnosticStatusWrapper & stat); // NOLINT(runtime/references) + + /** + * @brief check GPU memory usage + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference + * to pass diagnostic message updated in this function to diagnostic publish calls. 
+ */ + virtual void checkMemoryUsage( + diagnostic_updater::DiagnosticStatusWrapper & stat); // NOLINT(runtime/references) + + /** + * @brief check GPU throttling + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference + * to pass diagnostic message updated in this function to diagnostic publish calls. + */ + virtual void checkThrottling( + diagnostic_updater::DiagnosticStatusWrapper & stat); // NOLINT(runtime/references) + + /** + * @brief check GPU frequency + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference + * to pass diagnostic message updated in this function to diagnostic publish calls. + */ + virtual void checkFrequency( + diagnostic_updater::DiagnosticStatusWrapper & stat); // NOLINT(runtime/references) + + diagnostic_updater::Updater updater_; //!< @brief Updater class which advertises to /diagnostics + + char hostname_[HOST_NAME_MAX + 1]; //!< @brief host name + + float temp_warn_; //!< @brief GPU temperature(DegC) to generate warning + float temp_error_; //!< @brief GPU temperature(DegC) to generate error + float gpu_usage_warn_; //!< @brief GPU usage(%) to generate warning + float gpu_usage_error_; //!< @brief GPU usage(%) to generate error + float memory_usage_warn_; //!< @brief GPU memory usage(%) to generate warning + float memory_usage_error_; //!< @brief GPU memory usage(%) to generate error + + /** + * @brief GPU temperature status messages + */ + const std::map temp_dict_ = { + {DiagStatus::OK, "OK"}, {DiagStatus::WARN, "warm"}, {DiagStatus::ERROR, "hot"}}; + + /** + * @brief GPU usage status messages + */ + const std::map load_dict_ = { + {DiagStatus::OK, "OK"}, {DiagStatus::WARN, "high load"}, {DiagStatus::ERROR, "very high load"}}; + + /** + * @brief GPU throttling status messages + */ + const std::map 
throttling_dict_ = { + {DiagStatus::OK, "OK"}, {DiagStatus::WARN, "unused"}, {DiagStatus::ERROR, "throttling"}}; +}; + +#endif // SYSTEM_MONITOR__GPU_MONITOR__GPU_MONITOR_BASE_HPP_ diff --git a/system/system_monitor/include/system_monitor/gpu_monitor/nvml_gpu_monitor.hpp b/system/system_monitor/include/system_monitor/gpu_monitor/nvml_gpu_monitor.hpp new file mode 100644 index 0000000000000..c29a026ba914c --- /dev/null +++ b/system/system_monitor/include/system_monitor/gpu_monitor/nvml_gpu_monitor.hpp @@ -0,0 +1,133 @@ +// Copyright 2020 Tier IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** + * @file nvml_gpu_monitor.h + * @brief NVML GPU monitor class + */ + +#ifndef SYSTEM_MONITOR__GPU_MONITOR__NVML_GPU_MONITOR_HPP_ +#define SYSTEM_MONITOR__GPU_MONITOR__NVML_GPU_MONITOR_HPP_ + +#include "system_monitor/gpu_monitor/gpu_monitor_base.hpp" + +#include + +#include +#include +#include + +#define reasonToString(X) \ + (((X)&nvmlClocksThrottleReasonGpuIdle) ? "GpuIdle" \ + : ((X)&nvmlClocksThrottleReasonApplicationsClocksSetting) ? "ApplicationsClocksSetting" \ + : ((X)&nvmlClocksThrottleReasonSwPowerCap) ? "SwPowerCap" \ + : ((X)&nvmlClocksThrottleReasonHwSlowdown) ? "HwSlowdown" \ + : ((X)&nvmlClocksThrottleReasonSyncBoost) ? "SyncBoost" \ + : ((X)&nvmlClocksThrottleReasonSwThermalSlowdown) ? "SwThermalSlowdown" \ + : ((X)&nvmlClocksThrottleReasonHwThermalSlowdown) ? "HwThermalSlowdown" \ + : ((X)&nvmlClocksThrottleReasonHwPowerBrakeSlowdown) ? 
"HwPowerBrakeSlowdown" \ + : ((X)&nvmlClocksThrottleReasonDisplayClockSetting) ? "DisplayClockSetting" \ + : "UNKNOWN") + +/** + * @brief GPU information + */ +struct gpu_info +{ + nvmlDevice_t device; //!< @brief handle for a particular device + char name[NVML_DEVICE_NAME_BUFFER_SIZE]; //!< @brief name of device + nvmlPciInfo_t pci; //!< @brief PCI information about a GPU device + nvmlUtilization_t utilization; //!< @brief Utilization information for a device +}; + +class GPUMonitor : public GPUMonitorBase +{ +public: + /** + * @brief constructor + * @param [in] node_name Name of the node. + * @param [in] options Options associated with this node. + */ + explicit GPUMonitor(const rclcpp::NodeOptions & options); + + /** + * @brief Terminate the node, log final statements. An independent function is preferred to allow + * an explicit way to operate actions that require a valid rclcpp context. By default this method + * does nothing. + */ + void shut_down() override; + +protected: + /** + * @brief check GPU temperature + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference + * to pass diagnostic message updated in this function to diagnostic publish calls. + */ + void checkTemp( + diagnostic_updater::DiagnosticStatusWrapper & stat) override; // NOLINT(runtime/references) + + /** + * @brief check GPU usage + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference + * to pass diagnostic message updated in this function to diagnostic publish calls. 
+ */ + void checkUsage( + diagnostic_updater::DiagnosticStatusWrapper & stat) override; // NOLINT(runtime/references) + + /** + * @brief add stat of GPU usage per process + * @param [in] index GPU index + * @param [in] device GPU device + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + */ + void addProcessUsage( + int index, nvmlDevice_t device, diagnostic_updater::DiagnosticStatusWrapper & stat); + + /** + * @brief check GPU memory usage + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference + * to pass diagnostic message updated in this function to diagnostic publish calls. + */ + void checkMemoryUsage( + diagnostic_updater::DiagnosticStatusWrapper & stat) override; // NOLINT(runtime/references) + + /** + * @brief check GPU throttling + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference + * to pass diagnostic message updated in this function to diagnostic publish calls. + */ + void checkThrottling( + diagnostic_updater::DiagnosticStatusWrapper & stat) override; // NOLINT(runtime/references) + + /** + * @brief get human-readable output for memory size + * @param [in] size size with bytes + * @return human-readable output + * @note NOLINT syntax is needed since struct nvmlMemory_t has unsigned long long values to return + * memory size. 
+ */ + std::string toHumanReadable(unsigned long long size); // NOLINT(runtime/int) + + static const size_t MAX_ARRAY_SIZE = 64; + static const size_t MAX_NAME_LENGTH = 128; + + std::vector gpus_; //!< @brief list of gpus + uint64_t current_timestamp_ = 0; //!< @brief latest timestamp[usec] of addProcessUsage() +}; + +#endif // SYSTEM_MONITOR__GPU_MONITOR__NVML_GPU_MONITOR_HPP_ diff --git a/system/system_monitor/include/system_monitor/gpu_monitor/tegra_gpu_monitor.hpp b/system/system_monitor/include/system_monitor/gpu_monitor/tegra_gpu_monitor.hpp new file mode 100644 index 0000000000000..c8d513d64f296 --- /dev/null +++ b/system/system_monitor/include/system_monitor/gpu_monitor/tegra_gpu_monitor.hpp @@ -0,0 +1,103 @@ +// Copyright 2020 Tier IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/** + * @file tegra_gpu_monitor.h + * @brief Tegra TGPU monitor class + */ + +#ifndef SYSTEM_MONITOR__GPU_MONITOR__TEGRA_GPU_MONITOR_HPP_ +#define SYSTEM_MONITOR__GPU_MONITOR__TEGRA_GPU_MONITOR_HPP_ + +#include "system_monitor/gpu_monitor/gpu_monitor_base.hpp" + +#include +#include + +struct gpu_info +{ + std::string label_; //!< @brief gpu label + std::string path_; //!< @brief sysfs path to gpu temperature + + gpu_info() : label_(), path_() {} + gpu_info(const std::string & l, const std::string & p) : label_(l), path_(p) {} +}; + +class GPUMonitor : public GPUMonitorBase +{ +public: + /** + * @brief constructor + * @param [in] node_name Name of the node. 
+ * @param [in] options Options associated with this node. + */ + explicit GPUMonitor(const rclcpp::NodeOptions & options); + +protected: + /** + * @brief check GPU temperature + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference + * to pass diagnostic message updated in this function to diagnostic publish calls. + */ + void checkTemp( + diagnostic_updater::DiagnosticStatusWrapper & stat) override; // NOLINT(runtime/references) + + /** + * @brief check GPU usage + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference + * to pass diagnostic message updated in this function to diagnostic publish calls. + */ + void checkUsage( + diagnostic_updater::DiagnosticStatusWrapper & stat) override; // NOLINT(runtime/references) + + /** + * @brief check GPU throttling + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference + * to pass diagnostic message updated in this function to diagnostic publish calls. + */ + void checkThrottling( + diagnostic_updater::DiagnosticStatusWrapper & stat) override; // NOLINT(runtime/references) + + /** + * @brief check GPU frequency + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference + * to pass diagnostic message updated in this function to diagnostic publish calls. 
+ */ + void checkFrequency( + diagnostic_updater::DiagnosticStatusWrapper & stat) override; // NOLINT(runtime/references) + + /** + * @brief get names for gpu temperature files + */ + void getTempNames(); + + /** + * @brief get names for gpu load files + */ + void getLoadNames(); + + /** + * @brief get names for gpu frequency files + */ + void getFreqNames(); + + std::vector temps_; //!< @brief GPU list for temperature + std::vector loads_; //!< @brief GPU list for utilization + std::vector freqs_; //!< @brief GPU list for frequency +}; + +#endif // SYSTEM_MONITOR__GPU_MONITOR__TEGRA_GPU_MONITOR_HPP_ diff --git a/system/system_monitor/include/system_monitor/gpu_monitor/unknown_gpu_monitor.hpp b/system/system_monitor/include/system_monitor/gpu_monitor/unknown_gpu_monitor.hpp new file mode 100644 index 0000000000000..a512c2e9109bf --- /dev/null +++ b/system/system_monitor/include/system_monitor/gpu_monitor/unknown_gpu_monitor.hpp @@ -0,0 +1,37 @@ +// Copyright 2020 Tier IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** + * @file unknown_gpu_monitor.h + * @brief Unknown GPU monitor class + */ + +#ifndef SYSTEM_MONITOR__GPU_MONITOR__UNKNOWN_GPU_MONITOR_HPP_ +#define SYSTEM_MONITOR__GPU_MONITOR__UNKNOWN_GPU_MONITOR_HPP_ + +#include "system_monitor/gpu_monitor/gpu_monitor_base.hpp" + +#include + +class GPUMonitor : public GPUMonitorBase +{ +public: + /** + * @brief constructor + * @param [in] options Options associated with this node. 
+ */ + explicit GPUMonitor(const rclcpp::NodeOptions & options); +}; + +#endif // SYSTEM_MONITOR__GPU_MONITOR__UNKNOWN_GPU_MONITOR_HPP_ diff --git a/system/system_monitor/include/system_monitor/hdd_monitor/hdd_monitor.hpp b/system/system_monitor/include/system_monitor/hdd_monitor/hdd_monitor.hpp new file mode 100644 index 0000000000000..4ed7fe0a89c85 --- /dev/null +++ b/system/system_monitor/include/system_monitor/hdd_monitor/hdd_monitor.hpp @@ -0,0 +1,105 @@ +// Copyright 2020 Tier IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** + * @file hdd_monitor.h + * @brief HDD monitor class + */ + +#ifndef SYSTEM_MONITOR__HDD_MONITOR__HDD_MONITOR_HPP_ +#define SYSTEM_MONITOR__HDD_MONITOR__HDD_MONITOR_HPP_ + +#include + +#include +#include +#include +#include + +/** + * @brief error and warning temperature levels + */ +struct HDDParam +{ + float temp_warn_; //!< @brief HDD temperature(DegC) to generate warning + float temp_error_; //!< @brief HDD temperature(DegC) to generate error + float usage_warn_; //!< @brief HDD usage(%) to generate warning + float usage_error_; //!< @brief HDD usage(%) to generate error + + HDDParam() : temp_warn_(55.0), temp_error_(70.0), usage_warn_(0.95), usage_error_(0.99) {} +}; + +class HDDMonitor : public rclcpp::Node +{ +public: + /** + * @brief constructor + * @param [in] options Options associated with this node. 
+ */ + explicit HDDMonitor(const rclcpp::NodeOptions & options); + + /** + * @brief Update the diagnostic state. + */ + void update(); + +protected: + using DiagStatus = diagnostic_msgs::msg::DiagnosticStatus; + + /** + * @brief check HDD temperature + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference + * to pass diagnostic message updated in this function to diagnostic publish calls. + */ + void checkTemp(diagnostic_updater::DiagnosticStatusWrapper & stat); // NOLINT(runtime/references) + + /** + * @brief check HDD usage + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference + * to pass diagnostic message updated in this function to diagnostic publish calls. + */ + void checkUsage( + diagnostic_updater::DiagnosticStatusWrapper & stat); // NOLINT(runtime/references) + + /** + * @brief get HDD parameters + */ + void getHDDParams(); + + diagnostic_updater::Updater updater_; //!< @brief Updater class which advertises to /diagnostics + + char hostname_[HOST_NAME_MAX + 1]; //!< @brief host name + + int hdd_reader_port_; //!< @brief port number to connect to hdd_reader + std::map hdd_params_; //!< @brief list of error and warning levels + std::vector hdd_devices_; //!< @brief list of devices + + /** + * @brief HDD temperature status messages + */ + const std::map temp_dict_ = { + {DiagStatus::OK, "OK"}, {DiagStatus::WARN, "hot"}, {DiagStatus::ERROR, "critical hot"}}; + + /** + * @brief HDD usage status messages + */ + const std::map usage_dict_ = { + {DiagStatus::OK, "OK"}, + {DiagStatus::WARN, "low disk space"}, + {DiagStatus::ERROR, "very low disk space"}}; +}; + +#endif // SYSTEM_MONITOR__HDD_MONITOR__HDD_MONITOR_HPP_ diff --git a/system/system_monitor/include/system_monitor/mem_monitor/mem_monitor.hpp 
b/system/system_monitor/include/system_monitor/mem_monitor/mem_monitor.hpp new file mode 100644 index 0000000000000..697f4e2e1bac6 --- /dev/null +++ b/system/system_monitor/include/system_monitor/mem_monitor/mem_monitor.hpp @@ -0,0 +1,76 @@ +// Copyright 2020 Tier IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** + * @file mem_monitor.hpp + * @brief Memory monitor class + */ + +#ifndef SYSTEM_MONITOR__MEM_MONITOR__MEM_MONITOR_HPP_ +#define SYSTEM_MONITOR__MEM_MONITOR__MEM_MONITOR_HPP_ + +#include + +#include +#include +#include + +class MemMonitor : public rclcpp::Node +{ +public: + /** + * @brief constructor + * @param [in] options Options associated with this node. + */ + explicit MemMonitor(const rclcpp::NodeOptions & options); + + /** + * @brief Update the diagnostic state. + */ + void update(); + +protected: + using DiagStatus = diagnostic_msgs::msg::DiagnosticStatus; + + /** + * @brief check Memory usage + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference + * to pass diagnostic message updated in this function to diagnostic publish calls.
+ */ + void checkUsage( + diagnostic_updater::DiagnosticStatusWrapper & stat); // NOLINT(runtime/references) + + /** + * @brief get human-readable output for memory size + * @param [in] str size in bytes, passed as a string + * @return human-readable output + */ + std::string toHumanReadable(const std::string & str); + + diagnostic_updater::Updater updater_; //!< @brief Updater class which advertises to /diagnostics + + char hostname_[HOST_NAME_MAX + 1]; //!< @brief host name + + float usage_warn_; //!< @brief Memory usage(%) to generate warning + float usage_error_; //!< @brief Memory usage(%) to generate error + + /** + * @brief Memory usage status messages + */ + const std::map usage_dict_ = { + {DiagStatus::OK, "OK"}, {DiagStatus::WARN, "high load"}, {DiagStatus::ERROR, "very high load"}}; +}; + +#endif // SYSTEM_MONITOR__MEM_MONITOR__MEM_MONITOR_HPP_ diff --git a/system/system_monitor/include/system_monitor/net_monitor/net_monitor.hpp b/system/system_monitor/include/system_monitor/net_monitor/net_monitor.hpp new file mode 100644 index 0000000000000..834dd2950ab69 --- /dev/null +++ b/system/system_monitor/include/system_monitor/net_monitor/net_monitor.hpp @@ -0,0 +1,104 @@ +// Copyright 2020 Tier IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +/** + * @file net_monitor.hpp + * @brief Net monitor class + */ + +#ifndef SYSTEM_MONITOR__NET_MONITOR__NET_MONITOR_HPP_ +#define SYSTEM_MONITOR__NET_MONITOR__NET_MONITOR_HPP_ + +#include "system_monitor/net_monitor/nl80211.hpp" + +#include + +#include +#include +#include +#include + +#define toMbit(X) (static_cast(X) / 1000000 * 8) + +/** + * @brief Bytes information + */ +typedef struct bytes +{ + unsigned int rx_bytes; //!< @brief total bytes received + unsigned int tx_bytes; //!< @brief total bytes transmitted + + bytes() : rx_bytes(0), tx_bytes(0) {} +} bytes; + +class NetMonitor : public rclcpp::Node +{ +public: + /** + * @brief constructor + * @param [in] options Options associated with this node. + */ + explicit NetMonitor(const rclcpp::NodeOptions & options); + /** + * @brief destructor + */ + ~NetMonitor(); + + /** + * @brief Update the diagnostic state. + */ + void update(); + + /** + * @brief Shutdown nl80211 object + */ + void shutdown_nl80211(); + +protected: + using DiagStatus = diagnostic_msgs::msg::DiagnosticStatus; + + /** + * @brief check network usage + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference + * to pass diagnostic message updated in this function to diagnostic publish calls.
+ */ + void checkUsage( + diagnostic_updater::DiagnosticStatusWrapper & stat); // NOLINT(runtime/references) + + /** + * @brief get wireless speed + * @param [in] ifa_name interface name + * @return wireless speed + */ + float getWirelessSpeed(const char * ifa_name); + + diagnostic_updater::Updater updater_; //!< @brief Updater class which advertises to /diagnostics + + char hostname_[HOST_NAME_MAX + 1]; //!< @brief host name + std::map bytes_; //!< @brief list of bytes + rclcpp::Time last_update_time_; //!< @brief last update time + std::vector device_params_; //!< @brief list of devices + NL80211 nl80211_; //!< @brief 802.11 netlink-based interface + + float usage_warn_; //!< @brief Network usage(%) to generate warning + + /** + * @brief Network usage status messages + */ + const std::map usage_dict_ = { + {DiagStatus::OK, "OK"}, {DiagStatus::WARN, "high load"}, {DiagStatus::ERROR, "down"}}; +}; + +#endif // SYSTEM_MONITOR__NET_MONITOR__NET_MONITOR_HPP_ diff --git a/system/system_monitor/include/system_monitor/net_monitor/nl80211.hpp b/system/system_monitor/include/system_monitor/net_monitor/nl80211.hpp new file mode 100644 index 0000000000000..0400b1524cca0 --- /dev/null +++ b/system/system_monitor/include/system_monitor/net_monitor/nl80211.hpp @@ -0,0 +1,60 @@ +// Copyright 2020 Tier IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +/** + * @file nl80211.hpp + * @brief 802.11 netlink-based interface class + */ + +#ifndef SYSTEM_MONITOR__NET_MONITOR__NL80211_HPP_ +#define SYSTEM_MONITOR__NET_MONITOR__NL80211_HPP_ + +class NL80211 +{ +public: + /** + * @brief constructor + * @note This constructor takes no arguments (this class is not declared as a node here). + * See init() for initialization. + */ + NL80211(); + + /** + * @brief initialize + */ + void init(); + + /** + * @brief get bitrate + * @param [in] ifa_name interface name + * @return bitrate + */ + float getBitrate(const char * ifa_name); + + /** + * @brief shutdown + */ + void shutdown(); + + float bitrate_; //!< @brief bitrate + +private: + bool + initialized_; //!< @brief Indicating whether initialization was completed successfully or not + struct nl_sock * socket_; //!< @brief Netlink socket + int id_; //!< @brief Generic netlink family id + struct nl_cb * cb_; //!< @brief Callback handle +}; + +#endif // SYSTEM_MONITOR__NET_MONITOR__NL80211_HPP_ diff --git a/system/system_monitor/include/system_monitor/ntp_monitor/ntp_monitor.hpp b/system/system_monitor/include/system_monitor/ntp_monitor/ntp_monitor.hpp new file mode 100644 index 0000000000000..c319b8e35d5dd --- /dev/null +++ b/system/system_monitor/include/system_monitor/ntp_monitor/ntp_monitor.hpp @@ -0,0 +1,80 @@ +// Copyright 2020 Tier IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +/** + * @file ntp_monitor.hpp + * @brief NTP monitor class + */ + +#ifndef SYSTEM_MONITOR__NTP_MONITOR__NTP_MONITOR_HPP_ +#define SYSTEM_MONITOR__NTP_MONITOR__NTP_MONITOR_HPP_ + +#include + +#include +#include +#include +#include + +class NTPMonitor : public rclcpp::Node +{ +public: + /** + * @brief constructor + * @param [in] options Options associated with this node. + */ + explicit NTPMonitor(const rclcpp::NodeOptions & options); + + /** + * @brief Update the diagnostic state. + */ + void update(); + +protected: + using DiagStatus = diagnostic_msgs::msg::DiagnosticStatus; + + /** + * @brief check NTP Offset + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference + * to pass diagnostic message updated in this function to diagnostic publish calls. + */ + void checkOffset( + diagnostic_updater::DiagnosticStatusWrapper & stat); // NOLINT(runtime/references) + + /** + * @brief execute chronyc + * @param [out] outOffset offset value of NTP time + * @param [out] out_tracking_map "chronyc tracking" output for diagnostic + * @return error string if an error occurred (presumably empty otherwise; confirm in the .cpp) + */ + std::string executeChronyc( + float & outOffset, std::map & out_tracking_map); + + diagnostic_updater::Updater updater_; //!< @brief Updater class which advertises to /diagnostics + + char hostname_[HOST_NAME_MAX + 1]; //!< @brief host name + bool chronyc_exists_; //!< @brief flag if chronyc exists + + float offset_warn_; //!< @brief NTP offset(sec) to generate warning + float offset_error_; //!< @brief NTP offset(sec) to generate error + + /** + * @brief NTP offset status messages + */ + const std::map offset_dict_ = { + {DiagStatus::OK, "OK"}, {DiagStatus::WARN, "high"}, {DiagStatus::ERROR, "too high"}}; +}; + +#endif // SYSTEM_MONITOR__NTP_MONITOR__NTP_MONITOR_HPP_ diff --git a/system/system_monitor/include/system_monitor/process_monitor/diag_task.hpp
b/system/system_monitor/include/system_monitor/process_monitor/diag_task.hpp new file mode 100644 index 0000000000000..183f86baa2a08 --- /dev/null +++ b/system/system_monitor/include/system_monitor/process_monitor/diag_task.hpp @@ -0,0 +1,179 @@ +// Copyright 2020 Tier IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** + * @file diag_task.h + * @brief diagnostics task for high load/memory procs + */ + +#ifndef SYSTEM_MONITOR__PROCESS_MONITOR__DIAG_TASK_HPP_ +#define SYSTEM_MONITOR__PROCESS_MONITOR__DIAG_TASK_HPP_ + +#include + +#include + +class DiagTask : public diagnostic_updater::DiagnosticTask +{ +public: + using DiagStatus = diagnostic_msgs::msg::DiagnosticStatus; + + /** + * @brief constructor + * @param [in] name diagnostics status name + */ + explicit DiagTask(const std::string & name) : DiagnosticTask(name) {} + + /** + * @brief main loop + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + */ + void run(diagnostic_updater::DiagnosticStatusWrapper & stat) + { + stat.summary(level_, message_); + + if (level_ != DiagStatus::OK) { + stat.add("content", content_); + } else { + stat.add("COMMAND", command_); + stat.add("%CPU", cpu_); + stat.add("%MEM", mem_); + stat.add("PID", pid_); + stat.add("USER", user_); + stat.add("PR", pr_); + stat.add("NI", ni_); + stat.add("VIRT", virt_); + stat.add("RES", res_); + stat.add("SHR", shr_); + stat.add("S", s_); + stat.add("TIME+", time_); + } + stat.summary(level_, 
message_); + } + + /** + * @brief set diagnostics status + * @param [in] status Diagnostics error level + * @param [in] message Diagnostics status message + */ + void setDiagnosticsStatus(int level, const std::string & message) + { + level_ = level; + message_ = message; + } + + /** + * @brief set error content + * @param [in] error_command Error command + * @param [in] content Error content + */ + void setErrorContent(const std::string & error_command, const std::string & content) + { + error_command_ = error_command; + content_ = content; + } + + /** + * @brief set process id + * @param [in] pid process id + */ + void setProcessId(const std::string & pid) { pid_ = pid; } + + /** + * @brief set user name + * @param [in] user user name + */ + void setUserName(const std::string & user) { user_ = user; } + + /** + * @brief set priority + * @param [in] pr priority + */ + void setPriority(const std::string & pr) { pr_ = pr; } + + /** + * @brief set nice value + * @param [in] ni nice value + */ + void setNiceValue(const std::string & ni) { ni_ = ni; } + + /** + * @brief set virtual image + * @param [in] virt virtual image + */ + void setVirtualImage(const std::string & virt) { virt_ = virt; } + + /** + * @brief set resident size + * @param [in] res resident size + */ + void setResidentSize(const std::string & res) { res_ = res; } + + /** + * @brief set shared mem size + * @param [in] shr shared mem size + */ + void setSharedMemSize(const std::string & shr) { shr_ = shr; } + + /** + * @brief set process status + * @param [in] s process status + */ + void setProcessStatus(const std::string & s) { s_ = s; } + + /** + * @brief set CPU usage + * @param [in] cpu CPU usage + */ + void setCPUUsage(const std::string & cpu) { cpu_ = cpu; } + + /** + * @brief set memory usage + * @param [in] mem memory usage + */ + void setMemoryUsage(const std::string & mem) { mem_ = mem; } + + /** + * @brief set CPU time + * @param [in] time CPU time + */ + void setCPUTime(const std::string & 
time) { time_ = time; } + + /** + * @brief set Command name/line + * @param [in] command Command name/line + */ + void setCommandName(const std::string & command) { command_ = command; } + +private: + int level_; //!< @brief Diagnostics error level + std::string message_; //!< @brief Diagnostics status message + std::string error_command_; //!< @brief Error command + std::string content_; //!< @brief Error content + + std::string pid_; //!< @brief Process Id + std::string user_; //!< @brief User Name + std::string pr_; //!< @brief Priority + std::string ni_; //!< @brief Nice value + std::string virt_; //!< @brief Virtual Image (kb) + std::string res_; //!< @brief Resident size (kb) + std::string shr_; //!< @brief Shared Mem size (kb) + std::string s_; //!< @brief Process Status + std::string cpu_; //!< @brief CPU usage + std::string mem_; //!< @brief Memory usage + std::string time_; //!< @brief CPU Time + std::string command_; //!< @brief Command name/line +}; + +#endif // SYSTEM_MONITOR__PROCESS_MONITOR__DIAG_TASK_HPP_ diff --git a/system/system_monitor/include/system_monitor/process_monitor/process_monitor.hpp b/system/system_monitor/include/system_monitor/process_monitor/process_monitor.hpp new file mode 100644 index 0000000000000..39bf897116b0a --- /dev/null +++ b/system/system_monitor/include/system_monitor/process_monitor/process_monitor.hpp @@ -0,0 +1,124 @@ +// Copyright 2020 Tier IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+/**
+ * @file process_monitor.hpp
+ * @brief Process monitor class
+ */
+
+#ifndef SYSTEM_MONITOR__PROCESS_MONITOR__PROCESS_MONITOR_HPP_
+#define SYSTEM_MONITOR__PROCESS_MONITOR__PROCESS_MONITOR_HPP_
+
+#include "system_monitor/process_monitor/diag_task.hpp"
+
+#include
+
+#include
+
+#include
+#include
+#include
+
+namespace bp = boost::process;
+
+class ProcessMonitor : public rclcpp::Node
+{
+public:
+  /**
+   * @brief constructor
+   * @param [in] options Options associated with this node.
+   */
+  explicit ProcessMonitor(const rclcpp::NodeOptions & options);
+
+  /**
+   * @brief Update the diagnostic state
+   */
+  void update();
+
+protected:
+  using DiagStatus = diagnostic_msgs::msg::DiagnosticStatus;
+
+  /**
+   * @brief monitor processes
+   * @param [out] stat diagnostic message passed directly to diagnostic publish calls
+   * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference
+   * to pass diagnostic message updated in this function to diagnostic publish calls.
+   */
+  void monitorProcesses(
+    diagnostic_updater::DiagnosticStatusWrapper & stat);  // NOLINT(runtime/references)
+
+  /**
+   * @brief get task summary
+   * @param [out] stat diagnostic message passed directly to diagnostic publish calls
+   * @param [in] output top command output
+   * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference
+   * to pass diagnostic message updated in this function to diagnostic publish calls.
+   */
+  void getTasksSummary(
+    diagnostic_updater::DiagnosticStatusWrapper & stat,
+    const std::string & output);  // NOLINT(runtime/references)
+
+  /**
+   * @brief remove header
+   * @param [out] stat diagnostic message passed directly to diagnostic publish calls
+   * @param [in,out] output top command output; taken by non-const reference, so it may be edited
+   * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference
+   * to pass diagnostic message updated in this function to diagnostic publish calls.
+   */
+  void removeHeader(
+    diagnostic_updater::DiagnosticStatusWrapper & stat,
+    std::string & output);  // NOLINT(runtime/references)
+
+  /**
+   * @brief get high load processes
+   * @param [in] output top command output
+   */
+  void getHighLoadProcesses(const std::string & output);
+
+  /**
+   * @brief get high memory processes
+   * @param [in] output top command output
+   */
+  void getHighMemoryProcesses(const std::string & output);
+
+  /**
+   * @brief get top-rated processes
+   * @param [in,out] tasks list of diagnostics tasks (presumably filled here - TODO confirm in .cpp)
+   * @param [in] p pipe to read from (presumably carries the top command output - TODO confirm)
+   */
+  void getTopratedProcesses(std::vector> * tasks, bp::pipe * p);
+
+  /**
+   * @brief set error content for a list of diagnostics tasks
+   * @param [in,out] tasks list of diagnostics tasks (presumably updated here - TODO confirm)
+   * @param [in] message Diagnostics status message
+   * @param [in] error_command Error command
+   * @param [in] content Error content
+   */
+  void setErrorContent(
+    std::vector> * tasks, const std::string & message,
+    const std::string & error_command, const std::string & content);
+
+  diagnostic_updater::Updater updater_;  //!< @brief Updater class which advertises to /diagnostics
+
+  char hostname_[HOST_NAME_MAX + 1];  //!< @brief host name
+
+  int num_of_procs_;  //!< @brief number of processes to show
+  std::vector>
+    load_tasks_;  //!< @brief list of diagnostics tasks for high load procs
+  std::vector>
+    memory_tasks_;  //!< @brief list of diagnostics tasks for high memory procs
+};
+
+#endif  // SYSTEM_MONITOR__PROCESS_MONITOR__PROCESS_MONITOR_HPP_
diff --git a/system/system_monitor/include/system_monitor/system_monitor_utility.hpp b/system/system_monitor/include/system_monitor/system_monitor_utility.hpp
new file mode 100644
index 0000000000000..fc27757394aee
--- /dev/null
+++ b/system/system_monitor/include/system_monitor/system_monitor_utility.hpp
@@ -0,0 +1,124 @@
+// Copyright 2020 Tier IV, Inc.
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** + * @file system_monitor_utility.h + * @brief System Monitor Utility class + */ + +#ifndef SYSTEM_MONITOR__SYSTEM_MONITOR_UTILITY_HPP_ +#define SYSTEM_MONITOR__SYSTEM_MONITOR_UTILITY_HPP_ + +#include +#include +#include + +#include +#include +#include +#include + +namespace fs = boost::filesystem; + +typedef struct thermal_zone +{ + std::string type_; //!< @brief thermal zone name + std::string label_; //!< @brief thermal_zone[0-9] + std::string path_; //!< @brief sysfs path to temperature + + thermal_zone() : type_(), label_(), path_() {} + thermal_zone(const std::string & type, const std::string & label, const std::string & path) + : type_(type), label_(label), path_(path) + { + } +} thermal_zone; + +class SystemMonitorUtility +{ +public: + /** + * @brief get thermal zone information + * @param [in] t thermal zone name + * @param [in] pointer to thermal zone information + */ + static void getThermalZone(const std::string & t, std::vector * therm) + { + if (therm == nullptr) { + return; + } + + therm->clear(); + + const fs::path root("/sys/class/thermal"); + + for (const fs::path & path : + boost::make_iterator_range(fs::directory_iterator(root), fs::directory_iterator())) { + if (!fs::is_directory(path)) { + continue; + } + + std::cmatch match; + const char * therm_dir = path.generic_string().c_str(); + + // not thermal_zone[0-9] + if (!std::regex_match(therm_dir, match, 
std::regex(".*/thermal_zone(\\d+)"))) { + continue; + } + + std::string type; + const fs::path type_path = path / "type"; + fs::ifstream ifs(type_path, std::ios::in); + if (ifs) { + std::string line; + if (std::getline(ifs, line)) { + type = line; + } + } + ifs.close(); + + if (type != t) { + continue; + } + + const fs::path temp_path = path / "temp"; + therm->emplace_back(t, path.filename().generic_string(), temp_path.generic_string()); + } + } + + /** + * @brief Remember start time to measure elapsed time + * @return start time + */ + static std::chrono::high_resolution_clock::time_point startMeasurement() + { + return std::chrono::high_resolution_clock::now(); + } + + /** + * @brief Measure elapsed time since start time and report + * @param [in] t_start start time + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + */ + static void stopMeasurement( + const std::chrono::high_resolution_clock::time_point & start, + diagnostic_updater::DiagnosticStatusWrapper & stat) + { + // Measure elapsed time since start time and report + const auto t_end = std::chrono::high_resolution_clock::now(); + const float elapsed_ms = std::chrono::duration(t_end - start).count(); + stat.addf("execution time", "%f ms", elapsed_ms); + } +}; + +#endif // SYSTEM_MONITOR__SYSTEM_MONITOR_UTILITY_HPP_ diff --git a/system/system_monitor/launch/system_monitor.launch.py b/system/system_monitor/launch/system_monitor.launch.py new file mode 100644 index 0000000000000..1a54f8954dce2 --- /dev/null +++ b/system/system_monitor/launch/system_monitor.launch.py @@ -0,0 +1,156 @@ +# Copyright 2020 Tier IV, Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from ament_index_python.packages import get_package_share_directory +import launch +from launch.actions import DeclareLaunchArgument +from launch.actions import OpaqueFunction +from launch.substitutions import LaunchConfiguration +from launch_ros.actions import ComposableNodeContainer +from launch_ros.descriptions import ComposableNode +import yaml + + +def launch_setup(context, *args, **kwargs): + + with open(LaunchConfiguration("cpu_monitor_config_file").perform(context), "r") as f: + cpu_monitor_config = yaml.safe_load(f)["/**"]["ros__parameters"] + cpu_monitor = ComposableNode( + package="system_monitor", + plugin="CPUMonitor", + name="cpu_monitor", + parameters=[ + cpu_monitor_config, + ], + ) + with open(LaunchConfiguration("hdd_monitor_config_file").perform(context), "r") as f: + hdd_monitor_config = yaml.safe_load(f)["/**"]["ros__parameters"] + hdd_monitor = ComposableNode( + package="system_monitor", + plugin="HDDMonitor", + name="hdd_monitor", + parameters=[ + hdd_monitor_config, + ], + ) + with open(LaunchConfiguration("mem_monitor_config_file").perform(context), "r") as f: + mem_monitor_config = yaml.safe_load(f)["/**"]["ros__parameters"] + mem_monitor = ComposableNode( + package="system_monitor", + plugin="MemMonitor", + name="mem_monitor", + parameters=[ + mem_monitor_config, + ], + ) + with open(LaunchConfiguration("net_monitor_config_file").perform(context), "r") as f: + net_monitor_config = yaml.safe_load(f)["/**"]["ros__parameters"] + net_monitor = ComposableNode( + package="system_monitor", + plugin="NetMonitor", + 
name="net_monitor", + parameters=[ + net_monitor_config, + ], + ) + with open(LaunchConfiguration("ntp_monitor_config_file").perform(context), "r") as f: + ntp_monitor_config = yaml.safe_load(f)["/**"]["ros__parameters"] + ntp_monitor = ComposableNode( + package="system_monitor", + plugin="NTPMonitor", + name="ntp_monitor", + parameters=[ + ntp_monitor_config, + ], + ) + with open(LaunchConfiguration("process_monitor_config_file").perform(context), "r") as f: + process_monitor_config = yaml.safe_load(f)["/**"]["ros__parameters"] + process_monitor = ComposableNode( + package="system_monitor", + plugin="ProcessMonitor", + name="process_monitor", + parameters=[ + process_monitor_config, + ], + ) + with open(LaunchConfiguration("gpu_monitor_config_file").perform(context), "r") as f: + gpu_monitor_config = yaml.safe_load(f)["/**"]["ros__parameters"] + gpu_monitor = ComposableNode( + package="system_monitor", + plugin="GPUMonitor", + name="gpu_monitor", + parameters=[ + gpu_monitor_config, + ], + ) + + # set container to run all required components in the same process + container = ComposableNodeContainer( + name="system_monitor_container", + namespace="system_monitor", + package="rclcpp_components", + executable="component_container_mt", + composable_node_descriptions=[ + cpu_monitor, + hdd_monitor, + mem_monitor, + net_monitor, + ntp_monitor, + process_monitor, + gpu_monitor, + ], + output="screen", + ) + return [container] + + +def generate_launch_description(): + system_monitor_path = os.path.join( + get_package_share_directory("system_launch"), "config", "system_monitor" + ) + return launch.LaunchDescription( + [ + DeclareLaunchArgument( + "cpu_monitor_config_file", + default_value=os.path.join(system_monitor_path, "cpu_monitor.param.yaml"), + ), + DeclareLaunchArgument( + "hdd_monitor_config_file", + default_value=os.path.join(system_monitor_path, "hdd_monitor.param.yaml"), + ), + DeclareLaunchArgument( + "mem_monitor_config_file", + 
default_value=os.path.join(system_monitor_path, "mem_monitor.param.yaml"), + ), + DeclareLaunchArgument( + "net_monitor_config_file", + default_value=os.path.join(system_monitor_path, "net_monitor.param.yaml"), + ), + DeclareLaunchArgument( + "ntp_monitor_config_file", + default_value=os.path.join(system_monitor_path, "ntp_monitor.param.yaml"), + ), + DeclareLaunchArgument( + "process_monitor_config_file", + default_value=os.path.join(system_monitor_path, "process_monitor.param.yaml"), + ), + DeclareLaunchArgument( + "gpu_monitor_config_file", + default_value=os.path.join(system_monitor_path, "gpu_monitor.param.yaml"), + ), + OpaqueFunction(function=launch_setup), + ] + ) diff --git a/system/system_monitor/launch/system_monitor.launch.xml b/system/system_monitor/launch/system_monitor.launch.xml new file mode 100644 index 0000000000000..ac6e2918cd18c --- /dev/null +++ b/system/system_monitor/launch/system_monitor.launch.xml @@ -0,0 +1,33 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/system/system_monitor/package.xml b/system/system_monitor/package.xml new file mode 100644 index 0000000000000..3e44457ebdfb6 --- /dev/null +++ b/system/system_monitor/package.xml @@ -0,0 +1,31 @@ + + + + system_monitor + 1.13.0 + The system_monitor package + Fumihito Ito + + Apache License 2.0 + + ament_cmake_auto + + diagnostic_msgs + diagnostic_updater + fmt + libnl-3-dev + rclcpp + rclcpp_components + std_msgs + + chrony + sysstat + + ament_cmake_gtest + ament_lint_auto + autoware_lint_common + + + ament_cmake + + diff --git a/system/system_monitor/reader/hdd_reader/hdd_reader.cpp b/system/system_monitor/reader/hdd_reader/hdd_reader.cpp new file mode 100644 index 0000000000000..b5b691e64c854 --- /dev/null +++ b/system/system_monitor/reader/hdd_reader/hdd_reader.cpp @@ -0,0 +1,591 @@ +// Copyright 2020 Autoware Foundation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with 
the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+ * @file hdd_reader.cpp
+ * @brief HDD information read class
+ */
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+// Default TCP port the reader listens on (overridable with -p/--port)
+// 7634-7647 Unassigned
+constexpr int PORT = 7635;
+
+/**
+ * @brief HDD information
+ * @note NOTE(review): the functions in this file take `HDDInfo` (with `temp_` and
+ *       `error_code_`), not this struct; `HDD_Info` is not referenced in the visible part
+ *       of this file - confirm whether it is dead code.
+ */
+struct HDD_Info
+{
+  std::string model_;   //!< @brief Model number
+  std::string serial_;  //!< @brief Serial number
+  int temperature_;     //!< @brief Temperature
+};
+
+/**
+ * @brief ATA PASS-THROUGH (12) command
+ * @note For details please see the document below.
+ *       - ATA Command Pass-Through
+ *         https://www.t10.org/ftp/t10/document.04/04-262r8.pdf
+ * @note The object is sent as-is as the SCSI command (`cmdp`), and `sizeof` of it is used
+ *       as the command length.
+ *       NOTE(review): bit-field allocation order is implementation-defined; this layout
+ *       presumably relies on low-bit-first allocation - confirm for each target ABI.
+ */
+struct ATAPassThrough12
+{
+  uint8_t operation_code_;      //!< @brief OPERATION CODE (A1h)
+  uint8_t reserved0_ : 1;       //!< @brief Reserved
+  uint8_t protocol_ : 4;        //!< @brief PROTOCOL
+  uint8_t multiple_count_ : 3;  //!< @brief MULTIPLE_COUNT
+  uint8_t t_length_ : 2;        //!< @brief T_LENGTH
+  uint8_t byt_blok_ : 1;        //!< @brief BYT_BLOK
+  uint8_t t_dir_ : 1;           //!< @brief T_DIR
+  uint8_t reserved1_ : 1;       //!< @brief Reserved
+  uint8_t ck_cond_ : 1;         //!< @brief CK_COND
+  uint8_t off_line_ : 2;        //!< @brief OFF_LINE
+  uint8_t features_;            //!< @brief FEATURES (0:7)
+  uint8_t sector_count_;        //!< @brief SECTOR_COUNT (0:7)
+  uint8_t lba_low_;             //!< @brief LBA_LOW (0:7)
+  uint8_t lba_mid_;             //!< @brief LBA_MID (0:7)
+  uint8_t lbe_high_;            //!< @brief LBA_HIGH (0:7) (member name is misspelled)
+  uint8_t device_;              //!< @brief DEVICE
+  uint8_t command_;             //!< @brief COMMAND
+  uint8_t reserved2_;           //!< @brief Reserved
+  uint8_t control_;             //!< @brief CONTROL
+};
+
+/**
+ * @brief Attribute Table Format
+ * @note For details please see the documents below.
+ *       - SMART Attribute Overview
+ *         http://www.t13.org/Documents/UploadedDocuments/docs2005/e05171r0-ACS-SMARTAttributes_Overview.pdf
+ * @note One entry of the attribute table embedded in SMARTData; packed so that no padding
+ *       is inserted between the fields.
+ */
+struct AttributeEntry
+{
+  uint8_t attribute_id_;  //!< @brief Attribute ID
+  // Flags
+  uint16_t warranty_ : 1;           //!< @brief Bit 0 – Warranty
+  uint16_t offline_ : 1;            //!< @brief Bit 1 – Offline
+  uint16_t performance_ : 1;        //!< @brief Bit 2 – Performance
+  uint16_t error_rate_ : 1;         //!< @brief Bit 3 – Error rate
+  uint16_t event_count_ : 1;        //!< @brief Bit 4 – Event count
+  uint16_t self_preservation_ : 1;  //!< @brief Bit 5 – Self-preservation
+  uint16_t reserved_ : 10;          //!< @brief Bits 6–15 – Reserved
+
+  uint8_t current_value_;        //!< @brief Current value
+  uint8_t worst_value_;          //!< @brief Worst value
+  uint32_t data_;                //!< @brief Data
+  uint16_t attribute_specific_;  //!< @brief Attribute-specific
+  uint8_t threshold_;            //!< @brief Threshold
+} __attribute__((packed));       // Minimize total struct memory 16 to 12
+
+/**
+ * @brief Device SMART data structure
+ * @note For details please see the documents below.
+ *       - ATA/ATAPI Command Set - 3 (ACS-3)
+ *         http://www.t13.org/Documents/UploadedDocuments/docs2013/d2161r5-ATAATAPI_Command_Set_-_3.pdf
+ *       - SMART Attribute Overview
+ *         http://www.t13.org/Documents/UploadedDocuments/docs2005/e05171r0-ACS-SMARTAttributes_Overview.pdf
+ * @note Used directly as the data-transfer buffer of the SMART READ DATA command, so its
+ *       packed size must stay 512 bytes (one sector).
+ */
+struct SMARTData
+{
+  // Offset 0..361 X Vendor specific
+  uint16_t smart_structure_version_;    //!< @brief SMART structure version
+  AttributeEntry attribute_entry_[30];  //!< @brief Attribute entry 1 - 30
+  // Offset 362 to 511
+  uint8_t off_line_data_collection_status_;      //!< @brief Off-line data collection status
+  uint8_t self_test_execution_status_byte_;      //!< @brief Self-test execution status byte
+  uint16_t vendor_specific0_;                    //!< @brief Vendor specific
+  uint8_t vendor_specific1_;                     //!< @brief Vendor specific
+  uint8_t off_line_data_collection_capability_;  //!< @brief Off-line data collection capability
+  uint16_t smart_capability_;                    //!< @brief SMART capability
+  uint8_t error_logging_capability_;             //!< @brief Error logging capability
+  uint8_t vendor_specific2_;                     //!< @brief Vendor specific
+  uint8_t short_self_test_polling_time_;     //!< @brief Short self-test polling time (in minutes)
+  uint8_t extended_self_test_polling_time_;  //!< @brief Extended self-test polling time in minutes
+  uint8_t
+    conveyance_self_test_polling_time_;  //!< @brief Conveyance self-test polling time in minutes
+  uint16_t                                  //!< @brief Extended self-test polling time
+    extended_self_test_polling_time_word_;  //!< in minutes (word)
+  uint8_t reserved_[9];              //!< @brief Reserved
+  uint8_t vendor_specific3_[125];    //!< @brief Vendor specific
+  uint8_t data_structure_checksum_;  //!< @brief Data structure checksum
+} __attribute__((packed));           // Minimize total struct memory 514 to 512
+
+/**
+ * @brief print usage (command-line help) to standard output
+ */
+void usage()
+{
+  printf("Usage: hdd_reader [options]\n");
+  printf(" -h --help : Display help\n");
+  printf(" -p --port # : Port number to listen to.\n");
+  printf("\n");
+}
+
+/**
+ * @brief exchanges the values
of 2 bytes + * @param [inout] ptr a pointer to ATA string + * @param [in] size size of ATA string + * @note Each pair of bytes in an ATA string is swapped. + * FIRMWARE REVISION field example + * Word Value + * 23 6162h ("ba") + * 24 6364h ("dc") + * 25 6566h ("fe") + * 26 6720h (" g") + * -> "abcdefg " + */ +void swap_char(char * ptr, size_t size) +{ + for (auto i = 0U; i < size; i += 2U) { + std::swap(ptr[i], ptr[i + 1]); + } +} + +/** + * @brief get IDENTIFY DEVICE for ATA drive + * @param [in] fd file descriptor to device + * @param [out] info a pointer to HDD information + * @return 0 on success, otherwise error + * @note For details please see the documents below. + * - ATA Command Pass-Through + * https://www.t10.org/ftp/t10/document.04/04-262r8.pdf + * - ATA Command Set - 4 (ACS-4) + * http://www.t13.org/Documents/UploadedDocuments/docs2016/di529r14-ATAATAPI_Command_Set_-_4.pdf + */ +int get_ata_identify(int fd, HDDInfo * info) +{ + sg_io_hdr_t hdr{}; + ATAPassThrough12 ata{}; + unsigned char data[512]{}; // 256 words + + // Create a command descriptor block(CDB) + ata.operation_code_ = 0xA1; // ATA PASS-THROUGH (12) command + ata.protocol_ = 0x4; // PIO Data-In + ata.t_dir_ = 0x1; // from the ATA device to the application client + ata.byt_blok_ = 0x1; // the number of blocks specified in the T_LENGTH field + ata.t_length_ = 0x2; // length is specified in the SECTOR_COUNT field + ata.sector_count_ = 0x01; // 1 sector + ata.command_ = 0xEC; // IDENTIFY DEVICE + + // Create a control structure + hdr.interface_id = 'S'; // This must be set to 'S' + hdr.dxfer_direction = SG_DXFER_FROM_DEV; // a SCSI READ command + hdr.cmd_len = sizeof(ata); // length in bytes of the SCSI command that 'cmdp' points to + hdr.cmdp = (unsigned char *)&ata; // SCSI command to be executed + hdr.dxfer_len = sizeof(data); // number of bytes to be moved in the data transfer + hdr.dxferp = data; // a pointer to user memory + hdr.timeout = 1000; // 1 second + + // send SCSI command to 
device + if (ioctl(fd, SG_IO, &hdr) < 0) { + return errno; + } + + // IDENTIFY DEVICE + // Word 10..19 Serial number + char serial_number[20 + 1]{}; + strncpy(serial_number, reinterpret_cast(data) + 20, 20); + swap_char(serial_number, 20); + info->serial_ = serial_number; + boost::trim(info->serial_); + + // Word 27..46 Model number + char model_number[40 + 1]{}; + strncpy(model_number, reinterpret_cast(data) + 54, 40); + swap_char(model_number, 40); + info->model_ = model_number; + boost::trim(info->model_); + + return EXIT_SUCCESS; +} + +/** + * @brief get SMART DATA for ATA drive + * @param [in] fd file descriptor to device + * @param [out] info a pointer to HDD information + * @return 0 on success, otherwise error + * @note For details please see the documents below. + * - ATA Command Pass-Through + * https://www.t10.org/ftp/t10/document.04/04-262r8.pdf + * - ATA/ATAPI Command Set - 3 (ACS-3) + * http://www.t13.org/Documents/UploadedDocuments/docs2013/d2161r5-ATAATAPI_Command_Set_-_3.pdf + * - SMART Attribute Overview + * http://www.t13.org/Documents/UploadedDocuments/docs2005/e05171r0-ACS-SMARTAttributes_Overview.pdf + * - SMART Attribute Annex + * http://www.t13.org/documents/uploadeddocuments/docs2005/e05148r0-acs-smartattributesannex.pdf + */ +int get_ata_SMARTData(int fd, HDDInfo * info) +{ + sg_io_hdr_t hdr{}; + ATAPassThrough12 ata{}; + SMARTData data{}; + + // Create a command descriptor block(CDB) + ata.operation_code_ = 0xA1; // ATA PASS-THROUGH (12) command + ata.protocol_ = 0x4; // PIO Data-In + ata.t_dir_ = 0x1; // from the ATA device to the application client + ata.byt_blok_ = 0x1; // the number of blocks specified in the T_LENGTH field + ata.t_length_ = 0x2; // length is specified in the SECTOR_COUNT field + ata.features_ = 0xD0; // SMART READ DATA + ata.sector_count_ = 0x01; // 1 sector + ata.lba_mid_ = 0x4F; // Fixed + ata.lbe_high_ = 0xC2; // Fixed + ata.command_ = 0xB0; // SMART READ DATA + + // Create a control structure + hdr.interface_id = 
'S'; // This must be set to 'S' + hdr.dxfer_direction = SG_DXFER_FROM_DEV; // a SCSI READ command + hdr.cmd_len = sizeof(ata); // length in bytes of the SCSI command that 'cmdp' points to + hdr.cmdp = (unsigned char *)&ata; // SCSI command to be executed + hdr.dxfer_len = sizeof(data); // number of bytes to be moved in the data transfer + hdr.dxferp = &data; // a pointer to user memory + hdr.timeout = 1000; // 1 second + + // send SCSI command to device + if (ioctl(fd, SG_IO, &hdr) < 0) { + return errno; + } + + // Retrieve C2h Enclosure Temperature + for (int i = 0; i < 30; ++i) { + if (data.attribute_entry_[i].attribute_id_ == 0xC2) { + info->temp_ = static_cast(data.attribute_entry_[i].data_); + return EXIT_SUCCESS; + } + } + + return ENOENT; +} + +/** + * @brief get Identify for NVMe drive + * @param [in] fd file descriptor to device + * @param [out] info a pointer to HDD information + * @return 0 on success, otherwise error + * @note For details please see the document below. + * - NVM Express 1.2b + * https://www.nvmexpress.org/wp-content/uploads/NVM_Express_1_2b_Gold_20160603.pdf + */ +int get_nvme_identify(int fd, HDDInfo * info) +{ + nvme_admin_cmd cmd{}; + char data[4096]{}; // Fixed size for Identify command + + // The Identify command returns a data buffer that describes information about the NVM subsystem + cmd.opcode = 0x06; // Identify + cmd.addr = (uint64_t)data; // memory address of data + cmd.data_len = sizeof(data); // length of data + cmd.cdw10 = 0x01; // Identify Controller data structure + + // send Admin Command to device + int ret = ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd); + if (ret < 0) { + return errno; + } + + // Identify Controller Data Structure + // Bytes 23:04 Serial Number (SN) + char serial_number[20 + 1]{}; + strncpy(serial_number, data + 4, 20); + info->serial_ = serial_number; + boost::trim(info->serial_); + + // Bytes 63:24 Model Number (MN) + char model_number[40 + 1]{}; + strncpy(model_number, data + 24, 40); + info->model_ = 
model_number; + boost::trim(info->model_); + + return EXIT_SUCCESS; +} + +/** + * @brief get SMART / Health Information for NVMe drive + * @param [in] fd file descriptor to device + * @param [inout] info a pointer to HDD information + * @return 0 on success, otherwise error + * @note For details please see the document below. + * - NVM Express 1.2b + * https://www.nvmexpress.org/wp-content/uploads/NVM_Express_1_2b_Gold_20160603.pdf + */ +int get_nvme_SMARTData(int fd, HDDInfo * info) +{ + nvme_admin_cmd cmd{}; + char data[4]{}; // 1 Dword (get byte 0 to 3) + + // The Get Log Page command returns a data buffer containing the log page requested + cmd.opcode = 0x02; // Get Log Page + cmd.nsid = 0xFFFFFFFF; // Global log page + cmd.addr = (uint64_t)data; // memory address of data + cmd.data_len = sizeof(data); // length of data + cmd.cdw10 = 0x00010002; // Bit 27:16 Number of Dwords (NUMD) = 001h (1 Dword) + // - Minimum necessary size to obtain a temperature + // Bit 07:00 = 02h (SMART / Health Information) + + // send Admin Command to device + int ret = ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd); + if (ret < 0) { + return errno; + } + + // Bytes 2:1 Composite Temperature + // Convert kelvin to celsius + unsigned int temperature = ((data[2] << 8) | data[1]) - 273; + info->temp_ = static_cast(temperature); + + return EXIT_SUCCESS; +} + +/** + * @brief check HDD temperature + * @param [in] port port to listen + */ +void run(int port) +{ + // Create a new socket + int sock = socket(AF_INET, SOCK_STREAM, 0); + if (sock < 0) { + syslog(LOG_ERR, "Failed to create a new socket. %s\n", strerror(errno)); + return; + } + + // Allow address reuse + int ret = 0; + int opt = 1; + ret = setsockopt( + sock, SOL_SOCKET, SO_REUSEADDR, reinterpret_cast(&opt), (socklen_t)sizeof(opt)); + if (ret < 0) { + syslog(LOG_ERR, "Failed to set socket FD's option. 
%s\n", strerror(errno)); + close(sock); + return; + } + + // Give the socket FD the local address ADDR + sockaddr_in addr{}; + addr.sin_family = AF_INET; + addr.sin_port = htons(port); + addr.sin_addr.s_addr = htonl(INADDR_ANY); + ret = bind(sock, (struct sockaddr *)&addr, sizeof(addr)); + if (ret < 0) { + syslog(LOG_ERR, "Failed to give the socket FD the local address ADDR. %s\n", strerror(errno)); + close(sock); + return; + } + + // Prepare to accept connections on socket FD + ret = listen(sock, 5); + if (ret < 0) { + syslog(LOG_ERR, "Failed to prepare to accept connections on socket FD. %s\n", strerror(errno)); + close(sock); + return; + } + + sockaddr_in client{}; + socklen_t len = sizeof(client); + + while (true) { + // Await a connection on socket FD + int new_sock = accept(sock, reinterpret_cast(&client), &len); + if (new_sock < 0) { + syslog( + LOG_ERR, "Failed to prepare to accept connections on socket FD. %s\n", strerror(errno)); + close(sock); + return; + } + + // Receive list of device from a socket + char buf[1024]{}; + ret = recv(new_sock, buf, sizeof(buf) - 1, 0); + if (ret < 0) { + syslog(LOG_ERR, "Failed to receive. %s\n", strerror(errno)); + close(new_sock); + close(sock); + return; + } + // No data received + if (ret == 0) { + syslog(LOG_ERR, "No data received. %s\n", strerror(errno)); + close(new_sock); + close(sock); + return; + } + + // Restore list of devices + std::vector hdd_devices; + + try { + buf[sizeof(buf) - 1] = '\0'; + std::istringstream iss(buf); + boost::archive::text_iarchive oa(iss); + oa & hdd_devices; + } catch (const std::exception & e) { + syslog(LOG_ERR, "exception. 
%s\n", e.what()); + close(new_sock); + close(sock); + return; + } + + HDDInfoList list; + std::ostringstream oss; + boost::archive::text_oarchive oa(oss); + + for (auto & hdd_device : hdd_devices) { + HDDInfo info{}; + + // Open a file + int fd = open(hdd_device.c_str(), O_RDONLY); + if (fd < 0) { + info.error_code_ = errno; + syslog(LOG_ERR, "Failed to open a file. %s\n", strerror(info.error_code_)); + continue; + } + + // AHCI device + if (boost::starts_with(hdd_device.c_str(), "/dev/sd")) { + // Get IDENTIFY DEVICE for ATA drive + info.error_code_ = get_ata_identify(fd, &info); + if (info.error_code_ != 0) { + syslog( + LOG_ERR, "Failed to get IDENTIFY DEVICE for ATA drive. %s\n", + strerror(info.error_code_)); + close(fd); + continue; + } + // Get SMART DATA for ATA drive + info.error_code_ = get_ata_SMARTData(fd, &info); + if (info.error_code_ != 0) { + syslog( + LOG_ERR, "Failed to get SMART LOG for ATA drive. %s\n", strerror(info.error_code_)); + close(fd); + continue; + } + } else if (boost::starts_with(hdd_device.c_str(), "/dev/nvme")) { // NVMe device + // Get Identify for NVMe drive + info.error_code_ = get_nvme_identify(fd, &info); + if (info.error_code_ != 0) { + syslog( + LOG_ERR, "Failed to get Identify for NVMe drive. %s\n", strerror(info.error_code_)); + close(fd); + continue; + } + // Get SMART / Health Information for NVMe drive + info.error_code_ = get_nvme_SMARTData(fd, &info); + if (info.error_code_ != 0) { + syslog( + LOG_ERR, "Failed to get SMART / Health Information for NVMe drive. %s\n", + strerror(info.error_code_)); + close(fd); + continue; + } + } + + // Close the file descriptor FD + info.error_code_ = close(fd); + if (info.error_code_ < 0) { + info.error_code_ = errno; + syslog(LOG_ERR, "Failed to close the file descriptor FD. 
%s\n", strerror(info.error_code_)); + } + + list[hdd_device] = info; + } + + oa << list; + // Write N bytes of BUF to FD + ret = write(new_sock, oss.str().c_str(), oss.str().length()); + if (ret < 0) { + syslog(LOG_ERR, "Failed to write N bytes of BUF to FD. %s\n", strerror(errno)); + } + + // Close the file descriptor FD + ret = close(new_sock); + if (ret < 0) { + syslog(LOG_ERR, "Failed to close the file descriptor FD. %s\n", strerror(errno)); + } + } + + close(sock); +} + +int main(int argc, char ** argv) +{ + static struct option long_options[] = { + {"help", no_argument, nullptr, 'h'}, + {"port", required_argument, nullptr, 'p'}, + {nullptr, 0, nullptr, 0}}; + + // Parse command-line options + int c = 0; + int option_index = 0; + int port = PORT; + while ((c = getopt_long(argc, argv, "hp:", long_options, &option_index)) != -1) { + switch (c) { + case 'h': + usage(); + return EXIT_SUCCESS; + + case 'p': + try { + port = boost::lexical_cast(optarg); + } catch (const boost::bad_lexical_cast & e) { + printf("Error: %s\n", e.what()); + return EXIT_FAILURE; + } + break; + + default: + break; + } + } + + // Put the program in the background + if (daemon(0, 0) < 0) { + printf("Failed to put the program in the background. %s\n", strerror(errno)); + return errno; + } + + // Open connection to system logger + openlog(nullptr, LOG_PID, LOG_DAEMON); + + run(port); + + // Close descriptor used to write to system logger + closelog(); + + return EXIT_SUCCESS; +} diff --git a/system/system_monitor/reader/msr_reader/msr_reader.cpp b/system/system_monitor/reader/msr_reader/msr_reader.cpp new file mode 100644 index 0000000000000..17bbfd20a7667 --- /dev/null +++ b/system/system_monitor/reader/msr_reader/msr_reader.cpp @@ -0,0 +1,285 @@ +// Copyright 2020 Autoware Foundation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** + * @file msr_reader.cpp + * @brief MSR read class + */ + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace fs = boost::filesystem; + +// 7634-7647 Unassigned +constexpr int PORT = 7634; + +/** + * @brief Package Thermal Status Information + * For details please see the documents below. + * - Intel® 64 and IA-32 ArchitecturesSoftware Developer’s Manual + * https://software.intel.com/sites/default/files/managed/39/c5/325462-sdm-vol-1-2abcd-3abcd.pdf + */ +typedef struct +{ + uint64_t pkg_thermal_status_ : 1; //!< @brief 0 Pkg Thermal Status (RO) + uint64_t pkg_thermal_status_log_ : 1; //!< @brief 1 Pkg Thermal Status Log (R/W) + uint64_t pkg_prochot_event_ : 1; //!< @brief 2 Pkg PROCHOT # event (RO) + uint64_t pkg_prochot_log_ : 1; //!< @brief 3 Pkg PROCHOT # log (R/WC0) + uint64_t pkg_critical_temperature_status_ : 1; //!< @brief 4 Pkg Critical Temperature Status (RO) + uint64_t //!< @brief 5 Pkg Critical Temperature + pkg_critical_temperature_status_log_ : 1; //!< Status Log (R/WC0) + uint64_t pkg_thermal_threshold_1_status_ : 1; //!< @brief 6 Pkg Thermal Threshold #1 Status (RO) + uint64_t pkg_thermal_threshold_1_log_ : 1; //!< @brief 7 Pkg Thermal Threshold #1 log (R/WC0) + uint64_t pkg_thermal_threshold_2_status_ : 1; //!< @brief 8 Pkg Thermal Threshold #2 Status (RO) + uint64_t pkg_thermal_threshold_2_log_ : 1; //!< @brief 9 Pkg Thermal Threshold #2 log (R/WC0) + 
uint64_t pkg_power_limitation_status_ : 1; //!< @brief 10 Pkg Power Limitation Status (RO) + uint64_t pkg_power_limitation_log_ : 1; //!< @brief 11 Pkg Power Limitation log (R/WC0) + uint64_t reserved1_ : 4; //!< @brief 15:12 Reserved + uint64_t pkg_digital_readout_ : 7; //!< @brief 22:16 Pkg Digital Readout (RO) + uint64_t reserved2_ : 41; //!< @brief 63:23 Reserved +} PackageThermalStatus; + +/** + * @brief print usage + */ +void usage() +{ + printf("Usage: msr_reader [options]\n"); + printf(" -h --help : Display help\n"); + printf(" -p --port # : Port number to listen to.\n"); + printf("\n"); +} + +/** + * @brief check CPU thermal throttling + * @param [in] port port to listen + * @param [in] list list of path to msr + */ +void run(int port, const std::vector & list) +{ + // Create a new socket + int sock = socket(AF_INET, SOCK_STREAM, 0); + if (sock < 0) { + syslog(LOG_ERR, "Failed to create a new socket. %s\n", strerror(errno)); + return; + } + + // Allow address reuse + int ret = 0; + int opt = 1; + ret = setsockopt( + sock, SOL_SOCKET, SO_REUSEADDR, reinterpret_cast(&opt), (socklen_t)sizeof(opt)); + if (ret < 0) { + syslog(LOG_ERR, "Failed to set socket FD's option. %s\n", strerror(errno)); + close(sock); + return; + } + + // Give the socket FD the local address ADDR + sockaddr_in addr; + memset(&addr, 0, sizeof(sockaddr_in)); + addr.sin_family = AF_INET; + addr.sin_port = htons(port); + addr.sin_addr.s_addr = htonl(INADDR_ANY); + ret = bind(sock, (struct sockaddr *)&addr, sizeof(addr)); + if (ret < 0) { + syslog(LOG_ERR, "Failed to give the socket FD the local address ADDR. %s\n", strerror(errno)); + close(sock); + return; + } + + // Prepare to accept connections on socket FD + ret = listen(sock, 5); + if (ret < 0) { + syslog(LOG_ERR, "Failed to prepare to accept connections on socket FD. 
%s\n", strerror(errno)); + close(sock); + return; + } + + sockaddr_in client; + socklen_t len = sizeof(client); + + while (true) { + // Await a connection on socket FD + int new_sock = accept(sock, reinterpret_cast(&client), &len); + if (new_sock < 0) { + syslog( + LOG_ERR, "Failed to prepare to accept connections on socket FD. %s\n", strerror(errno)); + close(sock); + return; + } + + ret = 0; + std::ostringstream oss; + boost::archive::text_oarchive oa(oss); + MSRInfo msr{0, {}}; + + for (auto itr = list.begin(); itr != list.end(); ++itr) { + // Open a file + int fd = open(itr->c_str(), O_RDONLY); + if (fd < 0) { + msr.error_code_ = errno; + syslog(LOG_ERR, "Failed to open a file. %s\n", strerror(msr.error_code_)); + break; + } + + // Read from a file descriptor + PackageThermalStatus val; + ret = pread(fd, &val, sizeof(uint64_t), 0x1b1); + if (ret < 0) { + msr.error_code_ = errno; + syslog(LOG_ERR, "Failed to read from a file descriptor. %s\n", strerror(msr.error_code_)); + close(fd); + break; + } + + // Close the file descriptor FD + ret = close(fd); + if (ret < 0) { + msr.error_code_ = errno; + syslog(LOG_ERR, "Failed to close the file descriptor FD. %s\n", strerror(msr.error_code_)); + break; + } + + msr.pkg_thermal_status_.push_back(val.pkg_thermal_status_); + } + + oa << msr; + // Write N bytes of BUF to FD + ret = write(new_sock, oss.str().c_str(), oss.str().length()); + if (ret < 0) { + syslog(LOG_ERR, "Failed to write N bytes of BUF to FD. %s\n", strerror(errno)); + } + + // Close the file descriptor FD + ret = close(new_sock); + if (ret < 0) { + syslog(LOG_ERR, "Failed to close the file descriptor FD. 
%s\n", strerror(errno)); + } + } + + close(sock); +} + +int main(int argc, char ** argv) +{ + static struct option long_options[] = { + {"help", no_argument, 0, 'h'}, {"port", required_argument, 0, 'p'}, {0, 0, 0, 0}}; + + // Parse command-line options + int c = 0; + int option_index = 0; + int port = PORT; + while ((c = getopt_long(argc, argv, "hp:", long_options, &option_index)) != -1) { + switch (c) { + case 'h': + usage(); + return EXIT_SUCCESS; + + case 'p': + try { + port = boost::lexical_cast(optarg); + } catch (const boost::bad_lexical_cast & e) { + printf("Error: %s\n", e.what()); + return EXIT_FAILURE; + } + break; + + default: + break; + } + } + + if (!fs::exists("/dev/cpu")) { + printf("Failed to access /dev/cpu.\n"); + return EXIT_FAILURE; + } + + std::vector list; + const fs::path root("/dev/cpu"); + + for (const fs::path & path : boost::make_iterator_range( + fs::recursive_directory_iterator(root), fs::recursive_directory_iterator())) { + if (fs::is_directory(path)) { + continue; + } + + std::cmatch match; + const char * msr = path.generic_string().c_str(); + + // /dev/cpu/[0-9]/msr ? + if (!std::regex_match(msr, match, std::regex(".*msr"))) { + continue; + } + + list.push_back(path.generic_string()); + } + + std::sort(list.begin(), list.end(), [](const std::string & c1, const std::string & c2) { + std::cmatch match; + const std::regex filter(".*/(\\d+)/msr"); + int n1 = 0; + int n2 = 0; + if (std::regex_match(c1.c_str(), match, filter)) { + n1 = std::stoi(match[1].str()); + } + if (std::regex_match(c2.c_str(), match, filter)) { + n2 = std::stoi(match[1].str()); + } + return n1 < n2; + }); // NOLINT + + if (list.empty()) { + printf("No msr found in /dev/cpu.\n"); + return EXIT_FAILURE; + } + + // Put the program in the background + if (daemon(0, 0) < 0) { + printf("Failed to put the program in the background. 
%s\n", strerror(errno)); + return errno; + } + + // Open connection to system logger + openlog(nullptr, LOG_PID, LOG_DAEMON); + + run(port, list); + + // Close descriptor used to write to system logger + closelog(); + + return EXIT_SUCCESS; +} diff --git a/system/system_monitor/src/cpu_monitor/arm_cpu_monitor.cpp b/system/system_monitor/src/cpu_monitor/arm_cpu_monitor.cpp new file mode 100644 index 0000000000000..4b7fe6c03e32d --- /dev/null +++ b/system/system_monitor/src/cpu_monitor/arm_cpu_monitor.cpp @@ -0,0 +1,48 @@ +// Copyright 2020 Autoware Foundation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +/** + * @file arm_cpu_monitor.cpp + * @brief ARM CPU monitor class + */ + +#include "system_monitor/cpu_monitor/arm_cpu_monitor.hpp" + +#include "system_monitor/system_monitor_utility.hpp" + +#include +#include + +CPUMonitor::CPUMonitor(const rclcpp::NodeOptions & options) : CPUMonitorBase("cpu_monitor", options) +{ +} + +void CPUMonitor::checkThrottling(diagnostic_updater::DiagnosticStatusWrapper & stat) +{ + // TODO(Fumihito Ito): implement me +} + +void CPUMonitor::getTempNames() +{ + // Jetson TX1 TX2 Nano: thermal_zone1, Xavier: thermal_zone0 + std::vector therms; + SystemMonitorUtility::getThermalZone("CPU-therm", &therms); + + for (auto itr = therms.begin(); itr != therms.end(); ++itr) { + temps_.emplace_back(itr->label_, itr->path_); + } +} + +#include +RCLCPP_COMPONENTS_REGISTER_NODE(CPUMonitor) diff --git a/system/system_monitor/src/cpu_monitor/cpu_monitor_base.cpp b/system/system_monitor/src/cpu_monitor/cpu_monitor_base.cpp new file mode 100644 index 0000000000000..4f30892125955 --- /dev/null +++ b/system/system_monitor/src/cpu_monitor/cpu_monitor_base.cpp @@ -0,0 +1,333 @@ +// Copyright 2020 Autoware Foundation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +/** + * @file cpu_monitor_base.cpp + * @brief CPU monitor base class + */ + +#include "system_monitor/cpu_monitor/cpu_monitor_base.hpp" + +#include "system_monitor/system_monitor_utility.hpp" + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +namespace bp = boost::process; +namespace fs = boost::filesystem; +namespace pt = boost::property_tree; + +CPUMonitorBase::CPUMonitorBase(const std::string & node_name, const rclcpp::NodeOptions & options) +: Node(node_name, options), + updater_(this), + hostname_(), + num_cores_(0), + temps_(), + freqs_(), + mpstat_exists_(false), + temp_warn_(declare_parameter("temp_warn", 90.0)), + temp_error_(declare_parameter("temp_error", 95.0)), + usage_warn_(declare_parameter("usage_warn", 0.90)), + usage_error_(declare_parameter("usage_error", 1.00)), + usage_avg_(declare_parameter("usage_avg", true)) +{ + gethostname(hostname_, sizeof(hostname_)); + num_cores_ = boost::thread::hardware_concurrency(); + + // Check if command exists + fs::path p = bp::search_path("mpstat"); + mpstat_exists_ = (p.empty()) ? 
false : true; + + updater_.setHardwareID(hostname_); + updater_.add("CPU Temperature", this, &CPUMonitorBase::checkTemp); + updater_.add("CPU Usage", this, &CPUMonitorBase::checkUsage); + updater_.add("CPU Load Average", this, &CPUMonitorBase::checkLoad); + updater_.add("CPU Thermal Throttling", this, &CPUMonitorBase::checkThrottling); + updater_.add("CPU Frequency", this, &CPUMonitorBase::checkFrequency); +} + +void CPUMonitorBase::update() { updater_.force_update(); } + +void CPUMonitorBase::checkTemp(diagnostic_updater::DiagnosticStatusWrapper & stat) +{ + // Remember start time to measure elapsed time + const auto t_start = SystemMonitorUtility::startMeasurement(); + + if (temps_.empty()) { + stat.summary(DiagStatus::ERROR, "temperature files not found"); + return; + } + + int level = DiagStatus::OK; + std::string error_str = ""; + + for (auto itr = temps_.begin(); itr != temps_.end(); ++itr) { + // Read temperature file + const fs::path path(itr->path_); + fs::ifstream ifs(path, std::ios::in); + if (!ifs) { + stat.add("file open error", itr->path_); + error_str = "file open error"; + continue; + } + + float temp; + ifs >> temp; + ifs.close(); + temp /= 1000; + stat.addf(itr->label_, "%.1f DegC", temp); + + if (temp >= temp_error_) { + level = std::max(level, static_cast(DiagStatus::ERROR)); + } else if (temp >= temp_warn_) { + level = std::max(level, static_cast(DiagStatus::WARN)); + } + } + + if (!error_str.empty()) { + stat.summary(DiagStatus::ERROR, error_str); + } else { + stat.summary(level, temp_dict_.at(level)); + } + + // Measure elapsed time since start time and report + SystemMonitorUtility::stopMeasurement(t_start, stat); +} + +void CPUMonitorBase::checkUsage(diagnostic_updater::DiagnosticStatusWrapper & stat) +{ + // Remember start time to measure elapsed time + const auto t_start = SystemMonitorUtility::startMeasurement(); + + if (!mpstat_exists_) { + stat.summary(DiagStatus::ERROR, "mpstat error"); + stat.add( + "mpstat", "Command 'mpstat' not 
found, but can be installed with: sudo apt install sysstat"); + return; + } + + // Get CPU Usage + bp::ipstream is_out; + bp::ipstream is_err; + bp::child c("mpstat -P ALL 1 1 -o JSON", bp::std_out > is_out, bp::std_err > is_err); + c.wait(); + + if (c.exit_code() != 0) { + std::ostringstream os; + is_err >> os.rdbuf(); + stat.summary(DiagStatus::ERROR, "mpstat error"); + stat.add("mpstat", os.str().c_str()); + return; + } + + std::string cpu_name; + float usr{0.0}; + float nice{0.0}; + float sys{0.0}; + float idle{0.0}; + float usage{0.0}; + float total{0.0}; + int level = DiagStatus::OK; + int whole_level = DiagStatus::OK; + + pt::ptree pt; + try { + // Analyze JSON output + read_json(is_out, pt); + + for (const pt::ptree::value_type & child1 : pt.get_child("sysstat.hosts")) { + const pt::ptree & hosts = child1.second; + + for (const pt::ptree::value_type & child2 : hosts.get_child("statistics")) { + const pt::ptree & statistics = child2.second; + + for (const pt::ptree::value_type & child3 : statistics.get_child("cpu-load")) { + const pt::ptree & cpu_load = child3.second; + + if (boost::optional v = cpu_load.get_optional("cpu")) { + cpu_name = v.get(); + } + if (boost::optional v = cpu_load.get_optional("usr")) { + usr = v.get(); + } + if (boost::optional v = cpu_load.get_optional("nice")) { + nice = v.get(); + } + if (boost::optional v = cpu_load.get_optional("sys")) { + sys = v.get(); + } + if (boost::optional v = cpu_load.get_optional("idle")) { + idle = v.get(); + } + + total = usr + nice + sys; + usage = total * 1e-2; + + level = DiagStatus::OK; + if (usage >= usage_error_) { + level = DiagStatus::ERROR; + } else if (usage >= usage_warn_) { + level = DiagStatus::WARN; + } + + stat.add(fmt::format("CPU {}: status", cpu_name), load_dict_.at(level)); + stat.addf(fmt::format("CPU {}: total", cpu_name), "%.2f%%", total); + stat.addf(fmt::format("CPU {}: usr", cpu_name), "%.2f%%", usr); + stat.addf(fmt::format("CPU {}: nice", cpu_name), "%.2f%%", nice); + 
stat.addf(fmt::format("CPU {}: sys", cpu_name), "%.2f%%", sys); + stat.addf(fmt::format("CPU {}: idle", cpu_name), "%.2f%%", idle); + + if (usage_avg_ == true) { + if (cpu_name == "all") { + whole_level = level; + } + } else { + whole_level = std::max(whole_level, level); + } + } + } + } + } catch (const std::exception & e) { + stat.summary(DiagStatus::ERROR, "mpstat exception"); + stat.add("mpstat", e.what()); + return; + } + + stat.summary(whole_level, load_dict_.at(whole_level)); + + // Measure elapsed time since start time and report + SystemMonitorUtility::stopMeasurement(t_start, stat); +} + +void CPUMonitorBase::checkLoad(diagnostic_updater::DiagnosticStatusWrapper & stat) +{ + // Remember start time to measure elapsed time + const auto t_start = SystemMonitorUtility::startMeasurement(); + + double loadavg[3]; + + std::ifstream ifs("/proc/loadavg", std::ios::in); + + if (!ifs) { + stat.summary(DiagStatus::ERROR, "uptime error"); + stat.add("uptime", strerror(errno)); + return; + } + + std::string line; + + if (!std::getline(ifs, line)) { + stat.summary(DiagStatus::ERROR, "uptime error"); + stat.add("uptime", "format error"); + return; + } + + if (sscanf(line.c_str(), "%lf %lf %lf", &loadavg[0], &loadavg[1], &loadavg[2]) != 3) { + stat.summary(DiagStatus::ERROR, "uptime error"); + stat.add("uptime", "format error"); + return; + } + + loadavg[0] /= num_cores_; + loadavg[1] /= num_cores_; + loadavg[2] /= num_cores_; + + stat.summary(DiagStatus::OK, "OK"); + stat.addf("1min", "%.2f%%", loadavg[0] * 1e2); + stat.addf("5min", "%.2f%%", loadavg[1] * 1e2); + stat.addf("15min", "%.2f%%", loadavg[2] * 1e2); + + // Measure elapsed time since start time and report + SystemMonitorUtility::stopMeasurement(t_start, stat); +} + +void CPUMonitorBase::checkThrottling( + [[maybe_unused]] diagnostic_updater::DiagnosticStatusWrapper & stat) +{ + RCLCPP_INFO(this->get_logger(), "CPUMonitorBase::checkThrottling not implemented."); +} + +void 
CPUMonitorBase::checkFrequency(diagnostic_updater::DiagnosticStatusWrapper & stat) +{ + // Remember start time to measure elapsed time + const auto t_start = SystemMonitorUtility::startMeasurement(); + + if (freqs_.empty()) { + stat.summary(DiagStatus::ERROR, "frequency files not found"); + return; + } + + for (auto itr = freqs_.begin(); itr != freqs_.end(); ++itr) { + // Read scaling_cur_freq file + const fs::path path(itr->path_); + fs::ifstream ifs(path, std::ios::in); + if (ifs) { + std::string line; + if (std::getline(ifs, line)) { + stat.addf(fmt::format("CPU {}: clock", itr->index_), "%d MHz", std::stoi(line) / 1000); + } + } + ifs.close(); + } + + stat.summary(DiagStatus::OK, "OK"); + + // Measure elapsed time since start time and report + SystemMonitorUtility::stopMeasurement(t_start, stat); +} + +void CPUMonitorBase::getTempNames() +{ + RCLCPP_INFO(this->get_logger(), "CPUMonitorBase::getTempNames not implemented."); +} + +void CPUMonitorBase::getFreqNames() +{ + const fs::path root("/sys/devices/system/cpu"); + + for (const fs::path & path : + boost::make_iterator_range(fs::directory_iterator(root), fs::directory_iterator())) { + if (!fs::is_directory(path)) { + continue; + } + + std::cmatch match; + const char * cpu_dir = path.generic_string().c_str(); + + // /sys/devices/system/cpu[0-9] ? 
+ if (!std::regex_match(cpu_dir, match, std::regex(".*cpu(\\d+)"))) { + continue; + } + + // /sys/devices/system/cpu[0-9]/cpufreq/scaling_cur_freq + cpu_freq_info freq; + const fs::path freq_path = path / "cpufreq/scaling_cur_freq"; + freq.index_ = std::stoi(match[1].str()); + freq.path_ = freq_path.generic_string(); + freqs_.push_back(freq); + } + + std::sort(freqs_.begin(), freqs_.end(), [](const cpu_freq_info & c1, const cpu_freq_info & c2) { + return c1.index_ < c2.index_; + }); // NOLINT +} diff --git a/system/system_monitor/src/cpu_monitor/intel_cpu_monitor.cpp b/system/system_monitor/src/cpu_monitor/intel_cpu_monitor.cpp new file mode 100644 index 0000000000000..3cc7b5f5c629c --- /dev/null +++ b/system/system_monitor/src/cpu_monitor/intel_cpu_monitor.cpp @@ -0,0 +1,211 @@ +// Copyright 2020 Autoware Foundation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +/** + * @file _cpu_monitor.cpp + * @brief CPU monitor class + */ + +#include "system_monitor/cpu_monitor/intel_cpu_monitor.hpp" + +#include "system_monitor/system_monitor_utility.hpp" + +#include + +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +namespace fs = boost::filesystem; + +CPUMonitor::CPUMonitor(const rclcpp::NodeOptions & options) : CPUMonitorBase("cpu_monitor", options) +{ + msr_reader_port_ = declare_parameter("msr_reader_port", 7634); + + this->getTempNames(); + this->getFreqNames(); +} + +void CPUMonitor::checkThrottling(diagnostic_updater::DiagnosticStatusWrapper & stat) +{ + // Remember start time to measure elapsed time + const auto t_start = SystemMonitorUtility::startMeasurement(); + + // Create a new socket + int sock = socket(AF_INET, SOCK_STREAM, 0); + if (sock < 0) { + stat.summary(DiagStatus::ERROR, "socket error"); + stat.add("socket", strerror(errno)); + return; + } + + // Specify the receiving timeouts until reporting an error + struct timeval tv; + tv.tv_sec = 10; + tv.tv_usec = 0; + int ret = setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)); + if (ret < 0) { + stat.summary(DiagStatus::ERROR, "setsockopt error"); + stat.add("setsockopt", strerror(errno)); + close(sock); + return; + } + + // Connect the socket referred to by the file descriptor + sockaddr_in addr; + memset(&addr, 0, sizeof(sockaddr_in)); + addr.sin_family = AF_INET; + addr.sin_port = htons(msr_reader_port_); + addr.sin_addr.s_addr = htonl(INADDR_ANY); + ret = connect(sock, (struct sockaddr *)&addr, sizeof(addr)); + if (ret < 0) { + stat.summary(DiagStatus::ERROR, "connect error"); + stat.add("connect", strerror(errno)); + close(sock); + return; + } + + // Receive messages from a socket + char buf[1024] = ""; + ret = recv(sock, buf, sizeof(buf) - 1, 0); + if (ret < 0) { + stat.summary(DiagStatus::ERROR, "recv error"); + stat.add("recv", strerror(errno)); + close(sock); + return; + } + // No data 
received + if (ret == 0) { + stat.summary(DiagStatus::ERROR, "recv error"); + stat.add("recv", "No data received"); + close(sock); + return; + } + + // Close the file descriptor FD + ret = close(sock); + if (ret < 0) { + stat.summary(DiagStatus::ERROR, "close error"); + stat.add("close", strerror(errno)); + return; + } + + // Restore MSR information + MSRInfo info; + + try { + std::istringstream iss(buf); + boost::archive::text_iarchive oa(iss); + oa >> info; + } catch (const std::exception & e) { + stat.summary(DiagStatus::ERROR, "recv error"); + stat.add("recv", e.what()); + return; + } + + // msr_reader returns an error + if (info.error_code_ != 0) { + stat.summary(DiagStatus::ERROR, "msr_reader error"); + stat.add("msr_reader", strerror(info.error_code_)); + return; + } + + int level = DiagStatus::OK; + int whole_level = DiagStatus::OK; + int index = 0; + + for (auto itr = info.pkg_thermal_status_.begin(); itr != info.pkg_thermal_status_.end(); + ++itr, ++index) { + if (*itr) { + level = DiagStatus::ERROR; + } else { + level = DiagStatus::OK; + } + + stat.add(fmt::format("CPU {}: Pkg Thermal Status", index), thermal_dict_.at(level)); + + whole_level = std::max(whole_level, level); + } + + stat.summary(whole_level, thermal_dict_.at(whole_level)); + + // Measure elapsed time since start time and report + SystemMonitorUtility::stopMeasurement(t_start, stat); +} + +void CPUMonitor::getTempNames() +{ + const fs::path root("/sys/devices/platform/coretemp.0"); + + if (!fs::exists(root)) { + return; + } + + for (const fs::path & path : boost::make_iterator_range( + fs::recursive_directory_iterator(root), fs::recursive_directory_iterator())) { + if (fs::is_directory(path)) { + continue; + } + + std::cmatch match; + const std::string temp_input = path.generic_string(); + + // /sys/devices/platform/coretemp.0/hwmon/hwmon[0-9]/temp[0-9]_input ? 
+ if (!std::regex_match(temp_input.c_str(), match, std::regex(".*temp(\\d+)_input"))) { + continue; + } + + cpu_temp_info temp; + temp.path_ = temp_input; + temp.label_ = path.filename().generic_string(); + + std::string label = boost::algorithm::replace_all_copy(temp_input, "input", "label"); + const fs::path label_path(label); + fs::ifstream ifs(label_path, std::ios::in); + if (ifs) { + std::string line; + if (std::getline(ifs, line)) { + temp.label_ = line; + } + } + ifs.close(); + temps_.push_back(temp); + } + + std::sort(temps_.begin(), temps_.end(), [](const cpu_temp_info & c1, const cpu_temp_info & c2) { + std::smatch match; + const std::regex filter(".*temp(\\d+)_input"); + int n1 = 0; + int n2 = 0; + if (std::regex_match(c1.path_, match, filter)) { + n1 = std::stoi(match[1].str()); + } + if (std::regex_match(c2.path_, match, filter)) { + n2 = std::stoi(match[1].str()); + } + return n1 < n2; + }); // NOLINT +} + +#include +RCLCPP_COMPONENTS_REGISTER_NODE(CPUMonitor) diff --git a/system/system_monitor/src/cpu_monitor/raspi_cpu_monitor.cpp b/system/system_monitor/src/cpu_monitor/raspi_cpu_monitor.cpp new file mode 100644 index 0000000000000..995bfddff0bfe --- /dev/null +++ b/system/system_monitor/src/cpu_monitor/raspi_cpu_monitor.cpp @@ -0,0 +1,85 @@ +// Copyright 2020 Autoware Foundation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +/** + * @file raspi_cpu_monitor.cpp + * @brief Raspberry Pi CPU monitor class + */ + +#include "system_monitor/cpu_monitor/raspi_cpu_monitor.hpp" + +#include "system_monitor/system_monitor_utility.hpp" + +#include +#include + +#include +#include + +namespace fs = boost::filesystem; + +CPUMonitor::CPUMonitor(const std::string & node_name, const rclcpp::NodeOptions & options) +: CPUMonitorBase("cpu_monitor", options) +{ +} + +void CPUMonitor::checkThrottling(diagnostic_updater::DiagnosticStatusWrapper & stat) +{ + int level = DiagStatus::OK; + std::vector status; + + const fs::path path("/sys/devices/platform/soc/soc:firmware/get_throttled"); + fs::ifstream ifs(path, std::ios::in); + if (!ifs) { + stat.summary(DiagStatus::ERROR, "file open error"); + stat.add("get_throttled", "file open error"); + return; + } + + int throttled; + ifs >> std::hex >> throttled; + ifs.close(); + + // Consider only thermal throttling as an error + if ((throttled & raspiThermalThrottlingMask) == raspiThermalThrottlingMask) { + level = DiagStatus::ERROR; + } + + while (throttled) { + int flag = throttled & ((~throttled) + 1); + throttled ^= flag; + status.push_back(throttledToString(flag)); + } + if (status.empty()) { + status.emplace_back("All clear"); + } + + stat.add("status", boost::algorithm::join(status, ", ")); + + stat.summary(level, thermal_dict_.at(level)); +} + +void CPUMonitor::getTempNames() +{ + // thermal_zone0 + std::vector therms; + SystemMonitorUtility::getThermalZone("cpu-thermal", &therms); + + for (auto itr = therms.begin(); itr != therms.end(); ++itr) { + temps_.emplace_back(itr->label_, itr->path_); + } +} + +#include +RCLCPP_COMPONENTS_REGISTER_NODE(CPUMonitor) diff --git a/system/system_monitor/src/cpu_monitor/tegra_cpu_monitor.cpp b/system/system_monitor/src/cpu_monitor/tegra_cpu_monitor.cpp new file mode 100644 index 0000000000000..19e9f362c1fbb --- /dev/null +++ b/system/system_monitor/src/cpu_monitor/tegra_cpu_monitor.cpp @@ -0,0 +1,48 @@ +// Copyright 2020 
Autoware Foundation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** + * @file tegra_cpu_monitor.cpp + * @brief TEGRA PU monitor class + */ + +#include "system_monitor/cpu_monitor/tegra_cpu_monitor.hpp" + +#include "system_monitor/system_monitor_utility.hpp" + +#include +#include + +CPUMonitor::CPUMonitor(const rclcpp::NodeOptions & options) : CPUMonitorBase("cpu_monitor", options) +{ + // There is no event record for thermal throttling. + // Need to manually monitor temperature to figure out if thermal limits crossed or not. 
+ updater_.removeByName("CPU Thermal Throttling"); +} + +void CPUMonitor::checkThrottling(diagnostic_updater::DiagnosticStatusWrapper & stat) {} + +void CPUMonitor::getTempNames() +{ + // Jetson TX1 TX2 Nano: thermal_zone1, Xavier: thermal_zone0 + std::vector therms; + SystemMonitorUtility::getThermalZone("CPU-therm", &therms); + + for (auto itr = therms.begin(); itr != therms.end(); ++itr) { + temps_.emplace_back(itr->label_, itr->path_); + } +} + +#include +RCLCPP_COMPONENTS_REGISTER_NODE(CPUMonitor) diff --git a/system/system_monitor/src/cpu_monitor/unknown_cpu_monitor.cpp b/system/system_monitor/src/cpu_monitor/unknown_cpu_monitor.cpp new file mode 100644 index 0000000000000..79b115e4f7af5 --- /dev/null +++ b/system/system_monitor/src/cpu_monitor/unknown_cpu_monitor.cpp @@ -0,0 +1,29 @@ +// Copyright 2020 Autoware Foundation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+/**
+ * @file unknown_cpu_monitor.cpp
+ * @brief Unknown CPU monitor class
+ *
+ * CPUMonitor variant for platforms the build does not recognise (the package's
+ * CMakeLists.txt selects "unknown" when the processor is neither x86_64 nor
+ * aarch64/arm). The constructor only forwards to CPUMonitorBase with the fixed
+ * node name "cpu_monitor"; no check is overridden in this file, so the base
+ * class implementations are used.
+ */
+
+#include "system_monitor/cpu_monitor/unknown_cpu_monitor.hpp"
+
+#include
+
+CPUMonitor::CPUMonitor(const rclcpp::NodeOptions & options) : CPUMonitorBase("cpu_monitor", options)
+{
+}
+
+#include
+RCLCPP_COMPONENTS_REGISTER_NODE(CPUMonitor)
diff --git a/system/system_monitor/src/gpu_monitor/gpu_monitor_base.cpp b/system/system_monitor/src/gpu_monitor/gpu_monitor_base.cpp
new file mode 100644
index 0000000000000..d21de67f643ce
--- /dev/null
+++ b/system/system_monitor/src/gpu_monitor/gpu_monitor_base.cpp
@@ -0,0 +1,76 @@
+// Copyright 2020 Autoware Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+ * @file gpu_monitor_base.cpp
+ * @brief GPU monitor base class
+ *
+ * The constructor declares the warn/error threshold parameters, stores the
+ * host name as the updater's hardware ID and registers five diagnostic tasks
+ * (temperature, usage, memory usage, thermal throttling, frequency). Every
+ * check defined in this file is a placeholder that logs "not implemented"
+ * once and leaves the status untouched.
+ */
+
+#include "system_monitor/gpu_monitor/gpu_monitor_base.hpp"
+
+#include
+
+#include
+
+GPUMonitorBase::GPUMonitorBase(const std::string & node_name, const rclcpp::NodeOptions & options)
+: Node(node_name, options),
+  updater_(this),
+  hostname_{""},
+  temp_warn_(declare_parameter("temp_warn", 90.0)),
+  temp_error_(declare_parameter("temp_error", 95.0)),
+  gpu_usage_warn_(declare_parameter("gpu_usage_warn", 0.90)),
+  gpu_usage_error_(declare_parameter("gpu_usage_error", 1.00)),
+  memory_usage_warn_(declare_parameter("memory_usage_warn", 0.95)),
+  memory_usage_error_(declare_parameter("memory_usage_error", 0.99))
+{
+  gethostname(hostname_, sizeof(hostname_));
+
+  updater_.setHardwareID(hostname_);
+  updater_.add("GPU Temperature", this, &GPUMonitorBase::checkTemp);
+  updater_.add("GPU Usage", this, &GPUMonitorBase::checkUsage);
+  updater_.add("GPU Memory Usage", this, &GPUMonitorBase::checkMemoryUsage);
+  updater_.add("GPU Thermal Throttling", this, &GPUMonitorBase::checkThrottling);
+  updater_.add("GPU Frequency", this, &GPUMonitorBase::checkFrequency);
+}
+
+void GPUMonitorBase::update() { updater_.force_update(); }  // run all registered tasks now
+
+void GPUMonitorBase::shut_down()
+{ /* No-op by default. */
+}
+
+void GPUMonitorBase::checkTemp(diagnostic_updater::DiagnosticStatusWrapper & /* stat */)
+{
+  RCLCPP_INFO_ONCE(get_logger(), "GPUMonitorBase::checkTemp not implemented.");
+}
+
+void GPUMonitorBase::checkUsage(diagnostic_updater::DiagnosticStatusWrapper & /* stat */)
+{
+  RCLCPP_INFO_ONCE(get_logger(), "GPUMonitorBase::checkUsage not implemented.");
+}
+
+void GPUMonitorBase::checkMemoryUsage(diagnostic_updater::DiagnosticStatusWrapper & /* stat */)
+{
+  RCLCPP_INFO_ONCE(get_logger(), "GPUMonitorBase::checkMemoryUsage not implemented.");
+}
+
+void GPUMonitorBase::checkThrottling(diagnostic_updater::DiagnosticStatusWrapper & /* stat */)
+{
+  RCLCPP_INFO_ONCE(get_logger(), "GPUMonitorBase::checkThrottling not implemented.");
+}
+
+void GPUMonitorBase::checkFrequency(diagnostic_updater::DiagnosticStatusWrapper & /* stat */)
+{
+  RCLCPP_INFO_ONCE(get_logger(), "GPUMonitorBase::checkFrequency not implemented.");
+}
diff --git a/system/system_monitor/src/gpu_monitor/nvml_gpu_monitor.cpp b/system/system_monitor/src/gpu_monitor/nvml_gpu_monitor.cpp
new file mode 100644
index 0000000000000..19026717bc77b
--- /dev/null
+++ b/system/system_monitor/src/gpu_monitor/nvml_gpu_monitor.cpp
@@ -0,0 +1,401 @@
+// Copyright 2020 Autoware Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+ * @file nvml_gpu_monitor.cpp
+ * @brief GPU monitor class
+ */
+
+#include "system_monitor/gpu_monitor/nvml_gpu_monitor.hpp"
+
+#include "system_monitor/system_monitor_utility.hpp"
+
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+GPUMonitor::GPUMonitor(const rclcpp::NodeOptions & options) : GPUMonitorBase("gpu_monitor", options)
+{
+  // Include frequency into GPU Thermal Throttling thus remove.
+ updater_.removeByName("GPU Frequency"); + + nvmlReturn_t ret = nvmlInit(); + if (ret != NVML_SUCCESS) { + RCLCPP_ERROR(this->get_logger(), "Failed to initialize NVML: %s\n", nvmlErrorString(ret)); + } + + unsigned int deviceCount = 0; + ret = nvmlDeviceGetCount(&deviceCount); + if (ret != NVML_SUCCESS) { + RCLCPP_ERROR( + this->get_logger(), "Failed to retrieve the number of compute devices: %s", + nvmlErrorString(ret)); + } + + for (unsigned int index = 0; index < deviceCount; ++index) { + gpu_info info{}; + ret = nvmlDeviceGetHandleByIndex(index, &info.device); + if (ret != NVML_SUCCESS) { + RCLCPP_ERROR( + this->get_logger(), "Failed to acquire the handle for a particular device [%d]: %s", index, + nvmlErrorString(ret)); + continue; + } + ret = nvmlDeviceGetName(info.device, info.name, NVML_DEVICE_NAME_BUFFER_SIZE); + if (ret != NVML_SUCCESS) { + RCLCPP_ERROR( + this->get_logger(), "Failed to retrieve the name of this device [%d]: %s", index, + nvmlErrorString(ret)); + continue; + } + ret = nvmlDeviceGetPciInfo(info.device, &info.pci); + if (ret != NVML_SUCCESS) { + RCLCPP_ERROR( + this->get_logger(), "Failed to retrieve the PCI attributes [%d]: %s", index, + nvmlErrorString(ret)); + continue; + } + gpus_.push_back(info); + } +} + +void GPUMonitor::shut_down() +{ + nvmlReturn_t ret = nvmlShutdown(); + if (ret != NVML_SUCCESS) { + RCLCPP_ERROR(this->get_logger(), "Failed to shut down NVML: %s", nvmlErrorString(ret)); + } +} + +void GPUMonitor::checkTemp(diagnostic_updater::DiagnosticStatusWrapper & stat) +{ + // Remember start time to measure elapsed time + const auto t_start = SystemMonitorUtility::startMeasurement(); + + int level = DiagStatus::OK; + int index = 0; + nvmlReturn_t ret{}; + + if (gpus_.empty()) { + stat.summary(DiagStatus::ERROR, "gpu not found"); + return; + } + + for (auto itr = gpus_.begin(); itr != gpus_.end(); ++itr, ++index) { + unsigned int temp = 0; + ret = nvmlDeviceGetTemperature(itr->device, NVML_TEMPERATURE_GPU, &temp); + if (ret != 
NVML_SUCCESS) { + stat.summary(DiagStatus::ERROR, "Failed to retrieve the current temperature"); + stat.add(fmt::format("GPU {}: name", index), itr->name); + stat.add(fmt::format("GPU {}: bus-id", index), itr->pci.busId); + stat.add(fmt::format("GPU {}: content", index), nvmlErrorString(ret)); + return; + } + + level = DiagStatus::OK; + stat.addf(itr->name, "%d.0 DegC", temp); + if (temp >= temp_error_) { + level = std::max(level, static_cast(DiagStatus::ERROR)); + } else if (temp >= temp_warn_) { + level = std::max(level, static_cast(DiagStatus::WARN)); + } + } + + stat.summary(level, temp_dict_.at(level)); + + // Measure elapsed time since start time and report + SystemMonitorUtility::stopMeasurement(t_start, stat); +} + +void GPUMonitor::checkUsage(diagnostic_updater::DiagnosticStatusWrapper & stat) +{ + // Remember start time to measure elapsed time + const auto t_start = SystemMonitorUtility::startMeasurement(); + + int level = DiagStatus::OK; + int whole_level = DiagStatus::OK; + int index = 0; + nvmlReturn_t ret{}; + + if (gpus_.empty()) { + stat.summary(DiagStatus::ERROR, "gpu not found"); + return; + } + + for (auto itr = gpus_.begin(); itr != gpus_.end(); ++itr, ++index) { + ret = nvmlDeviceGetUtilizationRates(itr->device, &itr->utilization); + if (ret != NVML_SUCCESS) { + stat.summary(DiagStatus::ERROR, "Failed to retrieve the current utilization rates"); + stat.add(fmt::format("GPU {}: name", index), itr->name); + stat.add(fmt::format("GPU {}: bus-id", index), itr->pci.busId); + stat.add(fmt::format("GPU {}: content", index), nvmlErrorString(ret)); + return; + } + + level = DiagStatus::OK; + float usage = static_cast(itr->utilization.gpu) / 100.0; + if (usage >= gpu_usage_error_) { + level = std::max(level, static_cast(DiagStatus::ERROR)); + } else if (usage >= gpu_usage_warn_) { + level = std::max(level, static_cast(DiagStatus::WARN)); + } + + stat.add(fmt::format("GPU {}: status", index), load_dict_.at(level)); + stat.add(fmt::format("GPU {}: name", 
index), itr->name); + stat.addf(fmt::format("GPU {}: usage", index), "%d.0%%", itr->utilization.gpu); + + addProcessUsage(index, itr->device, stat); + + whole_level = std::max(whole_level, level); + } + + stat.summary(whole_level, load_dict_.at(whole_level)); + + // Measure elapsed time since start time and report + SystemMonitorUtility::stopMeasurement(t_start, stat); +} + +void GPUMonitor::addProcessUsage( + int index, nvmlDevice_t device, diagnostic_updater::DiagnosticStatusWrapper & stat) +{ + nvmlReturn_t ret{}; + std::list running_pid_list; + + // Get Compute Process ID + uint32_t info_count = MAX_ARRAY_SIZE; + std::unique_ptr infos; + infos = std::make_unique(MAX_ARRAY_SIZE); + ret = nvmlDeviceGetComputeRunningProcesses_v2(device, &info_count, infos.get()); + if (ret != NVML_SUCCESS) { + RCLCPP_WARN( + this->get_logger(), "Failed to nvmlDeviceGetComputeRunningProcesses_v2 NVML: %s", + nvmlErrorString(ret)); + return; + } + for (uint32_t cnt = 0; cnt < info_count; ++cnt) { + running_pid_list.push_back(infos[cnt].pid); + } + + // Get Graphics Process ID + info_count = MAX_ARRAY_SIZE; + infos = std::make_unique(MAX_ARRAY_SIZE); + ret = nvmlDeviceGetGraphicsRunningProcesses_v2(device, &info_count, infos.get()); + if (ret != NVML_SUCCESS) { + RCLCPP_WARN( + this->get_logger(), "Failed to nvmlDeviceGetGraphicsRunningProcesses_v2 NVML: %s", + nvmlErrorString(ret)); + return; + } + for (uint32_t cnt = 0; cnt < info_count; ++cnt) { + running_pid_list.push_back(infos[cnt].pid); + } + + // Get util_count(1st call of nvmlDeviceGetProcessUtilization) + uint32_t util_count = 0; + ret = nvmlDeviceGetProcessUtilization(device, NULL, &util_count, current_timestamp_); + // This function result will not succeed, because arg[util_count(in)] is 0. 
+ if (ret != NVML_ERROR_INSUFFICIENT_SIZE) { + RCLCPP_WARN( + this->get_logger(), "Failed to nvmlDeviceGetProcessUtilization(1st) NVML: %s", + nvmlErrorString(ret)); + return; + } + // Check util_count + if (util_count <= 0) { + RCLCPP_WARN(this->get_logger(), "Illegal util_count: %d", util_count); + return; + } + + // Get utils data(2nd call of nvmlDeviceGetProcessUtilization) + std::unique_ptr utils; + utils = std::make_unique(util_count); + ret = nvmlDeviceGetProcessUtilization(device, utils.get(), &util_count, current_timestamp_); + if (ret != NVML_SUCCESS) { + RCLCPP_WARN( + this->get_logger(), "Failed to nvmlDeviceGetProcessUtilization(2nd) NVML: %s", + nvmlErrorString(ret)); + return; + } + + // Add data to diagnostic + int add_cnt = 0; + for (uint32_t cnt = 0; cnt < util_count; ++cnt) { + for (auto pid : running_pid_list) { + // PID check, because it contains illegal PID data. ex) PID:0 + if (utils[cnt].pid == pid) { + char name[MAX_NAME_LENGTH + 1] = {}; + nvmlSystemGetProcessName(utils[cnt].pid, name, MAX_NAME_LENGTH); + stat.add(fmt::format("GPU {0}: process {1}: pid", index, add_cnt), utils[cnt].pid); + stat.add(fmt::format("GPU {0}: process {1}: name", index, add_cnt), name); + stat.addf( + fmt::format("GPU {0}: process {1}: usage", index, add_cnt), "%ld.0%%", + ((utils[cnt].smUtil != UINT32_MAX) ? 
utils[cnt].smUtil : 0)); + ++add_cnt; + break; + } + } + } + + // Update timestamp(usec) + rclcpp::Clock system_clock(RCL_SYSTEM_TIME); + current_timestamp_ = system_clock.now().nanoseconds() / 1000; +} + +void GPUMonitor::checkMemoryUsage(diagnostic_updater::DiagnosticStatusWrapper & stat) +{ + // Remember start time to measure elapsed time + const auto t_start = SystemMonitorUtility::startMeasurement(); + + int level = DiagStatus::OK; + int whole_level = DiagStatus::OK; + int index = 0; + nvmlReturn_t ret{}; + + if (gpus_.empty()) { + stat.summary(DiagStatus::ERROR, "gpu not found"); + return; + } + + for (auto itr = gpus_.begin(); itr != gpus_.end(); ++itr, ++index) { + nvmlMemory_t memory; + ret = nvmlDeviceGetMemoryInfo(itr->device, &memory); + if (ret != NVML_SUCCESS) { + stat.summary( + DiagStatus::ERROR, "Failed to retrieve the amount of used, free and total memory"); + stat.add(fmt::format("GPU {}: name", index), itr->name); + stat.add(fmt::format("GPU {}: bus-id", index), itr->pci.busId); + stat.add(fmt::format("GPU {}: content", index), nvmlErrorString(ret)); + return; + } + + level = DiagStatus::OK; + float usage = static_cast(itr->utilization.memory) / 100.0; + if (usage >= memory_usage_error_) { + level = std::max(level, static_cast(DiagStatus::ERROR)); + } else if (usage >= memory_usage_warn_) { + level = std::max(level, static_cast(DiagStatus::WARN)); + } + + stat.add(fmt::format("GPU {}: status", index), load_dict_.at(level)); + stat.add(fmt::format("GPU {}: name", index), itr->name); + stat.addf(fmt::format("GPU {}: usage", index), "%d.0%%", itr->utilization.memory); + stat.add(fmt::format("GPU {}: total", index), toHumanReadable(memory.total)); + stat.add(fmt::format("GPU {}: used", index), toHumanReadable(memory.used)); + stat.add(fmt::format("GPU {}: free", index), toHumanReadable(memory.free)); + + whole_level = std::max(whole_level, level); + } + + stat.summary(whole_level, load_dict_.at(whole_level)); + + // Measure elapsed time since start 
time and report + SystemMonitorUtility::stopMeasurement(t_start, stat); +} + +void GPUMonitor::checkThrottling(diagnostic_updater::DiagnosticStatusWrapper & stat) +{ + // Remember start time to measure elapsed time + const auto t_start = SystemMonitorUtility::startMeasurement(); + + int level = DiagStatus::OK; + int whole_level = DiagStatus::OK; + int index = 0; + nvmlReturn_t ret{}; + std::vector reasons; + + if (gpus_.empty()) { + stat.summary(DiagStatus::ERROR, "gpu not found"); + return; + } + + for (auto itr = gpus_.begin(); itr != gpus_.end(); ++itr, ++index) { + unsigned int clock = 0; + ret = nvmlDeviceGetClockInfo(itr->device, NVML_CLOCK_GRAPHICS, &clock); + if (ret != NVML_SUCCESS) { + stat.summary(DiagStatus::ERROR, "Failed to retrieve the current clock speeds"); + stat.add(fmt::format("GPU {}: name", index), itr->name); + stat.add(fmt::format("GPU {}: bus-id", index), itr->pci.busId); + stat.add(fmt::format("GPU {}: content", index), nvmlErrorString(ret)); + return; + } + + unsigned long long clocksThrottleReasons = 0LL; // NOLINT + ret = nvmlDeviceGetCurrentClocksThrottleReasons(itr->device, &clocksThrottleReasons); + if (ret != NVML_SUCCESS) { + stat.summary(DiagStatus::ERROR, "Failed to retrieve current clocks throttling reasons"); + stat.add(fmt::format("GPU {}: name", index), itr->name); + stat.add(fmt::format("GPU {}: bus-id", index), itr->pci.busId); + stat.add(fmt::format("GPU {}: content", index), nvmlErrorString(ret)); + return; + } + + while (clocksThrottleReasons) { + unsigned long long flag = clocksThrottleReasons & ((~clocksThrottleReasons) + 1); // NOLINT + clocksThrottleReasons ^= flag; + reasons.emplace_back(reasonToString(flag)); + + switch (flag) { + case nvmlClocksThrottleReasonGpuIdle: + case nvmlClocksThrottleReasonApplicationsClocksSetting: + case nvmlClocksThrottleReasonSwPowerCap: + // we do not treat as error + break; + default: + level = DiagStatus::ERROR; + break; + } + } + + stat.add(fmt::format("GPU {}: status", index), 
throttling_dict_.at(level)); + stat.add(fmt::format("GPU {}: name", index), itr->name); + stat.addf(fmt::format("GPU {}: graphics clock", index), "%d MHz", clock); + + if (reasons.empty()) { + reasons.emplace_back("ReasonNone"); + } + + stat.add(fmt::format("GPU {}: reasons", index), boost::algorithm::join(reasons, ", ")); + + whole_level = std::max(whole_level, level); + } + + stat.summary(whole_level, throttling_dict_.at(whole_level)); + + // Measure elapsed time since start time and report + SystemMonitorUtility::stopMeasurement(t_start, stat); +} + +std::string GPUMonitor::toHumanReadable(unsigned long long size) // NOLINT +{ + const char * units[] = {"B", "K", "M", "G", "T"}; + int count = 0; + double dsize = size; + + while (dsize > 1024) { + dsize /= 1024; + ++count; + } + const char * format = (dsize > 0 && dsize < 10) ? "{:.1f}{}" : "{:.0f}{}"; + return fmt::format(format, dsize, units[count]); +} + +#include +RCLCPP_COMPONENTS_REGISTER_NODE(GPUMonitor) diff --git a/system/system_monitor/src/gpu_monitor/tegra_gpu_monitor.cpp b/system/system_monitor/src/gpu_monitor/tegra_gpu_monitor.cpp new file mode 100644 index 0000000000000..a9ae72321efd1 --- /dev/null +++ b/system/system_monitor/src/gpu_monitor/tegra_gpu_monitor.cpp @@ -0,0 +1,210 @@ +// Copyright 2020 Autoware Foundation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+/**
+ * @file tegra_gpu_monitor.cpp
+ * @brief Tegra GPU monitor class
+ */
+
+#include "system_monitor/gpu_monitor/tegra_gpu_monitor.hpp"
+
+#include "system_monitor/system_monitor_utility.hpp"
+
+#include <boost/filesystem.hpp>
+#include <boost/range/iterator_range.hpp>
+
+#include <fmt/format.h>
+
+#include <algorithm>
+#include <exception>
+#include <regex>
+#include <string>
+#include <vector>
+
+namespace fs = boost::filesystem;
+
+/**
+ * @brief Collect the temperature, load and frequency files and drop the unsupported checks.
+ * @param [in] options node options forwarded to GPUMonitorBase
+ */
+GPUMonitor::GPUMonitor(const rclcpp::NodeOptions & options) : GPUMonitorBase("gpu_monitor", options)
+{
+  getTempNames();
+  getLoadNames();
+  getFreqNames();
+
+  // There is no separate gpu memory in tegra. Both cpu and gpu uses cpu memory. thus remove.
+  updater_.removeByName("GPU Memory Usage");
+  // There is no event record for thermal throttling.
+  // Need to manually monitor temperature to figure out if thermal limits crossed or not.
+  updater_.removeByName("GPU Thermal Throttling");
+}
+
+/**
+ * @brief Read every temperature file and summarize with the worst level found.
+ * A file that cannot be opened turns the summary into an error.
+ * @param [out] stat diagnostic status to fill
+ */
+void GPUMonitor::checkTemp(diagnostic_updater::DiagnosticStatusWrapper & stat)
+{
+  if (temps_.empty()) {
+    stat.summary(DiagStatus::ERROR, "temperature files not found");
+    return;
+  }
+
+  int level = DiagStatus::OK;
+  int whole_level = DiagStatus::OK;
+  std::string error_str;
+
+  for (const auto & itr : temps_) {
+    // Read temperature file
+    const fs::path path(itr.path_);
+    fs::ifstream ifs(path, std::ios::in);
+    if (!ifs) {
+      stat.add("file open error", itr.path_);
+      error_str = "file open error";
+      continue;
+    }
+
+    float temp{};
+    ifs >> temp;
+    ifs.close();
+    // The raw value is divided by 1000 before being reported as DegC
+    temp /= 1000;
+    stat.addf(itr.label_, "%.1f DegC", temp);
+
+    level = DiagStatus::OK;
+    if (temp >= temp_error_) {
+      level = std::max(level, static_cast<int>(DiagStatus::ERROR));
+    } else if (temp >= temp_warn_) {
+      level = std::max(level, static_cast<int>(DiagStatus::WARN));
+    }
+
+    // level is reset for every file, so the worst one has to be tracked separately;
+    // otherwise the summary would only reflect the last file in the list.
+    whole_level = std::max(whole_level, level);
+  }
+
+  if (!error_str.empty()) {
+    stat.summary(DiagStatus::ERROR, error_str);
+  } else {
+    stat.summary(whole_level, temp_dict_.at(whole_level));
+  }
+}
+
+/**
+ * @brief Read every load file and summarize with the worst level found.
+ * A file that cannot be opened turns the summary into an error.
+ * @param [out] stat diagnostic status to fill
+ */
+void GPUMonitor::checkUsage(diagnostic_updater::DiagnosticStatusWrapper & stat)
+{
+  if (loads_.empty()) {
+    stat.summary(DiagStatus::ERROR, "load files not found");
+    return;
+  }
+
+  int level = DiagStatus::OK;
+  int whole_level = DiagStatus::OK;
+  std::string error_str;
+
+  for (const auto & itr : loads_) {
+    // Read load file
+    const fs::path path(itr.path_);
+    fs::ifstream ifs(path, std::ios::in);
+    if (!ifs) {
+      stat.add("file open error", itr.path_);
+      error_str = "file open error";
+      continue;
+    }
+
+    float load{};
+    ifs >> load;
+    ifs.close();
+    // Raw value / 10 is reported as a percentage, raw value / 1000 is compared as a ratio
+    stat.addf(itr.label_, "%.1f%%", load / 10);
+
+    level = DiagStatus::OK;
+    load /= 1000;
+    if (load >= gpu_usage_error_) {
+      level = std::max(level, static_cast<int>(DiagStatus::ERROR));
+    } else if (load >= gpu_usage_warn_) {
+      level = std::max(level, static_cast<int>(DiagStatus::WARN));
+    }
+
+    // Same reasoning as in checkTemp(): keep the worst level over all files
+    whole_level = std::max(whole_level, level);
+  }
+
+  if (!error_str.empty()) {
+    stat.summary(DiagStatus::ERROR, error_str);
+  } else {
+    stat.summary(whole_level, load_dict_.at(whole_level));
+  }
+}
+
+/**
+ * @brief Thermal throttling check; not implemented (the task is removed in the constructor).
+ */
+void GPUMonitor::checkThrottling(diagnostic_updater::DiagnosticStatusWrapper & /* stat */)
+{
+  // The parameter is left unnamed because the file is built with -Wextra -Werror
+  // TODO(Fumihito Ito): implement me
+}
+
+/**
+ * @brief Report the current value of every collected cur_freq file in MHz.
+ * Files that cannot be opened or read are skipped.
+ * @param [out] stat diagnostic status to fill
+ */
+void GPUMonitor::checkFrequency(diagnostic_updater::DiagnosticStatusWrapper & stat)
+{
+  if (freqs_.empty()) {
+    stat.summary(DiagStatus::ERROR, "frequency files not found");
+    return;
+  }
+
+  for (const auto & freq : freqs_) {
+    // Read cur_freq file
+    const fs::path path(freq.path_);
+    fs::ifstream ifs(path, std::ios::in);
+    if (ifs) {
+      std::string line;
+      if (std::getline(ifs, line)) {
+        try {
+          // A non-numeric or out-of-range line must not throw out of the diagnostic task
+          const int mhz = static_cast<int>(std::stoll(line) / 1000000);
+          stat.addf(fmt::format("GPU {}: clock", freq.label_), "%d MHz", mhz);
+        } catch (const std::exception & e) {
+          stat.add(fmt::format("GPU {}: clock", freq.label_), e.what());
+        }
+      }
+    }
+    ifs.close();
+  }
+
+  stat.summary(DiagStatus::OK, "OK");
+}
+
+/**
+ * @brief Fill temps_ with the thermal zones reported for "GPU-therm".
+ */
+void GPUMonitor::getTempNames()
+{
+  // Jetson TX1 TX2 Nano: thermal_zone1, Xavier: thermal_zone0
+  // NOTE(review): element type restored as thermal_zone - confirm against
+  // SystemMonitorUtility::getThermalZone.
+  std::vector<thermal_zone> therms;
+  SystemMonitorUtility::getThermalZone("GPU-therm", &therms);
+
+  for (const auto & therm : therms) {
+    temps_.emplace_back(therm.label_, therm.path_);
+  }
+}
+
+/**
+ * @brief Fill loads_ with the "load" file of every /sys/devices/gpu.N directory.
+ */
+void GPUMonitor::getLoadNames()
+{
+  const fs::path root("/sys/devices");
+  // Built once instead of once per directory entry
+  const std::regex gpu_dir_pattern(".*gpu\\.(\\d+)");
+
+  for (const fs::path & path :
+       boost::make_iterator_range(fs::directory_iterator(root), fs::directory_iterator())) {
+    if (!fs::is_directory(path)) {
+      continue;
+    }
+
+    // Keep the string alive: taking c_str() of the temporary returned by generic_string()
+    // left a dangling pointer that was then handed to std::regex_match.
+    const std::string str_path = path.generic_string();
+
+    // /sys/devices/gpu.[0-9] ?
+    if (!std::regex_match(str_path, gpu_dir_pattern)) {
+      continue;
+    }
+
+    // /sys/devices/gpu.[0-9]/load
+    const fs::path load_path = path / "load";
+    loads_.emplace_back(path.filename().generic_string(), load_path.generic_string());
+  }
+}
+
+/**
+ * @brief Fill freqs_ with the "cur_freq" file of every directory below /sys/class/devfreq.
+ */
+void GPUMonitor::getFreqNames()
+{
+  const fs::path root("/sys/class/devfreq");
+
+  for (const fs::path & path :
+       boost::make_iterator_range(fs::directory_iterator(root), fs::directory_iterator())) {
+    // /sys/class/devfreq/?????/cur_freq ?
+    if (!fs::is_directory(path)) {
+      continue;
+    }
+
+    const fs::path freq_path = path / "cur_freq";
+    freqs_.emplace_back(path.filename().generic_string(), freq_path.generic_string());
+  }
+}
+
+// Register GPUMonitor as a component; the macro is provided by the header included right here.
+#include <rclcpp_components/register_node_macro.hpp>
+RCLCPP_COMPONENTS_REGISTER_NODE(GPUMonitor)
diff --git a/system/system_monitor/src/gpu_monitor/unknown_gpu_monitor.cpp b/system/system_monitor/src/gpu_monitor/unknown_gpu_monitor.cpp
new file mode 100644
index 0000000000000..5043e75ec7c68
--- /dev/null
+++ b/system/system_monitor/src/gpu_monitor/unknown_gpu_monitor.cpp
@@ -0,0 +1,29 @@
+// Copyright 2020 Autoware Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+ * @file unknown_gpu_monitor.cpp
+ * @brief Unknown GPU monitor class
+ */
+
+#include "system_monitor/gpu_monitor/unknown_gpu_monitor.hpp"
+
+#include <string>
+
+/**
+ * @brief Construct the GPU monitor used when no platform-specific implementation is selected.
+ * It only forwards to GPUMonitorBase under the node name "gpu_monitor".
+ * @param [in] options node options forwarded to GPUMonitorBase
+ */
+GPUMonitor::GPUMonitor(const rclcpp::NodeOptions & options) : GPUMonitorBase("gpu_monitor", options)
+{
+}
+
+// Register GPUMonitor as a component; the macro is provided by the header included right here.
+#include <rclcpp_components/register_node_macro.hpp>
+RCLCPP_COMPONENTS_REGISTER_NODE(GPUMonitor)
diff --git a/system/system_monitor/src/hdd_monitor/hdd_monitor.cpp b/system/system_monitor/src/hdd_monitor/hdd_monitor.cpp
new file mode 100644
index 0000000000000..249b9f93786ab
--- /dev/null
+++ b/system/system_monitor/src/hdd_monitor/hdd_monitor.cpp
@@ -0,0 +1,304 @@
+// Copyright 2020 Autoware Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+ * @file hdd_monitor.cpp
+ * @brief HDD monitor class
+ */
+
+#include "system_monitor/hdd_monitor/hdd_monitor.hpp"
+
+#include "system_monitor/system_monitor_utility.hpp"
+
+#include <hdd_reader/hdd_reader.hpp>
+
+#include <boost/algorithm/string.hpp>
+#include <boost/archive/text_iarchive.hpp>
+#include <boost/archive/text_oarchive.hpp>
+#include <boost/process.hpp>
+#include <boost/serialization/vector.hpp>
+
+#include <fmt/format.h>
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+namespace bp = boost::process;
+
+/**
+ * @brief Read the disk parameters and register the temperature and usage diagnostic tasks.
+ * @param [in] options node options passed to the underlying node
+ */
+HDDMonitor::HDDMonitor(const rclcpp::NodeOptions & options)
+: Node("hdd_monitor", options),
+  updater_(this),
+  hdd_reader_port_(declare_parameter("hdd_reader_port", 7635))
+{
+  gethostname(hostname_, sizeof(hostname_));
+
+  getHDDParams();
+
+  updater_.setHardwareID(hostname_);
+  updater_.add("HDD Temperature", this, &HDDMonitor::checkTemp);
+  updater_.add("HDD Usage", this, &HDDMonitor::checkUsage);
+}
+
+/**
+ * @brief Force the diagnostic updater to run its registered tasks now.
+ */
+void HDDMonitor::update() { updater_.force_update(); }
+
+/**
+ * @brief Ask the hdd_reader service for the temperature of every configured disk.
+ * The device list is sent over a TCP socket and the serialized answer is read back;
+ * any socket or decoding failure is reported as an error summary.
+ * @param [out] stat diagnostic status to fill
+ */
+void HDDMonitor::checkTemp(diagnostic_updater::DiagnosticStatusWrapper & stat)
+{
+  // Remember start time to measure elapsed time
+  const auto t_start = SystemMonitorUtility::startMeasurement();
+
+  if (hdd_params_.empty()) {
+    stat.summary(DiagStatus::ERROR, "invalid disk parameter");
+    return;
+  }
+
+  // Create a new socket
+  int sock = socket(AF_INET, SOCK_STREAM, 0);
+  if (sock < 0) {
+    stat.summary(DiagStatus::ERROR, "socket error");
+    stat.add("socket", strerror(errno));
+    return;
+  }
+
+  // Specify the receiving timeouts until reporting an error
+  struct timeval tv;
+  tv.tv_sec = 10;
+  tv.tv_usec = 0;
+  int ret = setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
+  if (ret < 0) {
+    stat.summary(DiagStatus::ERROR, "setsockopt error");
+    stat.add("setsockopt", strerror(errno));
+    close(sock);
+    return;
+  }
+
+  // Connect the socket referred to by the file descriptor
+  sockaddr_in addr;
+  memset(&addr, 0, sizeof(sockaddr_in));
+  addr.sin_family = AF_INET;
+  addr.sin_port = htons(hdd_reader_port_);
+  addr.sin_addr.s_addr = htonl(INADDR_ANY);
+  ret = connect(sock, (struct sockaddr *)&addr, sizeof(addr));
+  if (ret < 0) {
+    stat.summary(DiagStatus::ERROR, "connect error");
+    stat.add("connect", strerror(errno));
+    close(sock);
+    return;
+  }
+
+  std::ostringstream oss;
+  boost::archive::text_oarchive oa(oss);
+  oa & hdd_devices_;
+
+  // Write list of devices to FD
+  ret = write(sock, oss.str().c_str(), oss.str().length());
+  if (ret < 0) {
+    stat.summary(DiagStatus::ERROR, "write error");
+    stat.add("write", strerror(errno));
+    RCLCPP_ERROR(get_logger(), "write error");
+    close(sock);
+    return;
+  }
+
+  // Receive messages from a socket
+  // The buffer is zero-filled and one byte is held back, so it stays NUL-terminated
+  char buf[1024] = "";
+  ret = recv(sock, buf, sizeof(buf) - 1, 0);
+  if (ret < 0) {
+    stat.summary(DiagStatus::ERROR, "recv error");
+    stat.add("recv", strerror(errno));
+    close(sock);
+    return;
+  }
+  // No data received
+  if (ret == 0) {
+    stat.summary(DiagStatus::ERROR, "recv error");
+    stat.add("recv", "No data received");
+    close(sock);
+    return;
+  }
+
+  // Close the file descriptor FD
+  ret = close(sock);
+  if (ret < 0) {
+    stat.summary(DiagStatus::ERROR, "close error");
+    stat.add("close", strerror(errno));
+    return;
+  }
+
+  // Restore HDD information list
+  HDDInfoList list;
+
+  try {
+    std::istringstream iss(buf);
+    boost::archive::text_iarchive ia(iss);
+    ia >> list;
+  } catch (const std::exception & e) {
+    stat.summary(DiagStatus::ERROR, "recv error");
+    stat.add("recv", e.what());
+    return;
+  }
+
+  int level = DiagStatus::OK;
+  int whole_level = DiagStatus::OK;
+  int index = 0;
+  std::string error_str = "";
+
+  for (auto itr = hdd_params_.begin(); itr != hdd_params_.end(); ++itr, ++index) {
+    // Retrieve HDD information
+    auto itrh = list.find(itr->first);
+    if (itrh == list.end()) {
+      stat.add(fmt::format("HDD {}: status", index), "hdd_reader error");
+      stat.add(fmt::format("HDD {}: name", index), itr->first.c_str());
+      stat.add(fmt::format("HDD {}: hdd_reader", index), strerror(ENOENT));
+      error_str = "hdd_reader error";
+      continue;
+    }
+
+    if (itrh->second.error_code_ != 0) {
+      stat.add(fmt::format("HDD {}: status", index), "hdd_reader error");
+      stat.add(fmt::format("HDD {}: name", index), itr->first.c_str());
+      stat.add(fmt::format("HDD {}: hdd_reader", index), strerror(itrh->second.error_code_));
+      error_str = "hdd_reader error";
+      continue;
+    }
+
+    float temp = static_cast<float>(itrh->second.temp_);
+
+    level = DiagStatus::OK;
+    if (temp >= itr->second.temp_error_) {
+      level = DiagStatus::ERROR;
+    } else if (temp >= itr->second.temp_warn_) {
+      level = DiagStatus::WARN;
+    }
+
+    stat.add(fmt::format("HDD {}: status", index), temp_dict_.at(level));
+    stat.add(fmt::format("HDD {}: name", index), itr->first.c_str());
+    stat.add(fmt::format("HDD {}: model", index), itrh->second.model_.c_str());
+    stat.add(fmt::format("HDD {}: serial", index), itrh->second.serial_.c_str());
+    stat.addf(fmt::format("HDD {}: temperature", index), "%.1f DegC", temp);
+
+    whole_level = std::max(whole_level, level);
+  }
+
+  // A reader error on any disk overrides the threshold-based summary
+  if (!error_str.empty()) {
+    stat.summary(DiagStatus::ERROR, error_str);
+  } else {
+    stat.summary(whole_level, temp_dict_.at(whole_level));
+  }
+
+  // Measure elapsed time since start time and report
+  SystemMonitorUtility::stopMeasurement(t_start, stat);
+}
+
+/**
+ * @brief Run df for every configured disk and report usage of its ext4 filesystems.
+ * @param [out] stat diagnostic status to fill
+ */
+void HDDMonitor::checkUsage(diagnostic_updater::DiagnosticStatusWrapper & stat)
+{
+  // Remember start time to measure elapsed time
+  const auto t_start = SystemMonitorUtility::startMeasurement();
+
+  if (hdd_params_.empty()) {
+    stat.summary(DiagStatus::ERROR, "invalid disk parameter");
+    return;
+  }
+
+  int hdd_index = 0;
+  int whole_level = DiagStatus::OK;
+  std::string error_str = "";
+
+  for (auto itr = hdd_params_.begin(); itr != hdd_params_.end(); ++itr, ++hdd_index) {
+    // Get summary of disk space usage of ext4
+    bp::ipstream is_out;
+    bp::ipstream is_err;
+    // Invoke shell to use shell wildcard expansion
+    // NOTE(review): the disk name comes from node parameters and is placed into the shell
+    // command unescaped - keep it restricted to trusted configuration.
+    bp::child c(
+      "/bin/sh", "-c", fmt::format("df -Pht ext4 {}*", itr->first.c_str()), bp::std_out > is_out,
+      bp::std_err > is_err);
+    c.wait();
+
+    if (c.exit_code() != 0) {
+      std::ostringstream os;
+      is_err >> os.rdbuf();
+      error_str = "df error";
+      stat.add(fmt::format("HDD {}: status", hdd_index), "df error");
+      stat.add(fmt::format("HDD {}: name", hdd_index), itr->first.c_str());
+      stat.add(fmt::format("HDD {}: df", hdd_index), os.str().c_str());
+      continue;
+    }
+
+    int level = DiagStatus::OK;
+    std::string line;
+    int index = 0;
+    std::vector<std::string> list;
+    float usage;
+
+    while (std::getline(is_out, line) && !line.empty()) {
+      // Skip header
+      if (index <= 0) {
+        ++index;
+        continue;
+      }
+
+      boost::split(list, line, boost::is_space(), boost::token_compress_on);
+
+      // Six columns are read below; a shorter line would be indexed out of range
+      if (list.size() < 6) {
+        error_str = "df error";
+        stat.add(fmt::format("HDD {}: status", hdd_index), "df error");
+        stat.add(fmt::format("HDD {}: name", hdd_index), itr->first.c_str());
+        stat.add(fmt::format("HDD {}: df", hdd_index), line);
+        ++index;
+        continue;
+      }
+
+      // The fifth column looks like "42%"; strip the sign and turn it into a ratio
+      usage = std::atof(boost::trim_copy_if(list[4], boost::is_any_of("%")).c_str()) * 1e-2;
+
+      level = DiagStatus::OK;
+      if (usage >= itr->second.usage_error_) {
+        level = DiagStatus::ERROR;
+      } else if (usage >= itr->second.usage_warn_) {
+        level = DiagStatus::WARN;
+      }
+
+      stat.add(fmt::format("HDD {}: status", hdd_index), usage_dict_.at(level));
+      stat.add(fmt::format("HDD {}: filesystem", hdd_index), list[0].c_str());
+      stat.add(fmt::format("HDD {}: size", hdd_index), list[1].c_str());
+      stat.add(fmt::format("HDD {}: used", hdd_index), list[2].c_str());
+      stat.add(fmt::format("HDD {}: avail", hdd_index), list[3].c_str());
+      stat.add(fmt::format("HDD {}: use", hdd_index), list[4].c_str());
+      stat.add(fmt::format("HDD {}: mounted on", hdd_index), list[5].c_str());
+
+      whole_level = std::max(whole_level, level);
+      ++index;
+    }
+  }
+
+  // A df error on any disk overrides the threshold-based summary
+  if (!error_str.empty()) {
+    stat.summary(DiagStatus::ERROR, error_str);
+  } else {
+    stat.summary(whole_level, usage_dict_.at(whole_level));
+  }
+
+  // Measure elapsed time since start time and report
+  SystemMonitorUtility::stopMeasurement(t_start, stat);
+}
+
+/**
+ * @brief Declare the per-disk parameters "disks.disk<i>.*" and fill hdd_params_ / hdd_devices_.
+ */
+void HDDMonitor::getHDDParams()
+{
+  const auto num_disks = this->declare_parameter("num_disks", 0);
+  for (auto i = 0; i < num_disks; ++i) {
+    const auto prefix = "disks.disk" + std::to_string(i);
+    HDDParam param;
+    // NOTE(review): the template arguments were restored as float / std::string from how the
+    // values are used in this file - confirm against the HDDParam declaration.
+    param.temp_warn_ = declare_parameter<float>(prefix + ".temp_warn");
+    param.temp_error_ = declare_parameter<float>(prefix + ".temp_error");
+    param.usage_warn_ = declare_parameter<float>(prefix + ".usage_warn");
+    param.usage_error_ = declare_parameter<float>(prefix + ".usage_error");
+    const auto name = declare_parameter<std::string>(prefix + ".name");
+
+    hdd_params_[name] = param;
+
+    hdd_devices_.push_back(name);
+  }
+}
+
+// Register HDDMonitor as a component; the macro is provided by the header included right here.
+#include <rclcpp_components/register_node_macro.hpp>
+RCLCPP_COMPONENTS_REGISTER_NODE(HDDMonitor)
diff --git a/system/system_monitor/src/mem_monitor/mem_monitor.cpp b/system/system_monitor/src/mem_monitor/mem_monitor.cpp
new file mode 100644
index 0000000000000..c9d3f3e14d515
--- /dev/null
+++ b/system/system_monitor/src/mem_monitor/mem_monitor.cpp
@@ -0,0 +1,137 @@
+// Copyright 2020 Autoware Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+ * @file mem_monitor.cpp
+ * @brief Memory monitor class
+ */
+
+#include "system_monitor/mem_monitor/mem_monitor.hpp"
+
+#include "system_monitor/system_monitor_utility.hpp"
+
+#include <boost/process.hpp>
+
+#include <fmt/format.h>
+
+#include <string>
+#include <vector>
+
+namespace bp = boost::process;
+
+/**
+ * @brief Declare the usage thresholds and register the memory usage diagnostic task.
+ * @param [in] options node options passed to the underlying node
+ */
+MemMonitor::MemMonitor(const rclcpp::NodeOptions & options)
+: Node("mem_monitor", options),
+  updater_(this),
+  usage_warn_(declare_parameter("usage_warn", 0.95)),
+  usage_error_(declare_parameter("usage_error", 0.99))
+{
+  gethostname(hostname_, sizeof(hostname_));
+  updater_.setHardwareID(hostname_);
+  updater_.add("Memory Usage", this, &MemMonitor::checkUsage);
+}
+
+/**
+ * @brief Force the diagnostic updater to run its registered tasks now.
+ */
+void MemMonitor::update() { updater_.force_update(); }
+
+/**
+ * @brief Run `free -tb`, report its rows and rate physical memory usage against the thresholds.
+ * @param [out] stat diagnostic status to fill
+ */
+void MemMonitor::checkUsage(diagnostic_updater::DiagnosticStatusWrapper & stat)
+{
+  // Remember start time to measure elapsed time
+  const auto t_start = SystemMonitorUtility::startMeasurement();
+
+  // Get total amount of free and used memory
+  bp::ipstream is_out;
+  bp::ipstream is_err;
+  bp::child c("free -tb", bp::std_out > is_out, bp::std_err > is_err);
+  c.wait();
+  if (c.exit_code() != 0) {
+    std::ostringstream os;
+    is_err >> os.rdbuf();
+    stat.summary(DiagStatus::ERROR, "free error");
+    stat.add("free", os.str().c_str());
+    return;
+  }
+
+  int level = DiagStatus::OK;
+  std::string line;
+  int index = 0;
+  std::vector<std::string> list;
+  float usage;
+
+  /*
+   Output example of `free -tb`
+
+   list[0]     list[1]   list[2]   list[3]   list[4]  list[5]     list[6]
+   index 0 |               total      used      free    shared  buff/cache  available
+   index 1 | Mem:       32809744  12554780  13090376    292840     7164588   19622092
+   index 2 | Swap:      33554428   1767680  31786748
+   index 3 | Total:     66364172  14322460  44877124
+   */
+  while (std::getline(is_out, line) && !line.empty()) {
+    // Skip header
+    if (index <= 0) {
+      ++index;
+      continue;
+    }
+
+    boost::split(list, line, boost::is_space(), boost::token_compress_on);
+
+    // Label, total, used and free are read from every row; skip rows that lack them
+    if (list.size() < 4) {
+      ++index;
+      continue;
+    }
+
+    // Physical memory
+    if (index == 1) {
+      // This row is also read up to list[6] and divided by its total, so a short row or a
+      // zero total is reported as an error instead of being indexed or divided blindly.
+      const double total = (list.size() >= 7) ? std::atof(list[1].c_str()) : 0.0;
+      if (total <= 0.0) {
+        stat.summary(DiagStatus::ERROR, "free error");
+        stat.add("free", line);
+        return;
+      }
+
+      // available divided by total is available memory including calculation for buff/cache,
+      // so the subtraction of this from 1 gives real usage.
+      usage = 1.0f - std::atof(list[6].c_str()) / total;
+
+      if (usage >= usage_error_) {
+        level = DiagStatus::ERROR;
+      } else if (usage >= usage_warn_) {
+        level = DiagStatus::WARN;
+      }
+
+      stat.addf(fmt::format("{} usage", list[0]), "%.2f%%", usage * 1e+2);
+    }
+
+    stat.add(fmt::format("{} total", list[0]), toHumanReadable(list[1]));
+    stat.add(fmt::format("{} used", list[0]), toHumanReadable(list[2]));
+    stat.add(fmt::format("{} free", list[0]), toHumanReadable(list[3]));
+
+    // Add an additional information for physical memory
+    if (index == 1) {
+      stat.add(fmt::format("{} shared", list[0]), toHumanReadable(list[4]));
+      stat.add(fmt::format("{} buff/cache", list[0]), toHumanReadable(list[5]));
+      stat.add(fmt::format("{} available", list[0]), toHumanReadable(list[6]));
+    }
+    ++index;
+  }
+
+  stat.summary(level, usage_dict_.at(level));
+
+  // Measure elapsed time since start time and report
+  SystemMonitorUtility::stopMeasurement(t_start, stat);
+}
+
+/**
+ * @brief Format a byte count given as a decimal string with a B/K/M/G/T suffix.
+ * One decimal is printed for values between 0 and 10 in the chosen unit, none otherwise.
+ * @param [in] str size in bytes as text
+ * @return formatted string
+ */
+std::string MemMonitor::toHumanReadable(const std::string & str)
+{
+  const char * units[] = {"B", "K", "M", "G", "T"};
+  const int last_unit = static_cast<int>(sizeof(units) / sizeof(units[0])) - 1;
+  int count = 0;
+  double size = std::atol(str.c_str());
+
+  // Stop at the largest unit so that huge values cannot index past the end of units[]
+  while (size > 1024 && count < last_unit) {
+    size /= 1024;
+    ++count;
+  }
+  const char * format = (size > 0 && size < 10) ? "{:.1f}{}" : "{:.0f}{}";
+  return fmt::format(format, size, units[count]);
+}
+
+// Register MemMonitor as a component; the macro is provided by the header included right here.
+#include <rclcpp_components/register_node_macro.hpp>
+RCLCPP_COMPONENTS_REGISTER_NODE(MemMonitor)
diff --git a/system/system_monitor/src/net_monitor/net_monitor.cpp b/system/system_monitor/src/net_monitor/net_monitor.cpp
new file mode 100644
index 0000000000000..1ca0f71081af1
--- /dev/null
+++ b/system/system_monitor/src/net_monitor/net_monitor.cpp
@@ -0,0 +1,233 @@
+// Copyright 2020 Autoware Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+ * @file net_monitor.cpp
+ * @brief Net monitor class
+ */
+
+#include "system_monitor/net_monitor/net_monitor.hpp"
+
+#include "system_monitor/system_monitor_utility.hpp"
+
+#include <boost/range/algorithm.hpp>
+
+#include <fmt/format.h>
+#include <ifaddrs.h>
+#include <linux/ethtool.h>
+#include <linux/if_link.h>
+#include <linux/sockios.h>
+#include <net/if.h>
+#include <sys/ioctl.h>
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+/**
+ * @brief Read parameters, register the network usage check and initialise nl80211
+ * @param [in] options node options forwarded to rclcpp::Node
+ */
+NetMonitor::NetMonitor(const rclcpp::NodeOptions & options)
+: Node("net_monitor", options),
+  updater_(this),
+  last_update_time_{0, 0, this->get_clock()->get_clock_type()},
+  device_params_(
+    declare_parameter<std::vector<std::string>>("devices", std::vector<std::string>())),
+  usage_warn_(declare_parameter("usage_warn", 0.95))
+{
+  gethostname(hostname_, sizeof(hostname_));
+  updater_.setHardwareID(hostname_);
+  updater_.add("Network Usage", this, &NetMonitor::checkUsage);
+
+  nl80211_.init();
+}
+
+/**
+ * @brief Release the nl80211 resources
+ */
+NetMonitor::~NetMonitor() { shutdown_nl80211(); }
+
+/**
+ * @brief Run all registered diagnostic checks immediately
+ */
+void NetMonitor::update() { updater_.force_update(); }
+
+/**
+ * @brief Shut down the nl80211 helper
+ */
+void NetMonitor::shutdown_nl80211() { nl80211_.shutdown(); }
+
+/**
+ * @brief Report status, traffic and usage of every configured network interface
+ * @param [out] stat diagnostic status to fill
+ * @details Only AF_PACKET, non-loopback interfaces listed in `devices` (or all of them when
+ *          the list contains "*") are examined. Traffic is the byte-counter delta since the
+ *          previous call divided by the elapsed time; usage is traffic divided by link speed.
+ */
+void NetMonitor::checkUsage(diagnostic_updater::DiagnosticStatusWrapper & stat)
+{
+  // Remember start time to measure elapsed time
+  const auto t_start = SystemMonitorUtility::startMeasurement();
+
+  if (device_params_.empty()) {
+    stat.summary(DiagStatus::ERROR, "invalid device parameter");
+    return;
+  }
+
+  const struct ifaddrs * ifa;
+  struct ifaddrs * ifas = nullptr;
+
+  rclcpp::Duration duration = this->now() - last_update_time_;
+  const double elapsed_sec = duration.seconds();
+
+  // Get network interfaces
+  if (getifaddrs(&ifas) < 0) {
+    // Keep errno before any other call has a chance to overwrite it
+    const int err = errno;
+    stat.summary(DiagStatus::ERROR, "getifaddrs error");
+    stat.add("getifaddrs", strerror(err));
+    return;
+  }
+
+  int level = DiagStatus::OK;
+  int whole_level = DiagStatus::OK;
+  int index = 0;
+  std::string error_str;
+  std::vector<std::string> interface_names;
+
+  for (ifa = ifas; ifa; ifa = ifa->ifa_next) {
+    // Skip no addr
+    if (!ifa->ifa_addr) {
+      continue;
+    }
+    // Skip loopback
+    if (ifa->ifa_flags & IFF_LOOPBACK) {
+      continue;
+    }
+    // Skip non AF_PACKET
+    if (ifa->ifa_addr->sa_family != AF_PACKET) {
+      continue;
+    }
+    // Skip device not specified
+    if (
+      boost::find(device_params_, ifa->ifa_name) == device_params_.end() &&
+      boost::find(device_params_, "*") == device_params_.end()) {
+      continue;
+    }
+
+    // Emit the entries for an interface whose query failed and account for it.
+    // ENOTSUP is reported as "Not Supported" and does not mark the whole check as failed.
+    const auto report_failure =
+      [&](const std::string & key, const std::string & summary, const int err) {
+        if (err == ENOTSUP) {
+          stat.add(fmt::format("Network {}: status", index), "Not Supported");
+        } else {
+          stat.add(fmt::format("Network {}: status", index), "Error");
+          error_str = summary;
+        }
+        stat.add(fmt::format("Network {}: interface name", index), ifa->ifa_name);
+        stat.add(key, strerror(err));
+        ++index;
+        interface_names.push_back(ifa->ifa_name);
+      };
+
+    // Zero-initialise so that ifr_name stays NUL-terminated after the bounded strncpy below
+    struct ifreq ifrm = {};
+    struct ifreq ifrc = {};
+    struct ethtool_cmd edata = {};
+
+    // Values are per interface: start from zero so that figures of a previously processed
+    // interface are never reported for this one
+    float rx_traffic{0.0};
+    float tx_traffic{0.0};
+    float rx_usage{0.0};
+    float tx_usage{0.0};
+
+    // Get MTU information
+    const int fd = socket(AF_INET, SOCK_DGRAM, 0);
+    if (fd < 0) {
+      report_failure("socket", "socket error", errno);
+      continue;
+    }
+    strncpy(ifrm.ifr_name, ifa->ifa_name, IFNAMSIZ - 1);
+    if (ioctl(fd, SIOCGIFMTU, &ifrm) < 0) {
+      const int err = errno;
+      close(fd);
+      report_failure("ioctl(SIOCGIFMTU)", "ioctl error", err);
+      continue;
+    }
+
+    // Get network capacity
+    float speed = 0.0;
+    strncpy(ifrc.ifr_name, ifa->ifa_name, IFNAMSIZ - 1);
+    ifrc.ifr_data = (caddr_t)&edata;
+    edata.cmd = ETHTOOL_GSET;
+    if (ioctl(fd, SIOCETHTOOL, &ifrc) < 0) {
+      // Remember why ethtool failed; the nl80211 query below may change errno
+      const int err = errno;
+      // possibly wireless connection, get bitrate(MBit/s)
+      speed = nl80211_.getBitrate(ifa->ifa_name);
+      if (speed <= 0) {
+        close(fd);
+        report_failure("ioctl(SIOCETHTOOL)", "ioctl error", err);
+        continue;
+      }
+    } else {
+      speed = edata.speed;
+    }
+
+    level = (ifa->ifa_flags & IFF_RUNNING) ? DiagStatus::OK : DiagStatus::ERROR;
+
+    const auto * stats = static_cast<const struct rtnl_link_stats *>(ifa->ifa_data);
+    const auto previous = bytes_.find(ifa->ifa_name);
+    // Traffic needs an earlier sample and a positive time span to divide by
+    if (previous != bytes_.end() && elapsed_sec > 0.0) {
+      rx_traffic = toMbit(stats->rx_bytes - previous->second.rx_bytes) / elapsed_sec;
+      tx_traffic = toMbit(stats->tx_bytes - previous->second.tx_bytes) / elapsed_sec;
+      // Usage is only meaningful for a known, positive link speed
+      if (speed > 0) {
+        rx_usage = rx_traffic / speed;
+        tx_usage = tx_traffic / speed;
+      }
+      if (rx_usage >= usage_warn_ || tx_usage >= usage_warn_) {
+        level = std::max(level, static_cast<int>(DiagStatus::WARN));
+      }
+    }
+
+    stat.add(fmt::format("Network {}: status", index), usage_dict_.at(level));
+    stat.add(fmt::format("Network {}: interface name", index), ifa->ifa_name);
+    stat.addf(fmt::format("Network {}: rx_usage", index), "%.2f%%", rx_usage * 1e+2);
+    stat.addf(fmt::format("Network {}: tx_usage", index), "%.2f%%", tx_usage * 1e+2);
+    stat.addf(fmt::format("Network {}: rx_traffic", index), "%.2f MBit/s", rx_traffic);
+    stat.addf(fmt::format("Network {}: tx_traffic", index), "%.2f MBit/s", tx_traffic);
+    stat.addf(fmt::format("Network {}: capacity", index), "%.1f MBit/s", speed);
+    stat.add(fmt::format("Network {}: mtu", index), ifrm.ifr_mtu);
+    stat.add(fmt::format("Network {}: rx_bytes", index), stats->rx_bytes);
+    stat.add(fmt::format("Network {}: rx_errors", index), stats->rx_errors);
+    stat.add(fmt::format("Network {}: tx_bytes", index), stats->tx_bytes);
+    stat.add(fmt::format("Network {}: tx_errors", index), stats->tx_errors);
+    stat.add(fmt::format("Network {}: collisions", index), stats->collisions);
+
+    close(fd);
+
+    // Keep the counters as the baseline for the next call
+    bytes_[ifa->ifa_name].rx_bytes = stats->rx_bytes;
+    bytes_[ifa->ifa_name].tx_bytes = stats->tx_bytes;
+    whole_level = std::max(whole_level, level);
+    ++index;
+
+    interface_names.push_back(ifa->ifa_name);
+  }
+
+  freeifaddrs(ifas);
+
+  // Check if specified device exists
+  for (const auto & device : device_params_) {
+    // Skip if all devices specified
+    if (device == "*") {
+      continue;
+    }
+    // Skip if device already appended
+    if (boost::find(interface_names, device) != interface_names.end()) {
+      continue;
+    }
+
+    stat.add(fmt::format("Network {}: status", index), "No Such Device");
+    stat.add(fmt::format("Network {}: interface name", index), device);
+    error_str = "no such device";
+    ++index;
+  }
+
+  if (!error_str.empty()) {
+    stat.summary(DiagStatus::ERROR, error_str);
+  } else {
+    stat.summary(whole_level, usage_dict_.at(whole_level));
+  }
+
+  last_update_time_ = this->now();
+
+  // Measure elapsed time since start time and report
+  SystemMonitorUtility::stopMeasurement(t_start, stat);
+}
+
+#include <rclcpp_components/register_node_macro.hpp>
+RCLCPP_COMPONENTS_REGISTER_NODE(NetMonitor)
diff --git a/system/system_monitor/src/net_monitor/nl80211.cpp b/system/system_monitor/src/net_monitor/nl80211.cpp
new file mode 100644
index 0000000000000..5779372f36079
--- /dev/null
+++ b/system/system_monitor/src/net_monitor/nl80211.cpp
@@ -0,0 +1,205 @@
+// Copyright 2020 Autoware Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+ * @file nl80211.cpp
+ * @brief 802.11 netlink-based interface class
+ */
+
+#include "system_monitor/net_monitor/nl80211.hpp"
+
+#include <linux/nl80211.h>
+#include <net/if.h>
+#include <netlink/genl/ctrl.h>
+#include <netlink/genl/genl.h>
+
+/**
+ * @brief Construct in the uninitialised state; init() must be called before getBitrate()
+ */
+NL80211::NL80211() : bitrate_(0.0), initialized_(false), socket_(nullptr), id_(-1), cb_(nullptr) {}
+
+// Attribute validation policy
+static struct nla_policy stats_policy[NL80211_STA_INFO_MAX + 1];
+static struct nla_policy rate_policy[NL80211_RATE_INFO_MAX + 1];
+
+/**
+ * @brief Netlink callback extracting the TX bitrate from a station info message
+ * @param [in] msg received netlink message
+ * @param [out] arg pointer to the float that receives the bitrate
+ * @return always NL_SKIP; the target is only written when the bitrate attribute is present
+ */
+static int callback(struct nl_msg * msg, void * arg)
+{
+  int ret;
+  auto * rate = reinterpret_cast<float *>(arg);
+
+  // Return actual netlink message.
+  struct nlmsghdr * nlh = nlmsg_hdr(msg);
+  // Return pointer to message payload.
+  auto * ghdr = static_cast<struct genlmsghdr *>(nlmsg_data(nlh));
+
+  struct nlattr * tb[NL80211_ATTR_MAX + 1];
+  struct nlattr * sinfo[NL80211_STA_INFO_MAX + 1];
+  struct nlattr * rinfo[NL80211_RATE_INFO_MAX + 1];
+
+  // Create attribute index based on a stream of attributes.
+  ret =
+    nla_parse(tb, NL80211_ATTR_MAX, genlmsg_attrdata(ghdr, 0), genlmsg_attrlen(ghdr, 0), nullptr);
+  // Returns 0 on success or a negative error code.
+  if (ret < 0) {
+    return NL_SKIP;
+  }
+
+  // Information about a station missing
+  if (!tb[NL80211_ATTR_STA_INFO]) {
+    return NL_SKIP;
+  }
+
+  // Create attribute index based on nested attribute.
+  ret = nla_parse_nested(sinfo, NL80211_STA_INFO_MAX, tb[NL80211_ATTR_STA_INFO], stats_policy);
+  // Returns 0 on success or a negative error code.
+  if (ret < 0) {
+    return NL_SKIP;
+  }
+
+  // current unicast tx rate missing
+  if (!sinfo[NL80211_STA_INFO_TX_BITRATE]) {
+    return NL_SKIP;
+  }
+
+  // Create attribute index based on nested attribute.
+  ret =
+    nla_parse_nested(rinfo, NL80211_RATE_INFO_MAX, sinfo[NL80211_STA_INFO_TX_BITRATE], rate_policy);
+  // Returns 0 on success or a negative error code.
+  if (ret < 0) {
+    return NL_SKIP;
+  }
+
+  // total bitrate exists
+  if (rinfo[NL80211_RATE_INFO_BITRATE]) {
+    // Return payload of 16 bit integer attribute; the result is this value divided by 10.
+    *rate = static_cast<float>(nla_get_u16(rinfo[NL80211_RATE_INFO_BITRATE])) / 10;
+  }
+
+  return NL_SKIP;
+}
+
+/**
+ * @brief Open the generic netlink socket, resolve the nl80211 family and install the callback
+ * @details On any failure everything acquired so far is released through shutdown() and the
+ *          object stays uninitialised, in which case getBitrate() returns 0.
+ */
+void NL80211::init()
+{
+  // Allocate new netlink socket.
+  socket_ = nl_socket_alloc();
+  // Returns newly allocated netlink socket or NULL.
+  if (!socket_) {
+    return;
+  }
+
+  // Connect a generic netlink socket.
+  // Returns 0 on success or a negative error code.
+  int ret = genl_connect(socket_);
+  if (ret < 0) {
+    shutdown();
+    return;
+  }
+
+  // Resolve generic netlink family name to its identifier.
+  id_ = genl_ctrl_resolve(socket_, "nl80211");
+  // Returns a positive identifier or a negative error code.
+  if (id_ < 0) {
+    shutdown();
+    return;
+  }
+
+  // Allocate a new callback handle.
+  cb_ = nl_cb_alloc(NL_CB_DEFAULT);
+  // Returns newly allocated callback handle or NULL.
+  if (!cb_) {
+    shutdown();
+    return;
+  }
+
+  // Set up a callback.
+  ret = nl_cb_set(cb_, NL_CB_VALID, NL_CB_CUSTOM, callback, reinterpret_cast<void *>(&bitrate_));
+  // Returns 0 on success or a negative error code.
+  if (ret < 0) {
+    shutdown();
+    return;
+  }
+
+  initialized_ = true;
+}
+
+/**
+ * @brief Query the current TX bitrate of a wireless interface
+ * @param [in] ifa_name interface name
+ * @return bitrate stored by the callback, or 0 when not initialised, the interface is
+ *         unknown, or any netlink step fails
+ */
+float NL80211::getBitrate(const char * ifa_name)
+{
+  bitrate_ = 0.0;
+
+  if (!initialized_) {
+    return bitrate_;
+  }
+
+  // Get index of the network interface
+  // Returns index number of the network interface on success
+  // or 0 on error and errno is set appropriately
+  const unsigned int index = if_nametoindex(ifa_name);
+  if (!index) {
+    return bitrate_;
+  }
+
+  // Allocate a new netlink message with the default maximum payload size.
+  // Returns newly allocated netlink message or NULL.
+  struct nl_msg * msg = nlmsg_alloc();
+  if (!msg) {
+    return bitrate_;
+  }
+
+  // Build, send and receive; the chain stops at the first failing step so that the message
+  // is released in exactly one place below.
+  //  - genlmsg_put: returns pointer to user header or NULL if an error occurred.
+  //  - nla_put_u32: returns 0 on success or a negative error code.
+  //  - nl_send_auto: returns number of bytes sent or a negative error code.
+  void * hdr =
+    genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, id_, 0, NLM_F_DUMP, NL80211_CMD_GET_STATION, 0);
+  if (
+    hdr && nla_put_u32(msg, NL80211_ATTR_IFINDEX, index) >= 0 &&
+    nl_send_auto(socket_, msg) >= 0) {
+    // Receive a set of messages from a netlink socket; the callback fills bitrate_.
+    // A receive error needs no extra handling: bitrate_ is returned as it stands.
+    nl_recvmsgs(socket_, cb_);
+  }
+
+  nlmsg_free(msg);
+  return bitrate_;
+}
+
+/**
+ * @brief Release the callback handle and the socket
+ * @details Safe to call repeatedly and after a failed init(): each handle is released only
+ *          when it is set and is cleared afterwards, so a later call cannot free it again.
+ */
+void NL80211::shutdown()
+{
+  if (cb_) {
+    nl_cb_put(cb_);
+    cb_ = nullptr;
+  }
+  if (socket_) {
+    nl_close(socket_);
+    nl_socket_free(socket_);
+    socket_ = nullptr;
+  }
+  initialized_ = false;
+}
diff --git a/system/system_monitor/src/ntp_monitor/ntp_monitor.cpp b/system/system_monitor/src/ntp_monitor/ntp_monitor.cpp
new file mode 100644
index 0000000000000..49f5ec57eda4c
--- /dev/null
+++ b/system/system_monitor/src/ntp_monitor/ntp_monitor.cpp
@@ -0,0 +1,139 @@
+// Copyright 2020 Autoware Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+ * @file ntp_monitor.cpp
+ * @brief NTP monitor class
+ */
+
+#include "system_monitor/ntp_monitor/ntp_monitor.hpp"
+
+#include "system_monitor/system_monitor_utility.hpp"
+
+#include <boost/filesystem.hpp>
+#include <boost/process.hpp>
+
+#include <fmt/format.h>
+
+#include <cmath>
+#include <map>
+#include <regex>
+#include <string>
+
+namespace bp = boost::process;
+namespace fs = boost::filesystem;
+
+/**
+ * @brief Read the offset thresholds, look up `chronyc` and register the offset check
+ * @param [in] options node options forwarded to rclcpp::Node
+ */
+NTPMonitor::NTPMonitor(const rclcpp::NodeOptions & options)
+: Node("ntp_monitor", options),
+  updater_(this),
+  offset_warn_(declare_parameter("offset_warn", 0.1)),
+  offset_error_(declare_parameter("offset_error", 5.0))
+{
+  gethostname(hostname_, sizeof(hostname_));
+
+  // Check if command exists
+  fs::path p = bp::search_path("chronyc");
+  chronyc_exists_ = !p.empty();
+
+  updater_.setHardwareID(hostname_);
+  updater_.add("NTP Offset", this, &NTPMonitor::checkOffset);
+}
+
+/**
+ * @brief Run all registered diagnostic checks immediately
+ */
+void NTPMonitor::update() { updater_.force_update(); }
+
+/**
+ * @brief Grade the system clock offset reported by `chronyc tracking`
+ * @param [out] stat diagnostic status to fill; every parsed tracking field is added to it
+ */
+void NTPMonitor::checkOffset(diagnostic_updater::DiagnosticStatusWrapper & stat)
+{
+  // Remember start time to measure elapsed time
+  const auto t_start = SystemMonitorUtility::startMeasurement();
+
+  if (!chronyc_exists_) {
+    stat.summary(DiagStatus::ERROR, "chronyc error");
+    stat.add(
+      "chronyc", "Command 'chronyc' not found, but can be installed with: sudo apt install chrony");
+    return;
+  }
+
+  float offset = 0.0f;
+  std::map<std::string, std::string> tracking_map;
+  const std::string error_str = executeChronyc(offset, tracking_map);
+  if (!error_str.empty()) {
+    stat.summary(DiagStatus::ERROR, "chronyc error");
+    stat.add("chronyc", error_str);
+    return;
+  }
+
+  int level = DiagStatus::OK;
+
+  // Only the magnitude matters: a clock that is fast is as wrong as one that is slow
+  const float abs_offset = std::abs(offset);
+  if (abs_offset >= offset_error_) {
+    level = DiagStatus::ERROR;
+  } else if (abs_offset >= offset_warn_) {
+    level = DiagStatus::WARN;
+  }
+  for (const auto & entry : tracking_map) {
+    stat.add(entry.first, entry.second);
+  }
+  stat.summary(level, offset_dict_.at(level));
+
+  // Measure elapsed time since start time and report
+  SystemMonitorUtility::stopMeasurement(t_start, stat);
+}
+
+/**
+ * @brief Run `chronyc tracking` and parse its "name : value" lines
+ * @param [out] out_offset offset in seconds taken from the "System time" line; negative when
+ *              chronyc reports "fast", positive when "slow"; left untouched if the line is
+ *              missing or cannot be parsed (a warning is logged in that case)
+ * @param [out] out_tracking_map every parsed field, keyed by its name
+ * @return empty string on success, otherwise the output of the failed command
+ */
+std::string NTPMonitor::executeChronyc(
+  float & out_offset, std::map<std::string, std::string> & out_tracking_map)
+{
+  std::string result;
+
+  // Tracking chrony status
+  bp::ipstream is_out;
+  bp::child c("chronyc tracking", bp::std_out > is_out);
+  c.wait();
+  if (c.exit_code() != 0) {
+    std::ostringstream os;
+    is_out >> os.rdbuf();
+    result = os.str();
+    return result;
+  }
+
+  std::string line;
+  std::cmatch match;
+  const std::regex filter("^(.+[A-Za-z()]) *: (.*)");
+  const std::regex filter_system_time("([0-9.]*) seconds (slow|fast).*");
+
+  while (std::getline(is_out, line) && !line.empty()) {
+    if (std::regex_match(line.c_str(), match, filter)) {
+      out_tracking_map[match[1].str()] = match[2].str();
+    }
+  }
+
+  // System time : conversion string to float
+  // Look the entry up without operator[] so that a missing field is not inserted as an
+  // empty value, which the caller would then publish.
+  const auto system_time = out_tracking_map.find("System time");
+  const std::string str_system_time =
+    (system_time != out_tracking_map.end()) ? system_time->second : std::string();
+  if (std::regex_match(str_system_time.c_str(), match, filter_system_time)) {
+    out_offset = std::atof(match[1].str().c_str());
+
+    if (match[2].str() == "fast") {
+      // "fast" is - value(match to ntpdate)
+      out_offset *= -1;
+    } else {
+      // "slow" is + value(match to ntpdate)
+    }
+  } else {
+    RCLCPP_WARN(get_logger(), "regex_match: illegal result. str = %s", str_system_time.c_str());
+  }
+  return result;
+}
+
+#include <rclcpp_components/register_node_macro.hpp>
+RCLCPP_COMPONENTS_REGISTER_NODE(NTPMonitor)
diff --git a/system/system_monitor/src/process_monitor/process_monitor.cpp b/system/system_monitor/src/process_monitor/process_monitor.cpp
new file mode 100644
index 0000000000000..661753196128e
--- /dev/null
+++ b/system/system_monitor/src/process_monitor/process_monitor.cpp
@@ -0,0 +1,314 @@
+// Copyright 2020 Autoware Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+ * @file process_monitor.cpp
+ * @brief Process monitor class
+ */
+
+#include "system_monitor/process_monitor/process_monitor.hpp"
+
+#include "system_monitor/system_monitor_utility.hpp"
+
+#include <fmt/format.h>
+
+#include <memory>
+#include <regex>
+#include <string>
+#include <vector>
+
+/**
+ * @brief Register the task summary check plus one diagnostic task per reported process
+ * @param [in] options node options forwarded to rclcpp::Node
+ * @details `num_of_procs` tasks are created for the high-load list and as many again for the
+ *          high-memory list.
+ */
+// NOTE(review): template arguments were lost in the extract this file was recovered from;
+// `DiagTask` is assumed to be the task type declared in the class header - confirm.
+ProcessMonitor::ProcessMonitor(const rclcpp::NodeOptions & options)
+: Node("process_monitor", options),
+  updater_(this),
+  num_of_procs_(declare_parameter("num_of_procs", 5))
+{
+  int index;
+
+  gethostname(hostname_, sizeof(hostname_));
+
+  updater_.setHardwareID(hostname_);
+  updater_.add("Tasks Summary", this, &ProcessMonitor::monitorProcesses);
+
+  for (index = 0; index < num_of_procs_; ++index) {
+    auto task = std::make_shared<DiagTask>(fmt::format("High-load Proc[{}]", index));
+    load_tasks_.push_back(task);
+    updater_.add(*task);
+  }
+  for (index = 0; index < num_of_procs_; ++index) {
+    auto task = std::make_shared<DiagTask>(fmt::format("High-mem Proc[{}]", index));
+    memory_tasks_.push_back(task);
+    updater_.add(*task);
+  }
+}
+
+/**
+ * @brief Run all registered diagnostic checks immediately
+ */
+void ProcessMonitor::update() { updater_.force_update(); }
+
+/**
+ * @brief Run `top` once and feed its output to the summary and the per-process tasks
+ * @param [out] stat diagnostic status receiving the task summary
+ */
+void ProcessMonitor::monitorProcesses(diagnostic_updater::DiagnosticStatusWrapper & stat)
+{
+  // Remember start time to measure elapsed time
+  const auto t_start = SystemMonitorUtility::startMeasurement();
+
+  bp::ipstream is_err;
+  bp::ipstream is_out;
+  std::ostringstream os;
+
+  // Get processes
+  bp::child c("top -bn1 -o %CPU -w 128", bp::std_out > is_out, bp::std_err > is_err);
+  c.wait();
+  if (c.exit_code() != 0) {
+    is_err >> os.rdbuf();
+    stat.summary(DiagStatus::ERROR, "top error");
+    stat.add("top", os.str().c_str());
+    setErrorContent(&load_tasks_, "top error", "top", os.str().c_str());
+    setErrorContent(&memory_tasks_, "top error", "top", os.str().c_str());
+    return;
+  }
+
+  is_out >> os.rdbuf();
+  std::string str = os.str();
+
+  // Get task summary
+  getTasksSummary(stat, str);
+  // Remove header
+  removeHeader(stat, str);
+
+  // Get high load processes
+  getHighLoadProcesses(str);
+
+  // Get high memory processes
+  getHighMemoryProcesses(str);
+
+  // Measure elapsed time since start time and report
+  SystemMonitorUtility::stopMeasurement(t_start, stat);
+}
+
+/**
+ * @brief Extract the "Tasks:" line of `top` and publish its counters
+ * @param [out] stat diagnostic status to fill
+ * @param [in] output complete output of `top`
+ */
+void ProcessMonitor::getTasksSummary(
+  diagnostic_updater::DiagnosticStatusWrapper & stat, const std::string & output)
+{
+  bp::pipe p;
+  std::string line;
+
+  // Echo output for grep
+  {
+    bp::ipstream is_out;
+    bp::ipstream is_err;
+    bp::child c(fmt::format("echo {}", output), bp::std_out > p, bp::std_err > is_err);
+    c.wait();
+    if (c.exit_code() != 0) {
+      std::ostringstream os;
+      is_err >> os.rdbuf();
+      stat.summary(DiagStatus::ERROR, "echo error");
+      stat.add("echo", os.str().c_str());
+      return;
+    }
+  }
+  // Find matching pattern of summary
+  {
+    bp::ipstream is_out;
+    bp::child c("grep Tasks:", bp::std_out > is_out, bp::std_in < p);
+    c.wait();
+    // no matching line
+    if (c.exit_code() != 0) {
+      stat.summary(DiagStatus::ERROR, "matching pattern not found");
+      stat.add("name", "Tasks:");
+      return;
+    }
+
+    std::getline(is_out, line);
+    std::cmatch match;
+    const std::regex filter(
+      "^Tasks: (\\d+) total,\\s+(\\d+) running,\\s+(\\d+) sleeping,\\s+(\\d+) stopped,\\s+(\\d+) "
+      "zombie");
+
+    if (std::regex_match(line.c_str(), match, filter)) {
+      stat.add("total", match[1].str());
+      stat.add("running", match[2].str());
+      stat.add("sleeping", match[3].str());
+      stat.add("stopped", match[4].str());
+      stat.add("zombie", match[5].str());
+      stat.summary(DiagStatus::OK, "OK");
+    } else {
+      stat.summary(DiagStatus::ERROR, "invalid format");
+    }
+  }
+}
+
+/**
+ * @brief Strip the header of `top` so that only the process rows remain
+ * @param [out] stat diagnostic status receiving an ERROR summary if a command fails
+ * @param [in,out] output output of `top`; overwritten with the process rows on success and
+ *                 left unchanged on failure
+ */
+void ProcessMonitor::removeHeader(
+  diagnostic_updater::DiagnosticStatusWrapper & stat, std::string & output)
+{
+  bp::pipe p1;
+  bp::pipe p2;
+  std::ostringstream os;
+
+  // Echo output for sed
+  {
+    bp::ipstream is_err;
+    bp::child c(fmt::format("echo {}", output), bp::std_out > p1, bp::std_err > is_err);
+    c.wait();
+    if (c.exit_code() != 0) {
+      is_err >> os.rdbuf();
+      stat.summary(DiagStatus::ERROR, "echo error");
+      stat.add("echo", os.str().c_str());
+      return;
+    }
+  }
+  // Remove %Cpu section
+  {
+    bp::ipstream is_err;
+    bp::child c("sed \"/^%Cpu/d\"", bp::std_out > p2, bp::std_err > is_err, bp::std_in < p1);
+    c.wait();
+    // no matching line
+    if (c.exit_code() != 0) {
+      stat.summary(DiagStatus::ERROR, "sed error");
+      stat.add("sed", "Failed to remove header");
+      return;
+    }
+  }
+  // Remove header
+  {
+    bp::ipstream is_out;
+    bp::ipstream is_err;
+    bp::child c("sed \"1,6d\"", bp::std_out > is_out, bp::std_err > is_err, bp::std_in < p2);
+    c.wait();
+    // no matching line
+    if (c.exit_code() != 0) {
+      stat.summary(DiagStatus::ERROR, "sed error");
+      stat.add("sed", "Failed to remove header");
+      return;
+    }
+    // overwrite
+    is_out >> os.rdbuf();
+    output = os.str();
+  }
+}
+
+/**
+ * @brief Fill the high-load tasks from the first process rows
+ * @param [in] output process rows in the order produced by `top -o %CPU`
+ */
+void ProcessMonitor::getHighLoadProcesses(const std::string & output)
+{
+  bp::pipe p;
+  std::ostringstream os;
+
+  // Echo output for sed
+  bp::ipstream is_out;
+  bp::ipstream is_err;
+  bp::child c(fmt::format("echo {}", output), bp::std_out > p, bp::std_err > is_err);
+  c.wait();
+  if (c.exit_code() != 0) {
+    is_err >> os.rdbuf();
+    setErrorContent(&load_tasks_, "echo error", "echo", os.str().c_str());
+    return;
+  }
+
+  // Get top-rated
+  getTopratedProcesses(&load_tasks_, &p);
+}
+
+/**
+ * @brief Fill the high-memory tasks from the process rows sorted by memory usage
+ * @param [in] output process rows of `top`
+ */
+void ProcessMonitor::getHighMemoryProcesses(const std::string & output)
+{
+  bp::pipe p1;
+  bp::pipe p2;
+  std::ostringstream os;
+
+  // Echo output for sed
+  {
+    bp::ipstream is_out;
+    bp::ipstream is_err;
+    bp::child c(fmt::format("echo {}", output), bp::std_out > p1, bp::std_err > is_err);
+    c.wait();
+    if (c.exit_code() != 0) {
+      is_err >> os.rdbuf();
+      setErrorContent(&memory_tasks_, "echo error", "echo", os.str().c_str());
+      return;
+    }
+  }
+  // Sort by memory usage
+  {
+    bp::ipstream is_out;
+    bp::ipstream is_err;
+    // -n: the 10th column is a number; a plain text sort would rank "9.3" above "10.5"
+    bp::child c("sort -r -n -k 10", bp::std_out > p2, bp::std_err > is_err, bp::std_in < p1);
+    c.wait();
+    if (c.exit_code() != 0) {
+      is_err >> os.rdbuf();
+      setErrorContent(&memory_tasks_, "sort error", "sort", os.str().c_str());
+      return;
+    }
+  }
+
+  // Get top-rated
+  getTopratedProcesses(&memory_tasks_, &p2);
+}
+
+/**
+ * @brief Take the first `num_of_procs` rows from a pipe and store their columns in the tasks
+ * @param [in,out] tasks tasks to fill, one per row; nothing is done when null
+ * @param [in] p pipe delivering the process rows; nothing is done when null
+ * @details A row with fewer than 12 columns is reported on its task as "invalid format"
+ *          instead of being indexed past its end.
+ */
+void ProcessMonitor::getTopratedProcesses(
+  std::vector<std::shared_ptr<DiagTask>> * tasks, bp::pipe * p)
+{
+  if (tasks == nullptr || p == nullptr) {
+    return;
+  }
+
+  bp::ipstream is_out;
+  bp::ipstream is_err;
+  std::ostringstream os;
+
+  bp::child c(
+    fmt::format("sed -n \"1,{} p\"", num_of_procs_), bp::std_out > is_out, bp::std_err > is_err,
+    bp::std_in < *p);
+
+  c.wait();
+  // Failed to modify line
+  if (c.exit_code() != 0) {
+    is_err >> os.rdbuf();
+    setErrorContent(tasks, "sed error", "sed", os.str().c_str());
+    return;
+  }
+
+  // Columns expected in one row: PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
+  constexpr std::size_t num_of_columns = 12;
+
+  std::vector<std::string> list;
+  std::string line;
+  std::size_t index = 0;
+
+  // Never go past the tasks that exist, whatever number of rows arrives
+  while (index < tasks->size() && std::getline(is_out, line) && !line.empty()) {
+    boost::trim_left(line);
+    boost::split(list, line, boost::is_space(), boost::token_compress_on);
+
+    const auto & task = tasks->at(index);
+    ++index;
+
+    if (list.size() < num_of_columns) {
+      task->setDiagnosticsStatus(DiagStatus::ERROR, "invalid format");
+      task->setErrorContent("top", line);
+      continue;
+    }
+
+    task->setDiagnosticsStatus(DiagStatus::OK, "OK");
+    task->setProcessId(list[0]);
+    task->setUserName(list[1]);
+    task->setPriority(list[2]);
+    task->setNiceValue(list[3]);
+    task->setVirtualImage(list[4]);
+    task->setResidentSize(list[5]);
+    task->setSharedMemSize(list[6]);
+    task->setProcessStatus(list[7]);
+    task->setCPUUsage(list[8]);
+    task->setMemoryUsage(list[9]);
+    task->setCPUTime(list[10]);
+    task->setCommandName(list[11]);
+  }
+}
+
+/**
+ * @brief Put every task of a list into the ERROR state with the same explanation
+ * @param [in,out] tasks tasks to update; nothing is done when null
+ * @param [in] message summary message of the error
+ * @param [in] error_command name of the command that failed
+ * @param [in] content error output of that command
+ */
+void ProcessMonitor::setErrorContent(
+  std::vector<std::shared_ptr<DiagTask>> * tasks, const std::string & message,
+  const std::string & error_command, const std::string & content)
+{
+  if (tasks == nullptr) {
+    return;
+  }
+
+  for (const auto & task : *tasks) {
+    task->setDiagnosticsStatus(DiagStatus::ERROR, message);
+    task->setErrorContent(error_command, content);
+  }
+}
+
+#include <rclcpp_components/register_node_macro.hpp>
+RCLCPP_COMPONENTS_REGISTER_NODE(ProcessMonitor)
diff --git a/system/system_monitor/test/config/test_hdd_monitor.param.yaml b/system/system_monitor/test/config/test_hdd_monitor.param.yaml
new file mode 100644
index 0000000000000..491cd6cfe6353
--- /dev/null
+++ b/system/system_monitor/test/config/test_hdd_monitor.param.yaml
@@ -0,0 +1,9 @@
+/**:
+  ros__parameters:
+    hdd_reader_port: 7635
+    disks:
+      - name: /dev/sda
+        temp_warn: 55.0
+        temp_error: 70.0
+        usage_warn: 0.95
+        usage_error: 0.99
diff --git a/system/system_monitor/test/config/test_net_monitor.param.yaml b/system/system_monitor/test/config/test_net_monitor.param.yaml
new file mode 100644
index 0000000000000..aacf8a3bdcf7f
--- /dev/null
+++ b/system/system_monitor/test/config/test_net_monitor.param.yaml
@@ -0,0 +1,5 @@
+
+/**:
+  ros__parameters:
+    devices: [ wlp82s0 ]
+    usage_warn: 0.95
diff --git a/system/system_monitor/test/config/test_ntp_monitor.param.yaml b/system/system_monitor/test/config/test_ntp_monitor.param.yaml
new file mode 100644
index 0000000000000..bf659a0b6c2cb
--- /dev/null
+++ b/system/system_monitor/test/config/test_ntp_monitor.param.yaml
@@ -0,0 +1,5 @@
+/**:
+  ros__parameters:
+    server: ntp.ubuntu.com
+    offset_warn: 0.1
+    offset_error: 5.0
diff --git a/system/system_monitor/test/src/cpu_monitor/mpstat1.cpp b/system/system_monitor/test/src/cpu_monitor/mpstat1.cpp
new file mode 100644
index 0000000000000..0b44abd00f720
--- /dev/null
+++ b/system/system_monitor/test/src/cpu_monitor/mpstat1.cpp
@@ -0,0 +1,20 @@
+// Copyright 2020 Autoware Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+ * @file mpstat1.cpp
+ * @brief dummy mpstat executable to return error
+ */
+
+int main() { return -1; }
diff --git a/system/system_monitor/test/src/cpu_monitor/mpstat2.cpp b/system/system_monitor/test/src/cpu_monitor/mpstat2.cpp
new file mode 100644
index 0000000000000..35466a986817a
--- /dev/null
+++ b/system/system_monitor/test/src/cpu_monitor/mpstat2.cpp
@@ -0,0 +1,20 @@
+// Copyright 2020 Autoware Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+ * @file mpstat2.cpp
+ * @brief dummy mpstat executable to provide nothing
+ */
+
+int main() { return 0; }
diff --git a/system/system_monitor/test/src/cpu_monitor/test_arm_cpu_monitor.cpp b/system/system_monitor/test/src/cpu_monitor/test_arm_cpu_monitor.cpp
new file mode 100644
index 0000000000000..056f8a46193df
--- /dev/null
+++ b/system/system_monitor/test/src/cpu_monitor/test_arm_cpu_monitor.cpp
@@ -0,0 +1,695 @@
+// Copyright 2020 Tier IV, Inc.
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "system_monitor/cpu_monitor/arm_cpu_monitor.hpp" + +#include + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +static constexpr const char * TEST_FILE = "test"; + +namespace fs = boost::filesystem; +using DiagStatus = diagnostic_msgs::msg::DiagnosticStatus; + +char ** argv_; + +class TestCPUMonitor : public CPUMonitor +{ + friend class CPUMonitorTestSuite; + +public: + TestCPUMonitor(const std::string & node_name, const rclcpp::NodeOptions & options) + : CPUMonitor(node_name, options) + { + } + + void diagCallback(const diagnostic_msgs::msg::DiagnosticArray::ConstSharedPtr diag_msg) + { + array_ = *diag_msg; + } + + void addTempName(const std::string & path) { temps_.emplace_back(path, path); } + void clearTempNames() { temps_.clear(); } + + void addFreqName(int index, const std::string & path) { freqs_.emplace_back(index, path); } + void clearFreqNames() { freqs_.clear(); } + + void setMpstatExists(bool mpstat_exists) { mpstat_exists_ = mpstat_exists; } + + void changeUsageWarn(float usage_warn) { usage_warn_ = usage_warn; } + void changeUsageError(float usage_error) { usage_error_ = usage_error; } + + void changeLoad1Warn(float load1_warn) { load1_warn_ = load1_warn; } + void changeLoad5Warn(float load5_warn) { load5_warn_ = load5_warn; } + + void update() { updater_.force_update(); } + + const std::string removePrefix(const std::string & name) + { 
+ return boost::algorithm::erase_all_copy(name, prefix_); + } + + bool findDiagStatus(const std::string & name, DiagStatus & status) // NOLINT + { + for (int i = 0; i < array_.status.size(); ++i) { + if (removePrefix(array_.status[i].name) == name) { + status = array_.status[i]; + return true; + } + } + return false; + } + +private: + diagnostic_msgs::msg::DiagnosticArray array_; + const std::string prefix_ = std::string(this->get_name()) + ": "; +}; + +class CPUMonitorTestSuite : public ::testing::Test +{ +public: + CPUMonitorTestSuite() + { + // Get directory of executable + const fs::path exe_path(argv_[0]); + exe_dir_ = exe_path.parent_path().generic_string(); + // Get dummy executable path + mpstat_ = exe_dir_ + "/mpstat"; + } + +protected: + std::unique_ptr monitor_; + rclcpp::Subscription::SharedPtr sub_; + std::string exe_dir_; + std::string mpstat_; + + void SetUp() + { + using std::placeholders::_1; + rclcpp::init(0, nullptr); + rclcpp::NodeOptions node_options; + monitor_ = std::make_unique("test_cpu_monitor", node_options); + sub_ = monitor_->create_subscription( + "/diagnostics", 1000, std::bind(&TestCPUMonitor::diagCallback, monitor_.get(), _1)); + monitor_->getTempNames(); + monitor_->getFreqNames(); + + // Remove test file if exists + if (fs::exists(TEST_FILE)) { + fs::remove(TEST_FILE); + } + // Remove dummy executable if exists + if (fs::exists(mpstat_)) { + fs::remove(mpstat_); + } + } + + void TearDown() + { + // Remove test file if exists + if (fs::exists(TEST_FILE)) { + fs::remove(TEST_FILE); + } + // Remove dummy executable if exists + if (fs::exists(mpstat_)) { + fs::remove(mpstat_); + } + rclcpp::shutdown(); + } + + bool findValue(const DiagStatus status, const std::string & key, std::string & value) // NOLINT + { + for (auto itr = status.values.begin(); itr != status.values.end(); ++itr) { + if (itr->key == key) { + value = itr->value; + return true; + } + } + return false; + } + + void modifyPath() + { + // Modify PATH temporarily + auto 
env = boost::this_process::environment(); + std::string new_path = env["PATH"].to_string(); + new_path.insert(0, fmt::format("{}:", exe_dir_)); + env["PATH"] = new_path; + } +}; + +TEST_F(CPUMonitorTestSuite, tempWarnTest) +{ + // Verify normal behavior + { + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } + + // Add test file to list + monitor_->addTempName(TEST_FILE); + + // Verify warning + { + // Write warning level + std::ofstream ofs(TEST_FILE); + ofs << 90000 << std::endl; + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::WARN); + } + + // Verify normal behavior + { + // Write normal level + std::ofstream ofs(TEST_FILE); + ofs << 89900 << std::endl; + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } +} + +TEST_F(CPUMonitorTestSuite, tempErrorTest) +{ + // Verify normal behavior + { + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } + + // Add test file to list + monitor_->addTempName(TEST_FILE); + + // Verify error + { + // Write error level + 
std::ofstream ofs(TEST_FILE); + ofs << 95000 << std::endl; + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + } + + // Verify normal behavior + { + // Write normal level + std::ofstream ofs(TEST_FILE); + ofs << 89900 << std::endl; + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } +} + +TEST_F(CPUMonitorTestSuite, tempTemperatureFilesNotFoundTest) +{ + // Clear list + monitor_->clearTempNames(); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "temperature files not found"); +} + +TEST_F(CPUMonitorTestSuite, tempFileOpenErrorTest) +{ + // Add test file to list + monitor_->addTempName(TEST_FILE); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "file open error"); + ASSERT_TRUE(findValue(status, "file open error", value)); + ASSERT_STREQ(value.c_str(), TEST_FILE); +} + +TEST_F(CPUMonitorTestSuite, usageWarnTest) +{ + // Verify normal behavior + { + 
// Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Usage", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } + + // Verify warning + { + // Change warning level + monitor_->changeUsageWarn(0.0); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Usage", status)); + ASSERT_EQ(status.level, DiagStatus::WARN); + } + + // Verify normal behavior + { + // Change back to normal + monitor_->changeUsageWarn(0.90); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Usage", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } +} + +TEST_F(CPUMonitorTestSuite, usageErrorTest) +{ + // Verify normal behavior + { + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Usage", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } + + // Verify warning + { + // Change warning level + monitor_->changeUsageError(0.0); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Usage", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + } + + // Verify normal behavior + { + // Change back to normal + 
monitor_->changeUsageError(1.00); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Usage", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } +} + +TEST_F(CPUMonitorTestSuite, usageMpstatNotFoundTest) +{ + // Set flag false + monitor_->setMpstatExists(false); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Usage", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "mpstat error"); + ASSERT_TRUE(findValue(status, "mpstat", value)); + ASSERT_STREQ( + value.c_str(), + "Command 'mpstat' not found, but can be installed with: sudo apt install sysstat"); +} + +TEST_F(CPUMonitorTestSuite, load1WarnTest) +{ + // Verify normal behavior + { + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Load Average", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } + + // Verify warning + { + // Change warning level + monitor_->changeLoad1Warn(0.0); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Load Average", status)); + ASSERT_EQ(status.level, DiagStatus::WARN); + } + + // Verify normal behavior + { + // Change back to normal + monitor_->changeLoad1Warn(0.90); + + // Publish topic + monitor_->update(); + + // Give time to publish + 
rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Load Average", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } +} + +TEST_F(CPUMonitorTestSuite, load5WarnTest) +{ + // Verify normal behavior + { + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Load Average", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } + + // Verify warning + { + // Change warning level + monitor_->changeLoad5Warn(0.0); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Load Average", status)); + ASSERT_EQ(status.level, DiagStatus::WARN); + } + + // Verify normal behavior + { + // Change back to normal + monitor_->changeLoad5Warn(0.80); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Load Average", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } +} + +TEST_F(CPUMonitorTestSuite, DISABLED_throttlingTest) +{ + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Thermal Throttling", status)); + ASSERT_EQ(status.level, DiagStatus::OK); +} + +TEST_F(CPUMonitorTestSuite, freqTest) +{ + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + 
rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Frequency", status)); + ASSERT_EQ(status.level, DiagStatus::OK); +} + +TEST_F(CPUMonitorTestSuite, freqFrequencyFilesNotFoundTest) +{ + // Clear list + monitor_->clearFreqNames(); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Frequency", status)); + + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "frequency files not found"); +} + +TEST_F(CPUMonitorTestSuite, usageMpstatErrorTest) +{ + // Symlink mpstat1 to mpstat + fs::create_symlink(exe_dir_ + "/mpstat1", mpstat_); + + // Modify PATH temporarily + modifyPath(); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + + ASSERT_TRUE(monitor_->findDiagStatus("CPU Usage", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "mpstat error"); + ASSERT_TRUE(findValue(status, "mpstat", value)); +} + +TEST_F(CPUMonitorTestSuite, usageMpstatExceptionTest) +{ + // Symlink mpstat2 to mpstat + fs::create_symlink(exe_dir_ + "/mpstat2", mpstat_); + + // Modify PATH temporarily + modifyPath(); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + + ASSERT_TRUE(monitor_->findDiagStatus("CPU Usage", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "mpstat exception"); +} + +// for coverage +class DummyCPUMonitor : public CPUMonitorBase +{ + friend class 
CPUMonitorTestSuite; + +public: + DummyCPUMonitor(const std::string & node_name, const rclcpp::NodeOptions & options) + : CPUMonitorBase(node_name, options) + { + } + void update() { updater_.force_update(); } +}; + +TEST_F(CPUMonitorTestSuite, dummyCPUMonitorTest) +{ + rclcpp::NodeOptions options; + std::unique_ptr monitor = + std::make_unique("dummy_cpu_monitor", options); + monitor->getTempNames(); + monitor->getFreqNames(); + // Publish topic + monitor->update(); +} + +int main(int argc, char ** argv) +{ + argv_ = argv; + testing::InitGoogleTest(&argc, argv); + + return RUN_ALL_TESTS(); +} diff --git a/system/system_monitor/test/src/cpu_monitor/test_intel_cpu_monitor.cpp b/system/system_monitor/test/src/cpu_monitor/test_intel_cpu_monitor.cpp new file mode 100644 index 0000000000000..c6ab5de8771a4 --- /dev/null +++ b/system/system_monitor/test/src/cpu_monitor/test_intel_cpu_monitor.cpp @@ -0,0 +1,917 @@ +// Copyright 2020 Tier IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "system_monitor/cpu_monitor/intel_cpu_monitor.hpp" + +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +static constexpr const char * TEST_FILE = "test"; +static constexpr const char * DOCKER_ENV = "/.dockerenv"; + +namespace fs = boost::filesystem; +using DiagStatus = diagnostic_msgs::msg::DiagnosticStatus; + +char ** argv_; + +class TestCPUMonitor : public CPUMonitor +{ + friend class CPUMonitorTestSuite; + +public: + TestCPUMonitor(const std::string & node_name, const rclcpp::NodeOptions & options) + : CPUMonitor(node_name, options) + { + } + + void diagCallback(const diagnostic_msgs::msg::DiagnosticArray::ConstSharedPtr diag_msg) + { + array_ = *diag_msg; + } + + void addTempName(const std::string & path) { temps_.emplace_back(path, path); } + void clearTempNames() { temps_.clear(); } + bool isTempNamesEmpty() { return temps_.empty(); } + + void addFreqName(int index, const std::string & path) { freqs_.emplace_back(index, path); } + void clearFreqNames() { freqs_.clear(); } + + void setMpstatExists(bool mpstat_exists) { mpstat_exists_ = mpstat_exists; } + + void changeUsageWarn(float usage_warn) { usage_warn_ = usage_warn; } + void changeUsageError(float usage_error) { usage_error_ = usage_error; } + + void update() { updater_.force_update(); } + + const std::string removePrefix(const std::string & name) + { + return boost::algorithm::erase_all_copy(name, prefix_); + } + + bool findDiagStatus(const std::string & name, DiagStatus & status) // NOLINT + { + for (int i = 0; i < array_.status.size(); ++i) { + if (removePrefix(array_.status[i].name) == name) { + status = array_.status[i]; + return true; + } + } + return false; + } + +private: + diagnostic_msgs::msg::DiagnosticArray array_; + const std::string prefix_ = std::string(this->get_name()) + ": "; +}; + +class CPUMonitorTestSuite : public ::testing::Test +{ +public: + CPUMonitorTestSuite() + { + // Get 
directory of executable + const fs::path exe_path(argv_[0]); + exe_dir_ = exe_path.parent_path().generic_string(); + // Get dummy executable path + mpstat_ = exe_dir_ + "/mpstat"; + } + +protected: + std::unique_ptr monitor_; + rclcpp::Subscription::SharedPtr sub_; + std::string exe_dir_; + std::string mpstat_; + + void SetUp() + { + using std::placeholders::_1; + rclcpp::init(0, nullptr); + rclcpp::NodeOptions node_options; + monitor_ = std::make_unique("test_cpu_monitor", node_options); + sub_ = monitor_->create_subscription( + "/diagnostics", 1000, std::bind(&TestCPUMonitor::diagCallback, monitor_.get(), _1)); + monitor_->getTempNames(); + monitor_->getFreqNames(); + + // Remove test file if exists + if (fs::exists(TEST_FILE)) { + fs::remove(TEST_FILE); + } + // Remove dummy executable if exists + if (fs::exists(mpstat_)) { + fs::remove(mpstat_); + } + } + + void TearDown() + { + // Remove test file if exists + if (fs::exists(TEST_FILE)) { + fs::remove(TEST_FILE); + } + // Remove dummy executable if exists + if (fs::exists(mpstat_)) { + fs::remove(mpstat_); + } + rclcpp::shutdown(); + } + + bool findValue(const DiagStatus status, const std::string & key, std::string & value) // NOLINT + { + for (auto itr = status.values.begin(); itr != status.values.end(); ++itr) { + if (itr->key == key) { + value = itr->value; + return true; + } + } + return false; + } + + void modifyPath() + { + // Modify PATH temporarily + auto env = boost::this_process::environment(); + std::string new_path = env["PATH"].to_string(); + new_path.insert(0, fmt::format("{}:", exe_dir_)); + env["PATH"] = new_path; + } +}; + +enum ThreadTestMode { + Normal = 0, + Throttling, + ReturnsError, + RecvTimeout, + RecvNoData, + FormatError, +}; + +bool stop_thread; +pthread_mutex_t mutex; + +void * msr_reader(void * args) +{ + ThreadTestMode * mode = reinterpret_cast(args); + + // Create a new socket + int sock = socket(AF_INET, SOCK_STREAM, 0); + if (sock < 0) { + return nullptr; + } + + // Allow 
address reuse + int ret = 0; + int opt = 1; + ret = setsockopt( + sock, SOL_SOCKET, SO_REUSEADDR, reinterpret_cast(&opt), (socklen_t)sizeof(opt)); + if (ret < 0) { + close(sock); + return nullptr; + } + + // Give the socket FD the local address ADDR + sockaddr_in addr; + memset(&addr, 0, sizeof(sockaddr_in)); + addr.sin_family = AF_INET; + addr.sin_port = htons(7634); + addr.sin_addr.s_addr = htonl(INADDR_ANY); + ret = bind(sock, (struct sockaddr *)&addr, sizeof(addr)); + if (ret < 0) { + close(sock); + return nullptr; + } + + // Prepare to accept connections on socket FD + ret = listen(sock, 5); + if (ret < 0) { + close(sock); + return nullptr; + } + + sockaddr_in client; + socklen_t len = sizeof(client); + + // Await a connection on socket FD + int new_sock = accept(sock, reinterpret_cast(&client), &len); + if (new_sock < 0) { + close(sock); + return nullptr; + } + + ret = 0; + std::ostringstream oss; + boost::archive::text_oarchive oa(oss); + MSRInfo msr = {0}; + + switch (*mode) { + case Normal: + msr.error_code_ = 0; + msr.pkg_thermal_status_.push_back(false); + oa << msr; + ret = write(new_sock, oss.str().c_str(), oss.str().length()); + break; + + case Throttling: + msr.error_code_ = 0; + msr.pkg_thermal_status_.push_back(true); + oa << msr; + ret = write(new_sock, oss.str().c_str(), oss.str().length()); + break; + + case ReturnsError: + msr.error_code_ = EACCES; + oa << msr; + ret = write(new_sock, oss.str().c_str(), oss.str().length()); + break; + + case RecvTimeout: + // Wait for recv timeout + while (true) { + pthread_mutex_lock(&mutex); + if (stop_thread) { + break; + } + pthread_mutex_unlock(&mutex); + sleep(1); + } + break; + + case RecvNoData: + // Send nothing, close socket immediately + break; + + case FormatError: + // Send wrong data + oa << "test"; + ret = write(new_sock, oss.str().c_str(), oss.str().length()); + break; + + default: + break; + } + + // Close the file descriptor FD + close(new_sock); + close(sock); + + return nullptr; +} + 
+TEST_F(CPUMonitorTestSuite, tempWarnTest) +{ + // Skip test if process runs inside CI environment + if (monitor_->isTempNamesEmpty() && fs::exists(DOCKER_ENV)) { + return; + } + + // Verify normal behavior + { + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } + + // Add test file to list + monitor_->addTempName(TEST_FILE); + + // Verify warning + { + // Write warning level + std::ofstream ofs(TEST_FILE); + ofs << 90000 << std::endl; + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::WARN); + } + + // Verify normal behavior + { + // Write normal level + std::ofstream ofs(TEST_FILE); + ofs << 89900 << std::endl; + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } +} + +TEST_F(CPUMonitorTestSuite, tempErrorTest) +{ + // Skip test if process runs inside CI environment + if (monitor_->isTempNamesEmpty() && fs::exists(DOCKER_ENV)) { + return; + } + + // Verify normal behavior + { + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } + + // Add test file to list + 
monitor_->addTempName(TEST_FILE); + + // Verify error + { + // Write error level + std::ofstream ofs(TEST_FILE); + ofs << 95000 << std::endl; + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + } + + // Verify normal behavior + { + // Write normal level + std::ofstream ofs(TEST_FILE); + ofs << 89900 << std::endl; + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } +} + +TEST_F(CPUMonitorTestSuite, tempTemperatureFilesNotFoundTest) +{ + // Clear list + monitor_->clearTempNames(); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "temperature files not found"); +} + +TEST_F(CPUMonitorTestSuite, tempFileOpenErrorTest) +{ + // Add test file to list + monitor_->addTempName(TEST_FILE); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "file open error"); + ASSERT_TRUE(findValue(status, "file open error", value)); + ASSERT_STREQ(value.c_str(), TEST_FILE); +} + 
+TEST_F(CPUMonitorTestSuite, usageWarnTest)
+{
+  // Verify normal behavior
+  {
+    // Publish topic
+    monitor_->update();
+
+    // Give time to publish
+    rclcpp::WallRate(2).sleep();
+    rclcpp::spin_some(monitor_->get_node_base_interface());
+
+    // Verify
+    DiagStatus status;
+    std::string value;
+    ASSERT_TRUE(monitor_->findDiagStatus("CPU Usage", status));
+    ASSERT_EQ(status.level, DiagStatus::OK);
+  }
+
+  // Verify warning
+  {
+    // Lower the warning threshold to 0.0 and expect WARN
+    monitor_->changeUsageWarn(0.0);
+
+    // Publish topic
+    monitor_->update();
+
+    // Give time to publish
+    rclcpp::WallRate(2).sleep();
+    rclcpp::spin_some(monitor_->get_node_base_interface());
+
+    // Verify
+    DiagStatus status;
+    ASSERT_TRUE(monitor_->findDiagStatus("CPU Usage", status));
+    ASSERT_EQ(status.level, DiagStatus::WARN);
+  }
+
+  // Verify normal behavior
+  {
+    // Change back to normal
+    monitor_->changeUsageWarn(0.90);
+
+    // Publish topic
+    monitor_->update();
+
+    // Give time to publish
+    rclcpp::WallRate(2).sleep();
+    rclcpp::spin_some(monitor_->get_node_base_interface());
+
+    // Verify
+    DiagStatus status;
+    ASSERT_TRUE(monitor_->findDiagStatus("CPU Usage", status));
+    ASSERT_EQ(status.level, DiagStatus::OK);
+  }
+}
+
+TEST_F(CPUMonitorTestSuite, usageErrorTest)
+{
+  // Verify normal behavior
+  {
+    // Publish topic
+    monitor_->update();
+
+    // Give time to publish
+    rclcpp::WallRate(2).sleep();
+    rclcpp::spin_some(monitor_->get_node_base_interface());
+
+    // Verify
+    DiagStatus status;
+    std::string value;
+    ASSERT_TRUE(monitor_->findDiagStatus("CPU Usage", status));
+    ASSERT_EQ(status.level, DiagStatus::OK);
+  }
+
+  // Verify error
+  {
+    // Lower the error threshold to 0.0 and expect ERROR
+    monitor_->changeUsageError(0.0);
+
+    // Publish topic
+    monitor_->update();
+
+    // Give time to publish
+    rclcpp::WallRate(2).sleep();
+    rclcpp::spin_some(monitor_->get_node_base_interface());
+
+    // Verify
+    DiagStatus status;
+    ASSERT_TRUE(monitor_->findDiagStatus("CPU Usage", status));
+    ASSERT_EQ(status.level, DiagStatus::ERROR);
+  }
+
+  // Verify normal behavior
+  {
+    // Change back to normal
+    monitor_->changeUsageError(1.00);
+
+    // Publish topic
+    monitor_->update();
+
+    // Give time to publish
+    rclcpp::WallRate(2).sleep();
+    rclcpp::spin_some(monitor_->get_node_base_interface());
+
+    // Verify
+    DiagStatus status;
+    ASSERT_TRUE(monitor_->findDiagStatus("CPU Usage", status));
+    ASSERT_EQ(status.level, DiagStatus::OK);
+  }
+}
+
+TEST_F(CPUMonitorTestSuite, usageMpstatNotFoundTest)
+{
+  // Pretend mpstat is not installed
+  monitor_->setMpstatExists(false);
+
+  // Publish topic
+  monitor_->update();
+
+  // Give time to publish
+  rclcpp::WallRate(2).sleep();
+  rclcpp::spin_some(monitor_->get_node_base_interface());
+
+  // Verify
+  DiagStatus status;
+  std::string value;
+  ASSERT_TRUE(monitor_->findDiagStatus("CPU Usage", status));
+  ASSERT_EQ(status.level, DiagStatus::ERROR);
+  ASSERT_STREQ(status.message.c_str(), "mpstat error");
+  ASSERT_TRUE(findValue(status, "mpstat", value));
+  ASSERT_STREQ(
+    value.c_str(),
+    "Command 'mpstat' not found, but can be installed with: sudo apt install sysstat");
+}
+
+TEST_F(CPUMonitorTestSuite, load1WarnTest)
+{
+  // Verify normal behavior
+  {
+    // Publish topic
+    monitor_->update();
+
+    // Give time to publish
+    rclcpp::WallRate(2).sleep();
+    rclcpp::spin_some(monitor_->get_node_base_interface());
+
+    // Verify
+    DiagStatus status;
+    ASSERT_TRUE(monitor_->findDiagStatus("CPU Load Average", status));
+    // The result depends on how loaded the test machine is; only the OK level is asserted.
+    ASSERT_EQ(status.level, DiagStatus::OK);
+  }
+}
+
+TEST_F(CPUMonitorTestSuite, load5WarnTest)
+{
+  // Verify normal behavior
+  {
+    // Publish topic
+    monitor_->update();
+
+    // Give time to publish
+    rclcpp::WallRate(2).sleep();
+    rclcpp::spin_some(monitor_->get_node_base_interface());
+
+    // Verify
+    DiagStatus status;
+    std::string value;
+    ASSERT_TRUE(monitor_->findDiagStatus("CPU Load Average", status));
+    // The result depends on how loaded the test machine is; only the OK level is asserted.
+    ASSERT_EQ(status.level, DiagStatus::OK);
+  }
+}
+
+TEST_F(CPUMonitorTestSuite, throttlingTest)
+{
+  pthread_t th;
+  ThreadTestMode mode = Normal;
+  ASSERT_EQ(pthread_create(&th, nullptr, msr_reader, &mode), 0);  // th is only valid on success
+  // Wait for thread started
+  rclcpp::WallRate(10).sleep();
+
+  // Publish topic
+  monitor_->update();
+
+  pthread_join(th, nullptr);
+
+  // Give time to publish
+  rclcpp::WallRate(2).sleep();
+  rclcpp::spin_some(monitor_->get_node_base_interface());
+
+  // Verify
+  DiagStatus status;
+  ASSERT_TRUE(monitor_->findDiagStatus("CPU Thermal Throttling", status));
+  ASSERT_EQ(status.level, DiagStatus::OK);
+}
+
+TEST_F(CPUMonitorTestSuite, throttlingThrottlingTest)
+{
+  pthread_t th;
+  ThreadTestMode mode = Throttling;
+  ASSERT_EQ(pthread_create(&th, nullptr, msr_reader, &mode), 0);  // th is only valid on success
+  // Wait for thread started
+  rclcpp::WallRate(10).sleep();
+
+  // Publish topic
+  monitor_->update();
+
+  pthread_join(th, nullptr);
+
+  // Give time to publish
+  rclcpp::WallRate(2).sleep();
+  rclcpp::spin_some(monitor_->get_node_base_interface());
+
+  // Verify
+  DiagStatus status;
+  ASSERT_TRUE(monitor_->findDiagStatus("CPU Thermal Throttling", status));
+  ASSERT_EQ(status.level, DiagStatus::ERROR);
+  ASSERT_STREQ(status.message.c_str(), "throttling");
+}
+
+TEST_F(CPUMonitorTestSuite, throttlingReturnsErrorTest)
+{
+  pthread_t th;
+  ThreadTestMode mode = ReturnsError;
+  ASSERT_EQ(pthread_create(&th, nullptr, msr_reader, &mode), 0);  // th is only valid on success
+  // Wait for thread started
+  rclcpp::WallRate(10).sleep();
+
+  // Publish topic
+  monitor_->update();
+
+  pthread_join(th, nullptr);
+
+  // Give time to publish
+  rclcpp::WallRate(2).sleep();
+  rclcpp::spin_some(monitor_->get_node_base_interface());
+
+  // Verify
+  DiagStatus status;
+  std::string value;
+  ASSERT_TRUE(monitor_->findDiagStatus("CPU Thermal Throttling", status));
+  ASSERT_EQ(status.level, DiagStatus::ERROR);
+  ASSERT_STREQ(status.message.c_str(), "msr_reader error");
+  ASSERT_TRUE(findValue(status, "msr_reader", value));
+  ASSERT_STREQ(value.c_str(), strerror(EACCES));
+}
+
+TEST_F(CPUMonitorTestSuite, throttlingRecvTimeoutTest) +{ + pthread_t th; + ThreadTestMode mode = RecvTimeout; + pthread_create(&th, nullptr, msr_reader, &mode); + // Wait for thread started + rclcpp::WallRate(10).sleep(); + + // Publish topic + monitor_->update(); + + // Recv timeout occurs, thread is no longer needed + pthread_mutex_lock(&mutex); + stop_thread = true; + pthread_mutex_unlock(&mutex); + pthread_join(th, NULL); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Thermal Throttling", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "recv error"); + ASSERT_TRUE(findValue(status, "recv", value)); + ASSERT_STREQ(value.c_str(), strerror(EWOULDBLOCK)); +} + +TEST_F(CPUMonitorTestSuite, throttlingRecvNoDataTest) +{ + pthread_t th; + ThreadTestMode mode = RecvNoData; + pthread_create(&th, nullptr, msr_reader, &mode); + // Wait for thread started + rclcpp::WallRate(10).sleep(); + + // Publish topic + monitor_->update(); + + pthread_join(th, NULL); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Thermal Throttling", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "recv error"); + ASSERT_TRUE(findValue(status, "recv", value)); + ASSERT_STREQ(value.c_str(), "No data received"); +} + +TEST_F(CPUMonitorTestSuite, throttlingFormatErrorTest) +{ + pthread_t th; + ThreadTestMode mode = FormatError; + pthread_create(&th, nullptr, msr_reader, &mode); + // Wait for thread started + rclcpp::WallRate(10).sleep(); + + // Publish topic + monitor_->update(); + + pthread_join(th, NULL); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + 
rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Thermal Throttling", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "recv error"); + ASSERT_TRUE(findValue(status, "recv", value)); + ASSERT_STREQ(value.c_str(), "input stream error"); +} + +TEST_F(CPUMonitorTestSuite, throttlingConnectErrorTest) +{ + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Thermal Throttling", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "connect error"); + ASSERT_TRUE(findValue(status, "connect", value)); + ASSERT_STREQ(value.c_str(), strerror(ECONNREFUSED)); +} + +TEST_F(CPUMonitorTestSuite, freqTest) +{ + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Frequency", status)); + ASSERT_EQ(status.level, DiagStatus::OK); +} + +TEST_F(CPUMonitorTestSuite, freqFrequencyFilesNotFoundTest) +{ + // Clear list + monitor_->clearFreqNames(); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Frequency", status)); + + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "frequency files not found"); +} + +TEST_F(CPUMonitorTestSuite, usageMpstatErrorTest) +{ + // Symlink mpstat1 to mpstat + fs::create_symlink(exe_dir_ + "/mpstat1", mpstat_); + + // Modify PATH temporarily + modifyPath(); + + // 
Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + // Verify + DiagStatus status; + std::string value; + + ASSERT_TRUE(monitor_->findDiagStatus("CPU Usage", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "mpstat error"); + ASSERT_TRUE(findValue(status, "mpstat", value)); +} + +TEST_F(CPUMonitorTestSuite, usageMpstatExceptionTest) +{ + // Symlink mpstat2 to mpstat + fs::create_symlink(exe_dir_ + "/mpstat2", mpstat_); + + // Modify PATH temporarily + modifyPath(); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + + ASSERT_TRUE(monitor_->findDiagStatus("CPU Usage", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "mpstat exception"); +} + +// for coverage +class DummyCPUMonitor : public CPUMonitorBase +{ + friend class CPUMonitorTestSuite; + +public: + DummyCPUMonitor(const std::string & node_name, const rclcpp::NodeOptions & options) + : CPUMonitorBase(node_name, options) + { + } + void update() { updater_.force_update(); } +}; + +TEST_F(CPUMonitorTestSuite, dummyCPUMonitorTest) +{ + rclcpp::NodeOptions options; + std::unique_ptr monitor = + std::make_unique("dummy_cpu_monitor", options); + monitor->getTempNames(); + monitor->getFreqNames(); + // Publish topic + monitor->update(); +} + +int main(int argc, char ** argv) +{ + argv_ = argv; + testing::InitGoogleTest(&argc, argv); + + return RUN_ALL_TESTS(); +} diff --git a/system/system_monitor/test/src/cpu_monitor/test_raspi_cpu_monitor.cpp b/system/system_monitor/test/src/cpu_monitor/test_raspi_cpu_monitor.cpp new file mode 100644 index 0000000000000..f58124deef67c --- /dev/null +++ b/system/system_monitor/test/src/cpu_monitor/test_raspi_cpu_monitor.cpp @@ 
-0,0 +1,695 @@ +// Copyright 2020 Tier IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "system_monitor/cpu_monitor/raspi_cpu_monitor.hpp" + +#include + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +static constexpr const char * TEST_FILE = "test"; + +namespace fs = boost::filesystem; +using DiagStatus = diagnostic_msgs::msg::DiagnosticStatus; + +char ** argv_; + +class TestCPUMonitor : public CPUMonitor +{ + friend class CPUMonitorTestSuite; + +public: + TestCPUMonitor(const std::string & node_name, const rclcpp::NodeOptions & options) + : CPUMonitor(node_name, options) + { + } + + void diagCallback(const diagnostic_msgs::msg::DiagnosticArray::ConstSharedPtr diag_msg) + { + array_ = *diag_msg; + } + + void addTempName(const std::string & path) { temps_.emplace_back(path, path); } + void clearTempNames() { temps_.clear(); } + + void addFreqName(int index, const std::string & path) { freqs_.emplace_back(index, path); } + void clearFreqNames() { freqs_.clear(); } + + void setMpstatExists(bool mpstat_exists) { mpstat_exists_ = mpstat_exists; } + + void changeUsageWarn(float usage_warn) { usage_warn_ = usage_warn; } + void changeUsageError(float usage_error) { usage_error_ = usage_error; } + + void changeLoad1Warn(float load1_warn) { load1_warn_ = load1_warn; } + void changeLoad5Warn(float load5_warn) { load5_warn_ = load5_warn; } + + void update() { updater_.force_update(); } + + const 
std::string removePrefix(const std::string & name) + { + return boost::algorithm::erase_all_copy(name, prefix_); + } + + bool findDiagStatus(const std::string & name, DiagStatus & status) // NOLINT + { + for (int i = 0; i < array_.status.size(); ++i) { + if (removePrefix(array_.status[i].name) == name) { + status = array_.status[i]; + return true; + } + } + return false; + } + +private: + diagnostic_msgs::msg::DiagnosticArray array_; + const std::string prefix_ = std::string(this->get_name()) + ": "; +}; + +class CPUMonitorTestSuite : public ::testing::Test +{ +public: + CPUMonitorTestSuite() + { + // Get directory of executable + const fs::path exe_path(argv_[0]); + exe_dir_ = exe_path.parent_path().generic_string(); + // Get dummy executable path + mpstat_ = exe_dir_ + "/mpstat"; + } + +protected: + std::unique_ptr monitor_; + rclcpp::Subscription::SharedPtr sub_; + std::string exe_dir_; + std::string mpstat_; + + void SetUp() + { + using std::placeholders::_1; + rclcpp::init(0, nullptr); + rclcpp::NodeOptions node_options; + monitor_ = std::make_unique("test_cpu_monitor", node_options); + sub_ = monitor_->create_subscription( + "/diagnostics", 1000, std::bind(&TestCPUMonitor::diagCallback, monitor_.get(), _1)); + monitor_->getTempNames(); + monitor_->getFreqNames(); + + // Remove test file if exists + if (fs::exists(TEST_FILE)) { + fs::remove(TEST_FILE); + } + // Remove dummy executable if exists + if (fs::exists(mpstat_)) { + fs::remove(mpstat_); + } + } + + void TearDown() + { + // Remove test file if exists + if (fs::exists(TEST_FILE)) { + fs::remove(TEST_FILE); + } + // Remove dummy executable if exists + if (fs::exists(mpstat_)) { + fs::remove(mpstat_); + } + rclcpp::shutdown(); + } + + bool findValue(const DiagStatus status, const std::string & key, std::string & value) // NOLINT + { + for (auto itr = status.values.begin(); itr != status.values.end(); ++itr) { + if (itr->key == key) { + value = itr->value; + return true; + } + } + return false; + } + + 
void modifyPath() + { + // Modify PATH temporarily + auto env = boost::this_process::environment(); + std::string new_path = env["PATH"].to_string(); + new_path.insert(0, fmt::format("{}:", exe_dir_)); + env["PATH"] = new_path; + } +}; + +TEST_F(CPUMonitorTestSuite, tempWarnTest) +{ + // Verify normal behavior + { + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } + + // Add test file to list + monitor_->addTempName(TEST_FILE); + + // Verify warning + { + // Write warning level + std::ofstream ofs(TEST_FILE); + ofs << 90000 << std::endl; + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::WARN); + } + + // Verify normal behavior + { + // Write normal level + std::ofstream ofs(TEST_FILE); + ofs << 89900 << std::endl; + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } +} + +TEST_F(CPUMonitorTestSuite, tempErrorTest) +{ + // Verify normal behavior + { + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } + + // Add test file to list + monitor_->addTempName(TEST_FILE); 
+ + // Verify error + { + // Write error level + std::ofstream ofs(TEST_FILE); + ofs << 95000 << std::endl; + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + } + + // Verify normal behavior + { + // Write normal level + std::ofstream ofs(TEST_FILE); + ofs << 89900 << std::endl; + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } +} + +TEST_F(CPUMonitorTestSuite, tempTemperatureFilesNotFoundTest) +{ + // Clear list + monitor_->clearTempNames(); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "temperature files not found"); +} + +TEST_F(CPUMonitorTestSuite, tempFileOpenErrorTest) +{ + // Add test file to list + monitor_->addTempName(TEST_FILE); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "file open error"); + ASSERT_TRUE(findValue(status, "file open error", value)); + ASSERT_STREQ(value.c_str(), TEST_FILE); +} + +TEST_F(CPUMonitorTestSuite, 
usageWarnTest) +{ + // Verify normal behavior + { + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Usage", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } + + // Verify warning + { + // Change warning level + monitor_->changeUsageWarn(0.0); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Usage", status)); + ASSERT_EQ(status.level, DiagStatus::WARN); + } + + // Verify normal behavior + { + // Change back to normal + monitor_->changeUsageWarn(0.90); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Usage", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } +} + +TEST_F(CPUMonitorTestSuite, usageErrorTest) +{ + // Verify normal behavior + { + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Usage", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } + + // Verify warning + { + // Change warning level + monitor_->changeUsageError(0.0); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Usage", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + } + + // Verify normal behavior + 
{ + // Change back to normal + monitor_->changeUsageError(1.00); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Usage", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } +} + +TEST_F(CPUMonitorTestSuite, usageMpstatNotFoundTest) +{ + // Set flag false + monitor_->setMpstatExists(false); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Usage", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "mpstat error"); + ASSERT_TRUE(findValue(status, "mpstat", value)); + ASSERT_STREQ( + value.c_str(), + "Command 'mpstat' not found, but can be installed with: sudo apt install sysstat"); +} + +TEST_F(CPUMonitorTestSuite, load1WarnTest) +{ + // Verify normal behavior + { + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Load Average", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } + + // Verify warning + { + // Change warning level + monitor_->changeLoad1Warn(0.0); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Load Average", status)); + ASSERT_EQ(status.level, DiagStatus::WARN); + } + + // Verify normal behavior + { + // Change back to normal + monitor_->changeLoad1Warn(0.90); + + // Publish topic + monitor_->update(); + + // 
Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Load Average", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } +} + +TEST_F(CPUMonitorTestSuite, load5WarnTest) +{ + // Verify normal behavior + { + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Load Average", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } + + // Verify warning + { + // Change warning level + monitor_->changeLoad5Warn(0.0); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Load Average", status)); + ASSERT_EQ(status.level, DiagStatus::WARN); + } + + // Verify normal behavior + { + // Change back to normal + monitor_->changeLoad5Warn(0.80); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Load Average", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } +} + +TEST_F(CPUMonitorTestSuite, DISABLED_throttlingTest) +{ + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Thermal Throttling", status)); + ASSERT_EQ(status.level, DiagStatus::OK); +} + +TEST_F(CPUMonitorTestSuite, freqTest) +{ + // Publish topic + monitor_->update(); + + // Give time to publish + 
rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Frequency", status)); + ASSERT_EQ(status.level, DiagStatus::OK); +} + +TEST_F(CPUMonitorTestSuite, freqFrequencyFilesNotFoundTest) +{ + // Clear list + monitor_->clearFreqNames(); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Frequency", status)); + + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "frequency files not found"); +} + +TEST_F(CPUMonitorTestSuite, usageMpstatErrorTest) +{ + // Symlink mpstat1 to mpstat + fs::create_symlink(exe_dir_ + "/mpstat1", mpstat_); + + // Modify PATH temporarily + modifyPath(); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + + ASSERT_TRUE(monitor_->findDiagStatus("CPU Usage", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "mpstat error"); + ASSERT_TRUE(findValue(status, "mpstat", value)); +} + +TEST_F(CPUMonitorTestSuite, usageMpstatExceptionTest) +{ + // Symlink mpstat2 to mpstat + fs::create_symlink(exe_dir_ + "/mpstat2", mpstat_); + + // Modify PATH temporarily + modifyPath(); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + + ASSERT_TRUE(monitor_->findDiagStatus("CPU Usage", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "mpstat exception"); +} + +// for coverage +class DummyCPUMonitor : public CPUMonitorBase 
+{ + friend class CPUMonitorTestSuite; + +public: + DummyCPUMonitor(const std::string & node_name, const rclcpp::NodeOptions & options) + : CPUMonitorBase(node_name, options) + { + } + void update() { updater_.force_update(); } +}; + +TEST_F(CPUMonitorTestSuite, dummyCPUMonitorTest) +{ + rclcpp::NodeOptions options; + std::unique_ptr monitor = + std::make_unique("dummy_cpu_monitor", options); + monitor->getTempNames(); + monitor->getFreqNames(); + // Publish topic + monitor->update(); +} + +int main(int argc, char ** argv) +{ + argv_ = argv; + testing::InitGoogleTest(&argc, argv); + + return RUN_ALL_TESTS(); +} diff --git a/system/system_monitor/test/src/cpu_monitor/test_tegra_cpu_monitor.cpp b/system/system_monitor/test/src/cpu_monitor/test_tegra_cpu_monitor.cpp new file mode 100644 index 0000000000000..104ef3cb454d9 --- /dev/null +++ b/system/system_monitor/test/src/cpu_monitor/test_tegra_cpu_monitor.cpp @@ -0,0 +1,681 @@ +// Copyright 2020 Tier IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "system_monitor/cpu_monitor/tegra_cpu_monitor.hpp" + +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +static constexpr const char * TEST_FILE = "test"; + +namespace fs = boost::filesystem; +using DiagStatus = diagnostic_msgs::msg::DiagnosticStatus; + +char ** argv_; + +class TestCPUMonitor : public CPUMonitor +{ + friend class CPUMonitorTestSuite; + +public: + TestCPUMonitor(const std::string & node_name, const rclcpp::NodeOptions & options) + : CPUMonitor(node_name, options) + { + } + + void diagCallback(const diagnostic_msgs::msg::DiagnosticArray::ConstSharedPtr diag_msg) + { + array_ = *diag_msg; + } + + void addTempName(const std::string & path) { temps_.emplace_back(path, path); } + void clearTempNames() { temps_.clear(); } + + void addFreqName(int index, const std::string & path) { freqs_.emplace_back(index, path); } + void clearFreqNames() { freqs_.clear(); } + + void setMpstatExists(bool mpstat_exists) { mpstat_exists_ = mpstat_exists; } + + void changeUsageWarn(float usage_warn) { usage_warn_ = usage_warn; } + void changeUsageError(float usage_error) { usage_error_ = usage_error; } + + void changeLoad1Warn(float load1_warn) { load1_warn_ = load1_warn; } + void changeLoad5Warn(float load5_warn) { load5_warn_ = load5_warn; } + + void update() { updater_.force_update(); } + + const std::string removePrefix(const std::string & name) + { + return boost::algorithm::erase_all_copy(name, prefix_); + } + + bool findDiagStatus(const std::string & name, DiagStatus & status) // NOLINT + { + for (int i = 0; i < array_.status.size(); ++i) { + if (removePrefix(array_.status[i].name) == name) { + status = array_.status[i]; + return true; + } + } + return false; + } + +private: + diagnostic_msgs::msg::DiagnosticArray array_; + const std::string prefix_ = std::string(this->get_name()) + ": "; +}; + +class CPUMonitorTestSuite : public ::testing::Test +{ +public: + CPUMonitorTestSuite() + { + // 
Get directory of executable + const fs::path exe_path(argv_[0]); + exe_dir_ = exe_path.parent_path().generic_string(); + // Get dummy executable path + mpstat_ = exe_dir_ + "/mpstat"; + } + +protected: + std::unique_ptr monitor_; + rclcpp::Subscription::SharedPtr sub_; + std::string exe_dir_; + std::string mpstat_; + + void SetUp() + { + using std::placeholders::_1; + rclcpp::init(0, nullptr); + rclcpp::NodeOptions node_options; + monitor_ = std::make_unique("test_cpu_monitor", node_options); + sub_ = monitor_->create_subscription( + "/diagnostics", 1000, std::bind(&TestCPUMonitor::diagCallback, monitor_.get(), _1)); + monitor_->getTempNames(); + monitor_->getFreqNames(); + + // Remove test file if exists + if (fs::exists(TEST_FILE)) { + fs::remove(TEST_FILE); + } + // Remove dummy executable if exists + if (fs::exists(mpstat_)) { + fs::remove(mpstat_); + } + } + + void TearDown() + { + // Remove test file if exists + if (fs::exists(TEST_FILE)) { + fs::remove(TEST_FILE); + } + // Remove dummy executable if exists + if (fs::exists(mpstat_)) { + fs::remove(mpstat_); + } + rclcpp::shutdown(); + } + + bool findValue(const DiagStatus status, const std::string & key, std::string & value) // NOLINT + { + for (auto itr = status.values.begin(); itr != status.values.end(); ++itr) { + if (itr->key == key) { + value = itr->value; + return true; + } + } + return false; + } + + void modifyPath() + { + // Modify PATH temporarily + auto env = boost::this_process::environment(); + std::string new_path = env["PATH"].to_string(); + new_path.insert(0, fmt::format("{}:", exe_dir_)); + env["PATH"] = new_path; + } +}; + +TEST_F(CPUMonitorTestSuite, tempWarnTest) +{ + // Verify normal behavior + { + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Temperature", status)); + ASSERT_EQ(status.level, 
DiagStatus::OK); + } + + // Add test file to list + monitor_->addTempName(TEST_FILE); + + // Verify warning + { + // Write warning level + std::ofstream ofs(TEST_FILE); + ofs << 90000 << std::endl; + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::WARN); + } + + // Verify normal behavior + { + // Write normal level + std::ofstream ofs(TEST_FILE); + ofs << 89900 << std::endl; + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } +} + +TEST_F(CPUMonitorTestSuite, tempErrorTest) +{ + // Verify normal behavior + { + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } + + // Add test file to list + monitor_->addTempName(TEST_FILE); + + // Verify error + { + // Write error level + std::ofstream ofs(TEST_FILE); + ofs << 95000 << std::endl; + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + } + + // Verify normal behavior + { + // Write normal level + std::ofstream ofs(TEST_FILE); + ofs << 89900 << std::endl; + + // Publish topic + monitor_->update(); + + // Give time to 
publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } +} + +TEST_F(CPUMonitorTestSuite, tempTemperatureFilesNotFoundTest) +{ + // Clear list + monitor_->clearTempNames(); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "temperature files not found"); +} + +TEST_F(CPUMonitorTestSuite, tempFileOpenErrorTest) +{ + // Add test file to list + monitor_->addTempName(TEST_FILE); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "file open error"); + ASSERT_TRUE(findValue(status, "file open error", value)); + ASSERT_STREQ(value.c_str(), TEST_FILE); +} + +TEST_F(CPUMonitorTestSuite, usageWarnTest) +{ + // Verify normal behavior + { + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Usage", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } + + // Verify warning + { + // Change warning level + monitor_->changeUsageWarn(0.0); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + 
rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Usage", status)); + ASSERT_EQ(status.level, DiagStatus::WARN); + } + + // Verify normal behavior + { + // Change back to normal + monitor_->changeUsageWarn(0.90); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Usage", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } +} + +TEST_F(CPUMonitorTestSuite, usageErrorTest) +{ + // Verify normal behavior + { + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Usage", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } + + // Verify warning + { + // Change warning level + monitor_->changeUsageError(0.0); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Usage", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + } + + // Verify normal behavior + { + // Change back to normal + monitor_->changeUsageError(1.00); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Usage", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } +} + +TEST_F(CPUMonitorTestSuite, usageMpstatNotFoundTest) +{ + // Set flag false + monitor_->setMpstatExists(false); + + // Publish topic + monitor_->update(); + + // Give time to publish + 
rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Usage", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "mpstat error"); + ASSERT_TRUE(findValue(status, "mpstat", value)); + ASSERT_STREQ( + value.c_str(), + "Command 'mpstat' not found, but can be installed with: sudo apt install sysstat"); +} + +TEST_F(CPUMonitorTestSuite, load1WarnTest) +{ + // Verify normal behavior + { + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Load Average", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } + + // Verify warning + { + // Change warning level + monitor_->changeLoad1Warn(0.0); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Load Average", status)); + ASSERT_EQ(status.level, DiagStatus::WARN); + } + + // Verify normal behavior + { + // Change back to normal + monitor_->changeLoad1Warn(0.90); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Load Average", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } +} + +TEST_F(CPUMonitorTestSuite, load5WarnTest) +{ + // Verify normal behavior + { + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + 
ASSERT_TRUE(monitor_->findDiagStatus("CPU Load Average", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } + + // Verify warning + { + // Change warning level + monitor_->changeLoad5Warn(0.0); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Load Average", status)); + ASSERT_EQ(status.level, DiagStatus::WARN); + } + + // Verify normal behavior + { + // Change back to normal + monitor_->changeLoad5Warn(0.80); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Load Average", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } +} + +TEST_F(CPUMonitorTestSuite, freqTest) +{ + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Frequency", status)); + ASSERT_EQ(status.level, DiagStatus::OK); +} + +TEST_F(CPUMonitorTestSuite, freqFrequencyFilesNotFoundTest) +{ + // Clear list + monitor_->clearFreqNames(); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("CPU Frequency", status)); + + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "frequency files not found"); +} + +TEST_F(CPUMonitorTestSuite, usageMpstatErrorTest) +{ + // Symlink mpstat1 to mpstat + fs::create_symlink(exe_dir_ + "/mpstat1", mpstat_); + + // Modify PATH temporarily + modifyPath(); + + // Publish topic + monitor_->update(); + + 
// Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + + ASSERT_TRUE(monitor_->findDiagStatus("CPU Usage", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "mpstat error"); + ASSERT_TRUE(findValue(status, "mpstat", value)); +} + +TEST_F(CPUMonitorTestSuite, usageMpstatExceptionTest) +{ + // Symlink mpstat2 to mpstat + fs::create_symlink(exe_dir_ + "/mpstat2", mpstat_); + + // Modify PATH temporarily + modifyPath(); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + + ASSERT_TRUE(monitor_->findDiagStatus("CPU Usage", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "mpstat exception"); +} + +// for coverage +class DummyCPUMonitor : public CPUMonitorBase +{ + friend class CPUMonitorTestSuite; + +public: + DummyCPUMonitor(const std::string & node_name, const rclcpp::NodeOptions & options) + : CPUMonitorBase(node_name, options) + { + } + void update() { updater_.force_update(); } +}; + +TEST_F(CPUMonitorTestSuite, dummyCPUMonitorTest) +{ + rclcpp::NodeOptions options; + std::unique_ptr monitor = + std::make_unique("dummy_cpu_monitor", options); + monitor->getTempNames(); + monitor->getFreqNames(); + // Publish topic + monitor->update(); +} + +int main(int argc, char ** argv) +{ + argv_ = argv; + testing::InitGoogleTest(&argc, argv); + + return RUN_ALL_TESTS(); +} diff --git a/system/system_monitor/test/src/cpu_monitor/test_unknown_cpu_monitor.cpp b/system/system_monitor/test/src/cpu_monitor/test_unknown_cpu_monitor.cpp new file mode 100644 index 0000000000000..53fffe03bb02b --- /dev/null +++ b/system/system_monitor/test/src/cpu_monitor/test_unknown_cpu_monitor.cpp @@ -0,0 +1,78 @@ +// Copyright 2020 
Tier IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "system_monitor/cpu_monitor/unknown_cpu_monitor.hpp" + +#include + +#include + +#include +#include + +using DiagStatus = diagnostic_msgs::msg::DiagnosticStatus; + +class TestCPUMonitor : public CPUMonitor +{ + friend class CPUMonitorTestSuite; + +public: + TestCPUMonitor(const std::string & node_name, const rclcpp::NodeOptions & options) + : CPUMonitor(node_name, options) + { + } + + void diagCallback(const diagnostic_msgs::msg::DiagnosticArray::ConstSharedPtr diag_msg) + { + array_ = *diag_msg; + } + + void update() { updater_.force_update(); } + +private: + diagnostic_msgs::msg::DiagnosticArray array_; +}; + +class CPUMonitorTestSuite : public ::testing::Test +{ +public: + CPUMonitorTestSuite() {} + +protected: + std::unique_ptr monitor_; + rclcpp::Subscription::SharedPtr sub_; + + void SetUp() + { + using std::placeholders::_1; + rclcpp::init(0, nullptr); + rclcpp::NodeOptions node_options; + monitor_ = std::make_unique("test_cpu_monitor", node_options); + sub_ = monitor_->create_subscription( + "/diagnostics", 1000, std::bind(&TestCPUMonitor::diagCallback, monitor_.get(), _1)); + monitor_->getTempNames(); + monitor_->getFreqNames(); + } + + void TearDown() { rclcpp::shutdown(); } +}; + +TEST_F(CPUMonitorTestSuite, test) { ASSERT_TRUE(true); } + +int main(int argc, char ** argv) +{ + testing::InitGoogleTest(&argc, argv); + + return RUN_ALL_TESTS(); +} diff --git 
a/system/system_monitor/test/src/gpu_monitor/test_nvml_gpu_monitor.cpp b/system/system_monitor/test/src/gpu_monitor/test_nvml_gpu_monitor.cpp new file mode 100644 index 0000000000000..c5eee02076597 --- /dev/null +++ b/system/system_monitor/test/src/gpu_monitor/test_nvml_gpu_monitor.cpp @@ -0,0 +1,552 @@ +// Copyright 2020 Autoware Foundation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "system_monitor/gpu_monitor/nvml_gpu_monitor.hpp" + +#include + +#include + +#include + +#include +#include + +using DiagStatus = diagnostic_msgs::msg::DiagnosticStatus; + +class TestGPUMonitor : public GPUMonitor +{ + friend class GPUMonitorTestSuite; + +public: + TestGPUMonitor(const std::string & node_name, const rclcpp::NodeOptions & options) + : GPUMonitor(node_name, options) + { + } + + void diagCallback(const diagnostic_msgs::msg::DiagnosticArray::ConstSharedPtr diag_msg) + { + array_ = *diag_msg; + } + + void addGPU(const gpu_info & info) { gpus_.push_back(info); } + void clearGPU() { gpus_.clear(); } + + void changeTempWarn(float temp_warn) { temp_warn_ = temp_warn; } + void changeTempError(float temp_error) { temp_error_ = temp_error; } + + void changeGPUUsageWarn(float gpu_usage_warn) { gpu_usage_warn_ = gpu_usage_warn; } + void changeGPUUsageError(float gpu_usage_error) { gpu_usage_error_ = gpu_usage_error; } + + void changeMemoryUsageWarn(float memory_usage_warn) { memory_usage_warn_ = memory_usage_warn; } + void changeMemoryUsageError(float 
memory_usage_error) + { + memory_usage_error_ = memory_usage_error; + } + + void update() { updater_.force_update(); } + + const std::string removePrefix(const std::string & name) + { + return boost::algorithm::erase_all_copy(name, prefix_); + } + + bool findDiagStatus(const std::string & name, DiagStatus & status) // NOLINT + { + for (int i = 0; i < array_.status.size(); ++i) { + if (removePrefix(array_.status[i].name) == name) { + status = array_.status[i]; + return true; + } + } + return false; + } + +private: + diagnostic_msgs::msg::DiagnosticArray array_; + const std::string prefix_ = std::string(this->get_name()) + ": "; +}; + +class GPUMonitorTestSuite : public ::testing::Test +{ +public: + GPUMonitorTestSuite() {} + +protected: + std::unique_ptr monitor_; + rclcpp::Subscription::SharedPtr sub_; + + void SetUp() + { + using std::placeholders::_1; + rclcpp::init(0, nullptr); + rclcpp::NodeOptions node_options; + monitor_ = std::make_unique("test_gpu_monitor", node_options); + sub_ = monitor_->create_subscription( + "/diagnostics", 1000, std::bind(&TestGPUMonitor::diagCallback, monitor_.get(), _1)); + } + + void TearDown() { rclcpp::shutdown(); } + + bool findValue(const DiagStatus status, const std::string & key, std::string & value) // NOLINT + { + for (auto itr = status.values.begin(); itr != status.values.end(); ++itr) { + if (itr->key == key) { + value = itr->value; + return true; + } + } + return false; + } +}; + +TEST_F(GPUMonitorTestSuite, tempWarnTest) +{ + // Verify normal behavior + { + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + ASSERT_TRUE(monitor_->findDiagStatus("GPU Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } + + // Verify warning + { + // Change warning level + monitor_->changeTempWarn(0.0); + + // Publish topic + monitor_->update(); + + // Give time 
to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("GPU Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::WARN); + } + + // Verify normal behavior + { + // Change back to normal + monitor_->changeTempWarn(90.0); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("GPU Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } +} + +TEST_F(GPUMonitorTestSuite, tempErrorTest) +{ + // Verify normal behavior + { + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + ASSERT_TRUE(monitor_->findDiagStatus("GPU Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } + + // Verify error + { + // Change error level + monitor_->changeTempError(0.0); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("GPU Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + } + + // Verify normal behavior + { + // Change back to normal + monitor_->changeTempError(95.0); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("GPU Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } +} + +TEST_F(GPUMonitorTestSuite, gpuUsageWarnTest) +{ + // Verify normal behavior + { + // Publish topic + monitor_->update(); 
+ + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + ASSERT_TRUE(monitor_->findDiagStatus("GPU Usage", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } + + // Verify warning + { + // Change warning level + monitor_->changeGPUUsageWarn(0.0); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("GPU Usage", status)); + ASSERT_EQ(status.level, DiagStatus::WARN); + } + + // Verify normal behavior + { + // Change back to normal + monitor_->changeGPUUsageWarn(0.90); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("GPU Usage", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } +} + +TEST_F(GPUMonitorTestSuite, gpuUsageErrorTest) +{ + // Verify normal behavior + { + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + ASSERT_TRUE(monitor_->findDiagStatus("GPU Usage", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } + + // Verify error + { + // Change error level + monitor_->changeGPUUsageError(0.0); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("GPU Usage", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + } + + // Verify normal behavior + { + // Change back to normal + monitor_->changeGPUUsageError(1.00); + + // Publish 
topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("GPU Usage", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } +} + +TEST_F(GPUMonitorTestSuite, gpuMemoryUsageWarnTest) +{ + // Verify normal behavior + { + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + ASSERT_TRUE(monitor_->findDiagStatus("GPU Memory Usage", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } + + // Verify warning + { + // Change warning level + monitor_->changeMemoryUsageWarn(0.0); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("GPU Memory Usage", status)); + ASSERT_EQ(status.level, DiagStatus::WARN); + } + + // Verify normal behavior + { + // Change back to normal + monitor_->changeMemoryUsageWarn(0.95); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("GPU Memory Usage", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } +} + +TEST_F(GPUMonitorTestSuite, gpuMemoryUsageErrorTest) +{ + // Verify normal behavior + { + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + ASSERT_TRUE(monitor_->findDiagStatus("GPU Memory Usage", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } + + // Verify error + { + // Change error 
level + monitor_->changeMemoryUsageError(0.0); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("GPU Memory Usage", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + } + + // Verify normal behavior + { + // Change back to normal + monitor_->changeMemoryUsageError(0.99); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("GPU Memory Usage", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } +} + +TEST_F(GPUMonitorTestSuite, throttlingTest) +{ + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("GPU Thermal Throttling", status)); + ASSERT_EQ(status.level, DiagStatus::OK); +} + +TEST_F(GPUMonitorTestSuite, gpuNotFoundTest) +{ + // Clear list + monitor_->clearGPU(); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("GPU Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "gpu not found"); + + ASSERT_TRUE(monitor_->findDiagStatus("GPU Usage", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "gpu not found"); + + ASSERT_TRUE(monitor_->findDiagStatus("GPU Memory Usage", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "gpu not found"); + + ASSERT_TRUE(monitor_->findDiagStatus("GPU Thermal 
Throttling", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "gpu not found"); +} + +TEST_F(GPUMonitorTestSuite, illegalDeviceHandleTest) +{ + // Clear list + monitor_->clearGPU(); + // Add blank device + gpu_info gpu; + monitor_->addGPU(gpu); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("GPU Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "Failed to retrieve the current temperature"); + + ASSERT_TRUE(monitor_->findDiagStatus("GPU Usage", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "Failed to retrieve the current utilization rates"); + + ASSERT_TRUE(monitor_->findDiagStatus("GPU Memory Usage", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ( + status.message.c_str(), "Failed to retrieve the amount of used, free and total memory"); + + ASSERT_TRUE(monitor_->findDiagStatus("GPU Thermal Throttling", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "Failed to retrieve the current clock speeds"); +} + +// for coverage +class DummyGPUMonitor : public GPUMonitorBase +{ + friend class GPUMonitorTestSuite; + +public: + DummyGPUMonitor(const std::string & node_name, const rclcpp::NodeOptions & options) + : GPUMonitorBase(node_name, options) + { + } + void update() { updater_.force_update(); } +}; + +TEST_F(GPUMonitorTestSuite, dummyGPUMonitorTest) +{ + rclcpp::NodeOptions options; + std::unique_ptr monitor = + std::make_unique("dummy_gpu_monitor", options); + // Publish topic + monitor->update(); +} + +int main(int argc, char ** argv) +{ + testing::InitGoogleTest(&argc, argv); + + return RUN_ALL_TESTS(); +} diff --git 
a/system/system_monitor/test/src/gpu_monitor/test_tegra_gpu_monitor.cpp b/system/system_monitor/test/src/gpu_monitor/test_tegra_gpu_monitor.cpp new file mode 100644 index 0000000000000..44b26e8b2433d --- /dev/null +++ b/system/system_monitor/test/src/gpu_monitor/test_tegra_gpu_monitor.cpp @@ -0,0 +1,499 @@ +// Copyright 2020 Autoware Foundation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "system_monitor/gpu_monitor/tegra_gpu_monitor.hpp" + +#include + +#include +#include + +#include + +#include +#include + +static constexpr const char * TEST_FILE = "test"; + +namespace fs = boost::filesystem; +using DiagStatus = diagnostic_msgs::msg::DiagnosticStatus; + +class TestGPUMonitor : public GPUMonitor +{ + friend class GPUMonitorTestSuite; + +public: + TestGPUMonitor(const std::string & node_name, const rclcpp::NodeOptions & options) + : GPUMonitor(node_name, options) + { + } + + void diagCallback(const diagnostic_msgs::msg::DiagnosticArray::ConstSharedPtr diag_msg) + { + array_ = *diag_msg; + } + + void addTempName(const std::string & path) { temps_.emplace_back(path, path); } + void clearTempNames() { temps_.clear(); } + + void addLoadName(const std::string & path) { loads_.emplace_back(path, path); } + void clearLoadNames() { loads_.clear(); } + + void addFreqName(const std::string & path) { freqs_.emplace_back(path, path); } + void clearFreqNames() { freqs_.clear(); } + + void update() { updater_.force_update(); } + + const std::string 
removePrefix(const std::string & name) + { + return boost::algorithm::erase_all_copy(name, prefix_); + } + + bool findDiagStatus(const std::string & name, DiagStatus & status) // NOLINT + { + for (int i = 0; i < array_.status.size(); ++i) { + if (removePrefix(array_.status[i].name) == name) { + status = array_.status[i]; + return true; + } + } + return false; + } + +private: + diagnostic_msgs::msg::DiagnosticArray array_; + const std::string prefix_ = std::string(this->get_name()) + ": "; +}; + +class GPUMonitorTestSuite : public ::testing::Test +{ +public: + GPUMonitorTestSuite() {} + +protected: + std::unique_ptr monitor_; + rclcpp::Subscription::SharedPtr sub_; + + void SetUp() + { + using std::placeholders::_1; + rclcpp::init(0, nullptr); + rclcpp::NodeOptions node_options; + monitor_ = std::make_unique("test_gpu_monitor", node_options); + sub_ = monitor_->create_subscription( + "/diagnostics", 1000, std::bind(&TestGPUMonitor::diagCallback, monitor_.get(), _1)); + } + + void TearDown() + { + // Remove test file if exists + if (fs::exists(TEST_FILE)) { + fs::remove(TEST_FILE); + } + rclcpp::shutdown(); + } + + bool findValue(const DiagStatus status, const std::string & key, std::string & value) // NOLINT + { + for (auto itr = status.values.begin(); itr != status.values.end(); ++itr) { + if (itr->key == key) { + value = itr->value; + return true; + } + } + return false; + } +}; + +TEST_F(GPUMonitorTestSuite, tempWarnTest) +{ + // Verify normal behavior + { + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("GPU Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } + + // Add test file to list + monitor_->addTempName(TEST_FILE); + + // Verify warning + { + // Write warning level + std::ofstream ofs(TEST_FILE); + ofs << 90000 << std::endl; + + // Publish topic + 
monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("GPU Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::WARN); + } + + // Verify normal behavior + { + // Write normal level + std::ofstream ofs(TEST_FILE); + ofs << 89900 << std::endl; + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("GPU Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } +} + +TEST_F(GPUMonitorTestSuite, tempErrorTest) +{ + // Verify normal behavior + { + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("GPU Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } + + // Add test file to list + monitor_->addTempName(TEST_FILE); + + // Verify error + { + // Write error level + std::ofstream ofs(TEST_FILE); + ofs << 95000 << std::endl; + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("GPU Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + } + + // Verify normal behavior + { + // Write normal level + std::ofstream ofs(TEST_FILE); + ofs << 89900 << std::endl; + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("GPU Temperature", status)); + 
ASSERT_EQ(status.level, DiagStatus::OK); + } +} + +TEST_F(GPUMonitorTestSuite, tempTemperatureFilesNotFoundTest) +{ + // Clear list + monitor_->clearTempNames(); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("GPU Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "temperature files not found"); +} + +TEST_F(GPUMonitorTestSuite, tempFileOpenErrorTest) +{ + // Add test file to list + monitor_->addTempName(TEST_FILE); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + ASSERT_TRUE(monitor_->findDiagStatus("GPU Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "file open error"); + ASSERT_TRUE(findValue(status, "file open error", value)); + ASSERT_STREQ(value.c_str(), TEST_FILE); +} + +TEST_F(GPUMonitorTestSuite, usageWarnTest) +{ + // Verify normal behavior + { + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("GPU Usage", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } + + // Add test file to list + monitor_->addLoadName(TEST_FILE); + + // Verify warning + { + // Write warning level + std::ofstream ofs(TEST_FILE); + ofs << 900 << std::endl; + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("GPU Usage", status)); + ASSERT_EQ(status.level, 
DiagStatus::WARN); + } + + // Verify normal behavior + { + // Write normal level + std::ofstream ofs(TEST_FILE); + ofs << 890 << std::endl; + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("GPU Usage", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } +} + +TEST_F(GPUMonitorTestSuite, usageErrorTest) +{ + // Verify normal behavior + { + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("GPU Usage", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } + + // Add test file to list + monitor_->addLoadName(TEST_FILE); + + // Verify error + { + // Write error level + std::ofstream ofs(TEST_FILE); + ofs << 1000 << std::endl; + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("GPU Usage", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + } + + // Verify normal behavior + { + // Write normal level + std::ofstream ofs(TEST_FILE); + ofs << 890 << std::endl; + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("GPU Usage", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } +} + +TEST_F(GPUMonitorTestSuite, usageTemperatureFilesNotFoundTest) +{ + // Clear list + monitor_->clearLoadNames(); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + 
rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("GPU Usage", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "load files not found"); +} + +TEST_F(GPUMonitorTestSuite, usageFileOpenErrorTest) +{ + // Add test file to list + monitor_->addLoadName(TEST_FILE); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + ASSERT_TRUE(monitor_->findDiagStatus("GPU Usage", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "file open error"); + ASSERT_TRUE(findValue(status, "file open error", value)); + ASSERT_STREQ(value.c_str(), TEST_FILE); +} + +TEST_F(GPUMonitorTestSuite, freqTest) +{ + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("GPU Frequency", status)); + ASSERT_EQ(status.level, DiagStatus::OK); +} + +TEST_F(GPUMonitorTestSuite, freqFrequencyFilesNotFoundTest) +{ + // Clear list + monitor_->clearFreqNames(); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("GPU Frequency", status)); + + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "frequency files not found"); +} + +// for coverage +class DummyGPUMonitor : public GPUMonitorBase +{ + friend class GPUMonitorTestSuite; + +public: + DummyGPUMonitor(const std::string & node_name, const rclcpp::NodeOptions & options) + : GPUMonitorBase(node_name, options) + { + } + void update() { 
updater_.force_update(); } +}; + +TEST_F(GPUMonitorTestSuite, dummyGPUMonitorTest) +{ + rclcpp::NodeOptions options; + std::unique_ptr monitor = + std::make_unique("dummy_gpu_monitor", options); + // Publish topic + monitor->update(); +} + +int main(int argc, char ** argv) +{ + testing::InitGoogleTest(&argc, argv); + + return RUN_ALL_TESTS(); +} diff --git a/system/system_monitor/test/src/gpu_monitor/test_unknown_gpu_monitor.cpp b/system/system_monitor/test/src/gpu_monitor/test_unknown_gpu_monitor.cpp new file mode 100644 index 0000000000000..06ebce8cc6ce2 --- /dev/null +++ b/system/system_monitor/test/src/gpu_monitor/test_unknown_gpu_monitor.cpp @@ -0,0 +1,76 @@ +// Copyright 2020 Autoware Foundation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "system_monitor/gpu_monitor/unknown_gpu_monitor.hpp" + +#include + +#include + +#include +#include + +using DiagStatus = diagnostic_msgs::msg::DiagnosticStatus; + +class TestGPUMonitor : public GPUMonitor +{ + friend class GPUMonitorTestSuite; + +public: + TestGPUMonitor(const std::string & node_name, const rclcpp::NodeOptions & options) + : GPUMonitor(node_name, options) + { + } + + void diagCallback(const diagnostic_msgs::msg::DiagnosticArray::ConstSharedPtr diag_msg) + { + array_ = *diag_msg; + } + + void update() { updater_.force_update(); } + +private: + diagnostic_msgs::msg::DiagnosticArray array_; +}; + +class GPUMonitorTestSuite : public ::testing::Test +{ +public: + GPUMonitorTestSuite() {} + +protected: + std::unique_ptr monitor_; + rclcpp::Subscription::SharedPtr sub_; + + void SetUp() + { + using std::placeholders::_1; + rclcpp::init(0, nullptr); + rclcpp::NodeOptions node_options; + monitor_ = std::make_unique("test_gpu_monitor", node_options); + sub_ = monitor_->create_subscription( + "/diagnostics", 1000, std::bind(&TestGPUMonitor::diagCallback, monitor_.get(), _1)); + } + + void TearDown() { rclcpp::shutdown(); } +}; + +TEST_F(GPUMonitorTestSuite, test) { ASSERT_TRUE(true); } + +int main(int argc, char ** argv) +{ + testing::InitGoogleTest(&argc, argv); + + return RUN_ALL_TESTS(); +} diff --git a/system/system_monitor/test/src/hdd_monitor/df1.cpp b/system/system_monitor/test/src/hdd_monitor/df1.cpp new file mode 100644 index 0000000000000..99700d9d381d5 --- /dev/null +++ b/system/system_monitor/test/src/hdd_monitor/df1.cpp @@ -0,0 +1,20 @@ +// Copyright 2020 Autoware Foundation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** + * @file df.cpp + * @brief dummy df executable to return error + */ + +int main(int argc, char ** argv) { return -1; } diff --git a/system/system_monitor/test/src/hdd_monitor/test_hdd_monitor.cpp b/system/system_monitor/test/src/hdd_monitor/test_hdd_monitor.cpp new file mode 100644 index 0000000000000..64d78de19418d --- /dev/null +++ b/system/system_monitor/test/src/hdd_monitor/test_hdd_monitor.cpp @@ -0,0 +1,712 @@ +// Copyright 2020 Tier IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "system_monitor/hdd_monitor/hdd_monitor.hpp" + +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace fs = boost::filesystem; +using DiagStatus = diagnostic_msgs::msg::DiagnosticStatus; + +// Command-line arguments saved by main(); argv_[0] is used to locate the test executable +char ** argv_; + +// HDDMonitor subclass that lets the tests edit hdd_params_, force an update and inspect the last received diagnostics +class TestHDDMonitor : public HDDMonitor +{ + friend class HDDMonitorTestSuite; + +public: + TestHDDMonitor(const std::string & node_name, const rclcpp::NodeOptions & options) + : HDDMonitor(node_name, options) + { + } + + // Subscription callback: keep a copy of the most recent diagnostics message + void diagCallback(const diagnostic_msgs::msg::DiagnosticArray::ConstSharedPtr diag_msg) + { + array_ = *diag_msg; + } + + // Insert or overwrite the thresholds stored under `name` + void addHDDParams( + const std::string & name, float temp_warn, float temp_error, float usage_warn, + float usage_error) + { + HDDParam param; + param.temp_warn_ = temp_warn; + param.temp_error_ = temp_error; + param.usage_warn_ = usage_warn; + param.usage_error_ = usage_error; + hdd_params_[name] = param; + } + + // Apply the same thresholds to every entry of hdd_params_ + void changeHDDParams(float temp_warn, float temp_error, float usage_warn, float usage_error) + { + for (auto itr = hdd_params_.begin(); itr != hdd_params_.end(); ++itr) { + itr->second.temp_warn_ = temp_warn; + itr->second.temp_error_ = temp_error; + itr->second.usage_warn_ = usage_warn; + itr->second.usage_error_ = usage_error; + } + } + + // Erase the entry stored under `name`, if there is one + void removeHDDParams(const std::string & name) + { + for (auto itr = hdd_params_.begin(); itr != hdd_params_.end(); ++itr) { + if (itr->first == name) { + hdd_params_.erase(itr); + break; + } + } + } + + void clearHDDParams() { hdd_params_.clear(); } + + void update() { updater_.force_update(); } + + // Return `name` with every occurrence of prefix_ removed + const std::string removePrefix(const std::string & name) + { + return boost::algorithm::erase_all_copy(name, prefix_); + } + + // Copy the status whose name (with prefix_ removed) equals `name` into `status`; returns false if none matches + bool findDiagStatus(const std::string & name, DiagStatus & status) // NOLINT + { + // Range-based loop avoids comparing a signed int index with the unsigned size() + for (const auto & entry : array_.status) { + if (removePrefix(entry.name) == name) { + status = entry; + return true; + } + } + return false; + } + +private: + 
diagnostic_msgs::msg::DiagnosticArray array_; + const std::string prefix_ = std::string(this->get_name()) + ": "; +}; + +class HDDMonitorTestSuite : public ::testing::Test +{ +public: + HDDMonitorTestSuite() + { + // Get directory of executable + const fs::path exe_path(argv_[0]); + exe_dir_ = exe_path.parent_path().generic_string(); + // Get dummy executable path + df_ = exe_dir_ + "/df"; + } + +protected: + std::unique_ptr monitor_; + rclcpp::Subscription::SharedPtr sub_; + std::string exe_dir_; + std::string df_; + + void SetUp() + { + using std::placeholders::_1; + rclcpp::init(0, nullptr); + rclcpp::NodeOptions node_options; + monitor_ = std::make_unique("test_hdd_monitor", node_options); + sub_ = monitor_->create_subscription( + "/diagnostics", 1000, std::bind(&TestHDDMonitor::diagCallback, monitor_.get(), _1)); + // Remove dummy executable if exists + if (fs::exists(df_)) { + fs::remove(df_); + } + } + + void TearDown() + { + // Remove dummy executable if exists + if (fs::exists(df_)) { + fs::remove(df_); + } + rclcpp::shutdown(); + } + + bool findValue(const DiagStatus status, const std::string & key, std::string & value) // NOLINT + { + for (auto itr = status.values.begin(); itr != status.values.end(); ++itr) { + if (itr->key == key) { + value = itr->value; + return true; + } + } + return false; + } + + void modifyPath() + { + // Modify PATH temporarily + auto env = boost::this_process::environment(); + std::string new_path = env["PATH"].to_string(); + new_path.insert(0, fmt::format("{}:", exe_dir_)); + env["PATH"] = new_path; + } +}; + +enum ThreadTestMode { + Normal = 0, + Hot, + CriticalHot, + ReturnsError, + RecvTimeout, + RecvNoData, + FormatError, +}; + +// Set to true by a test to end the RecvTimeout wait loop in hdd_reader(); accessed under `mutex` +bool stop_thread = false; +pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; + +// Thread entry point standing in for the hdd_reader server: listens on TCP port 7635, accepts one +// connection, receives one request and answers according to the ThreadTestMode pointed to by args. +// Always returns nullptr; every socket it opened is closed before returning. +void * hdd_reader(void * args) +{ + ThreadTestMode * mode = reinterpret_cast(args); + + // Create a new socket + int sock = socket(AF_INET, SOCK_STREAM, 0); + if (sock < 0) { + return nullptr; + } + + // Allow address reuse + int ret = 0; + int opt = 1; + 
ret = setsockopt( + sock, SOL_SOCKET, SO_REUSEADDR, reinterpret_cast(&opt), (socklen_t)sizeof(opt)); + if (ret < 0) { + close(sock); + return nullptr; + } + + // Give the socket FD the local address ADDR + sockaddr_in addr; + memset(&addr, 0, sizeof(sockaddr_in)); + addr.sin_family = AF_INET; + addr.sin_port = htons(7635); + addr.sin_addr.s_addr = htonl(INADDR_ANY); + ret = bind(sock, (struct sockaddr *)&addr, sizeof(addr)); + if (ret < 0) { + close(sock); + return nullptr; + } + + // Prepare to accept connections on socket FD + ret = listen(sock, 5); + if (ret < 0) { + close(sock); + return nullptr; + } + + sockaddr_in client; + socklen_t len = sizeof(client); + + // Await a connection on socket FD + int new_sock = accept(sock, reinterpret_cast(&client), &len); + if (new_sock < 0) { + close(sock); + return nullptr; + } + + // Receive list of device from a socket + char buf[1024] = ""; + ret = recv(new_sock, buf, sizeof(buf) - 1, 0); + if (ret < 0) { + // Close the accepted connection too, not only the listening socket + close(new_sock); + close(sock); + return nullptr; + } + // No data received + if (ret == 0) { + close(new_sock); + close(sock); + return nullptr; + } + + ret = 0; + std::ostringstream oss; + boost::archive::text_oarchive oa(oss); + HDDInfoList list; + HDDInfo info = {0}; + + switch (*mode) { + case Normal: + info.error_code_ = 0; + info.temp_ = 40; + list["/dev/sda"] = info; + oa << list; + ret = write(new_sock, oss.str().c_str(), oss.str().length()); + break; + + case Hot: + info.error_code_ = 0; + info.temp_ = 55; + list["/dev/sda"] = info; + oa << list; + ret = write(new_sock, oss.str().c_str(), oss.str().length()); + break; + + case CriticalHot: + info.error_code_ = 0; + info.temp_ = 70; + list["/dev/sda"] = info; + oa << list; + ret = write(new_sock, oss.str().c_str(), oss.str().length()); + break; + + case ReturnsError: + info.error_code_ = EACCES; + list["/dev/sda"] = info; + oa << list; + ret = write(new_sock, oss.str().c_str(), oss.str().length()); + break; + + case RecvTimeout: + // Wait for recv timeout + while (true) { + 
// Read the flag under the lock, then release it before deciding, so the mutex is never left locked on exit + pthread_mutex_lock(&mutex); + const bool stop_requested = stop_thread; + pthread_mutex_unlock(&mutex); + if (stop_requested) { + break; + } + sleep(1); + } + break; + + case RecvNoData: + // Send nothing, close socket immediately + break; + + case FormatError: + // Send wrong data + oa << "test"; + ret = write(new_sock, oss.str().c_str(), oss.str().length()); + break; + + default: + break; + } + + // Close the file descriptor FD + close(new_sock); + close(sock); + + return nullptr; +} + +TEST_F(HDDMonitorTestSuite, tempNormalTest) +{ + pthread_t th; + ThreadTestMode mode = Normal; + pthread_create(&th, nullptr, hdd_reader, &mode); + // Wait for thread started + rclcpp::WallRate(10).sleep(); + + // Publish topic + monitor_->update(); + + pthread_join(th, NULL); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + ASSERT_TRUE(monitor_->findDiagStatus("HDD Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::OK); +} + +TEST_F(HDDMonitorTestSuite, tempWarnTest) +{ + pthread_t th; + ThreadTestMode mode = Hot; + pthread_create(&th, nullptr, hdd_reader, &mode); + // Wait for thread started + rclcpp::WallRate(10).sleep(); + + // Publish topic + monitor_->update(); + + pthread_join(th, NULL); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + ASSERT_TRUE(monitor_->findDiagStatus("HDD Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::WARN); +} + +TEST_F(HDDMonitorTestSuite, tempErrorTest) +{ + pthread_t th; + ThreadTestMode mode = CriticalHot; + pthread_create(&th, nullptr, hdd_reader, &mode); + // Wait for thread started + rclcpp::WallRate(10).sleep(); + + // Publish topic + monitor_->update(); + + pthread_join(th, NULL); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + 
DiagStatus status; + std::string value; + ASSERT_TRUE(monitor_->findDiagStatus("HDD Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); +} + +TEST_F(HDDMonitorTestSuite, tempReturnsErrorTest) +{ + pthread_t th; + ThreadTestMode mode = ReturnsError; + pthread_create(&th, nullptr, hdd_reader, &mode); + // Wait for thread started + rclcpp::WallRate(10).sleep(); + + // Publish topic + monitor_->update(); + + pthread_join(th, NULL); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + ASSERT_TRUE(monitor_->findDiagStatus("HDD Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "hdd_reader error"); + ASSERT_TRUE(findValue(status, "HDD 0: hdd_reader", value)); + ASSERT_STREQ(value.c_str(), strerror(EACCES)); +} + +TEST_F(HDDMonitorTestSuite, tempRecvTimeoutTest) +{ + pthread_t th; + ThreadTestMode mode = RecvTimeout; + pthread_create(&th, nullptr, hdd_reader, &mode); + // Wait for thread started + rclcpp::WallRate(10).sleep(); + + // Publish topic + monitor_->update(); + + // Recv timeout occurs, thread is no longer needed + pthread_mutex_lock(&mutex); + stop_thread = true; + pthread_mutex_unlock(&mutex); + pthread_join(th, NULL); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + ASSERT_TRUE(monitor_->findDiagStatus("HDD Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "recv error"); + ASSERT_TRUE(findValue(status, "recv", value)); + ASSERT_STREQ(value.c_str(), strerror(EWOULDBLOCK)); +} + +TEST_F(HDDMonitorTestSuite, tempRecvNoDataTest) +{ + pthread_t th; + ThreadTestMode mode = RecvNoData; + pthread_create(&th, nullptr, hdd_reader, &mode); + // Wait for thread started + 
rclcpp::WallRate(10).sleep(); + + // Publish topic + monitor_->update(); + + pthread_join(th, NULL); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + ASSERT_TRUE(monitor_->findDiagStatus("HDD Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "recv error"); + ASSERT_TRUE(findValue(status, "recv", value)); + ASSERT_STREQ(value.c_str(), "No data received"); +} + +TEST_F(HDDMonitorTestSuite, tempFormatErrorTest) +{ + pthread_t th; + ThreadTestMode mode = FormatError; + pthread_create(&th, nullptr, hdd_reader, &mode); + // Wait for thread started + rclcpp::WallRate(10).sleep(); + + // Publish topic + monitor_->update(); + + pthread_join(th, NULL); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + ASSERT_TRUE(monitor_->findDiagStatus("HDD Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "recv error"); + ASSERT_TRUE(findValue(status, "recv", value)); + ASSERT_STREQ(value.c_str(), "input stream error"); +} + +TEST_F(HDDMonitorTestSuite, tempConnectErrorTest) +{ + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + ASSERT_TRUE(monitor_->findDiagStatus("HDD Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "connect error"); + ASSERT_TRUE(findValue(status, "connect", value)); + ASSERT_STREQ(value.c_str(), strerror(ECONNREFUSED)); +} + +TEST_F(HDDMonitorTestSuite, tempInvalidDiskParameterTest) +{ + // Clear list + monitor_->clearHDDParams(); + + // Publish topic + monitor_->update(); + + // Give 
time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("HDD Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "invalid disk parameter"); +} + +TEST_F(HDDMonitorTestSuite, tempNoSuchDeviceTest) +{ + // Add a device to the list that the dummy hdd_reader does not report + monitor_->addHDDParams("/dev/sdx", 55.0, 77.0, 0.95, 0.99); + + pthread_t th; + ThreadTestMode mode = Normal; + pthread_create(&th, nullptr, hdd_reader, &mode); + // Wait for thread started + rclcpp::WallRate(10).sleep(); + + // Publish topic + monitor_->update(); + + pthread_join(th, NULL); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + ASSERT_TRUE(monitor_->findDiagStatus("HDD Temperature", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "hdd_reader error"); + ASSERT_TRUE(findValue(status, "HDD 1: hdd_reader", value)); + ASSERT_STREQ(value.c_str(), strerror(ENOENT)); + + // Remove the test device from the list + monitor_->removeHDDParams("/dev/sdx"); +} + +TEST_F(HDDMonitorTestSuite, usageWarnTest) +{ + // Verify normal behavior + { + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + ASSERT_TRUE(monitor_->findDiagStatus("HDD Usage", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } + + // Verify warning + { + // Change warning level + monitor_->changeHDDParams(55.0, 77.0, 0.00, 0.99); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("HDD Usage", 
status)); + ASSERT_EQ(status.level, DiagStatus::WARN); + } + + // Verify normal behavior + { + // Change back to normal + monitor_->changeHDDParams(55.0, 77.0, 0.95, 0.99); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("HDD Usage", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } +} + +TEST_F(HDDMonitorTestSuite, usageErrorTest) +{ + // Verify normal behavior + { + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + ASSERT_TRUE(monitor_->findDiagStatus("HDD Usage", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } + + // Verify error + { + // Change error level + monitor_->changeHDDParams(55.0, 77.0, 0.95, 0.00); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("HDD Usage", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + } + + // Verify normal behavior + { + // Change back to normal + monitor_->changeHDDParams(55.0, 77.0, 0.95, 0.99); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("HDD Usage", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } +} + +TEST_F(HDDMonitorTestSuite, usageDfErrorTest) +{ + // Symlink df1 to df + fs::create_symlink(exe_dir_ + "/df1", df_); + + // Modify PATH temporarily + modifyPath(); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + 
rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + + ASSERT_TRUE(monitor_->findDiagStatus("HDD Usage", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "df error"); + ASSERT_TRUE(findValue(status, "HDD 0: df", value)); +} + +int main(int argc, char ** argv) +{ + argv_ = argv; + testing::InitGoogleTest(&argc, argv); + + return RUN_ALL_TESTS(); +} diff --git a/system/system_monitor/test/src/mem_monitor/free1.cpp b/system/system_monitor/test/src/mem_monitor/free1.cpp new file mode 100644 index 0000000000000..a6e2bb49c3093 --- /dev/null +++ b/system/system_monitor/test/src/mem_monitor/free1.cpp @@ -0,0 +1,20 @@ +// Copyright 2020 Autoware Foundation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** + * @file free1.cpp + * @brief dummy free executable to return error + */ + +int main(int argc, char ** argv) { return -1; } diff --git a/system/system_monitor/test/src/mem_monitor/test_mem_monitor.cpp b/system/system_monitor/test/src/mem_monitor/test_mem_monitor.cpp new file mode 100644 index 0000000000000..11ef07795ac68 --- /dev/null +++ b/system/system_monitor/test/src/mem_monitor/test_mem_monitor.cpp @@ -0,0 +1,280 @@ +// Copyright 2020 Tier IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "system_monitor/mem_monitor/mem_monitor.hpp" + +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace fs = boost::filesystem; +using DiagStatus = diagnostic_msgs::msg::DiagnosticStatus; + +// Command-line arguments saved by main(); argv_[0] is used to locate the test executable +char ** argv_; + +// MemMonitor subclass that lets the tests change the usage thresholds, force an update and inspect the last received diagnostics +class TestMemMonitor : public MemMonitor +{ + friend class MemMonitorTestSuite; + +public: + TestMemMonitor(const std::string & node_name, const rclcpp::NodeOptions & options) + : MemMonitor(node_name, options) + { + } + + // Subscription callback: keep a copy of the most recent diagnostics message + void diagCallback(const diagnostic_msgs::msg::DiagnosticArray::ConstSharedPtr diag_msg) + { + array_ = *diag_msg; + } + + void changeUsageWarn(float usage_warn) { usage_warn_ = usage_warn; } + void changeUsageError(float usage_error) { usage_error_ = usage_error; } + + void update() { updater_.force_update(); } + + // Return `name` with every occurrence of prefix_ removed + const std::string removePrefix(const std::string & name) + { + return boost::algorithm::erase_all_copy(name, prefix_); + } + + // Copy the status whose name (with prefix_ removed) equals `name` into `status`; returns false if none matches + bool findDiagStatus(const std::string & name, DiagStatus & status) // NOLINT + { + // Range-based loop avoids comparing a signed int index with the unsigned size() + for (const auto & entry : array_.status) { + if (removePrefix(entry.name) == name) { + status = entry; + return true; + } + } + return false; + } + +private: + diagnostic_msgs::msg::DiagnosticArray array_; + const std::string prefix_ = std::string(this->get_name()) + ": "; +}; + +class MemMonitorTestSuite : public ::testing::Test +{ +public: + MemMonitorTestSuite() + { + // Get directory of executable + const fs::path exe_path(argv_[0]); + exe_dir_ = exe_path.parent_path().generic_string(); + // Get dummy executable 
path + free_ = exe_dir_ + "/free"; + } + +protected: + std::unique_ptr monitor_; + rclcpp::Subscription::SharedPtr sub_; + std::string exe_dir_; + std::string free_; + + void SetUp() + { + using std::placeholders::_1; + rclcpp::init(0, nullptr); + rclcpp::NodeOptions node_options; + monitor_ = std::make_unique("test_mem_monitor", node_options); + sub_ = monitor_->create_subscription( + "/diagnostics", 1000, std::bind(&TestMemMonitor::diagCallback, monitor_.get(), _1)); + + // Remove dummy executable if exists + if (fs::exists(free_)) { + fs::remove(free_); + } + } + + void TearDown() + { + // Remove dummy executable if exists + if (fs::exists(free_)) { + fs::remove(free_); + } + rclcpp::shutdown(); + } + + bool findValue(const DiagStatus status, const std::string & key, std::string & value) // NOLINT + { + for (auto itr = status.values.begin(); itr != status.values.end(); ++itr) { + if (itr->key == key) { + value = itr->value; + return true; + } + } + return false; + } + + void modifyPath() + { + // Modify PATH temporarily + auto env = boost::this_process::environment(); + std::string new_path = env["PATH"].to_string(); + new_path.insert(0, fmt::format("{}:", exe_dir_)); + env["PATH"] = new_path; + } +}; + +TEST_F(MemMonitorTestSuite, usageWarnTest) +{ + // Verify normal behavior + { + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + ASSERT_TRUE(monitor_->findDiagStatus("Memory Usage", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } + + // Verify warning + { + // Change warning level + monitor_->changeUsageWarn(0.0); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("Memory Usage", status)); + 
ASSERT_EQ(status.level, DiagStatus::WARN); + } + + // Verify normal behavior + { + // Change back to normal + monitor_->changeUsageWarn(0.95); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("Memory Usage", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } +} + +TEST_F(MemMonitorTestSuite, usageErrorTest) +{ + // Verify normal behavior + { + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + ASSERT_TRUE(monitor_->findDiagStatus("Memory Usage", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } + + // Verify error + { + // Change error level + monitor_->changeUsageError(0.0); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("Memory Usage", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + } + + // Verify normal behavior + { + // Change back to normal + monitor_->changeUsageError(0.99); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + ASSERT_TRUE(monitor_->findDiagStatus("Memory Usage", status)); + ASSERT_EQ(status.level, DiagStatus::OK); + } +} + +TEST_F(MemMonitorTestSuite, usageFreeErrorTest) +{ + // Symlink free1 to free + fs::create_symlink(exe_dir_ + "/free1", free_); + + // Modify PATH temporarily + modifyPath(); + + // Publish topic + monitor_->update(); + + // Give time to publish + rclcpp::WallRate(2).sleep(); + 
rclcpp::spin_some(monitor_->get_node_base_interface()); + + // Verify + DiagStatus status; + std::string value; + + ASSERT_TRUE(monitor_->findDiagStatus("Memory Usage", status)); + ASSERT_EQ(status.level, DiagStatus::ERROR); + ASSERT_STREQ(status.message.c_str(), "free error"); + ASSERT_TRUE(findValue(status, "free", value)); +} + +int main(int argc, char ** argv) +{ + argv_ = argv; + testing::InitGoogleTest(&argc, argv); + + return RUN_ALL_TESTS(); +} diff --git a/system/system_monitor/test/src/net_monitor/test_net_monitor.cpp b/system/system_monitor/test/src/net_monitor/test_net_monitor.cpp new file mode 100644 index 0000000000000..34dc7adb9cf6a --- /dev/null +++ b/system/system_monitor/test/src/net_monitor/test_net_monitor.cpp @@ -0,0 +1,192 @@ +// Copyright 2020 Autoware Foundation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "system_monitor/net_monitor/net_monitor.hpp"
+
+#include <rclcpp/rclcpp.hpp>
+
+#include <boost/algorithm/string.hpp>
+#include <boost/filesystem.hpp>
+
+#include <gtest/gtest.h>
+
+#include <memory>
+#include <string>
+#include <vector>
+
+static constexpr const char * DOCKER_ENV = "/.dockerenv";
+
+namespace fs = boost::filesystem;
+using DiagStatus = diagnostic_msgs::msg::DiagnosticStatus;
+
+// Test double: lets the tests tweak NetMonitor's warning threshold and device list, and keeps
+// the most recent DiagnosticArray passed to diagCallback so that statuses can be inspected.
+class TestNetMonitor : public NetMonitor
+{
+  friend class NetMonitorTestSuite;
+
+public:
+  TestNetMonitor(const std::string & node_name, const rclcpp::NodeOptions & options)
+  : NetMonitor(node_name, options)
+  {
+  }
+
+  void diagCallback(const diagnostic_msgs::msg::DiagnosticArray::ConstSharedPtr diag_msg)
+  {
+    array_ = *diag_msg;
+  }
+
+  void changeUsageWarn(float usage_warn) { usage_warn_ = usage_warn; }
+
+  const std::vector<std::string> getDeviceParams() { return device_params_; }
+  void clearDeviceParams() { device_params_.clear(); }
+
+  void update() { updater_.force_update(); }
+
+  const std::string removePrefix(const std::string & name)
+  {
+    return boost::algorithm::erase_all_copy(name, prefix_);
+  }
+
+  bool findDiagStatus(const std::string & name, DiagStatus & status)  // NOLINT
+  {
+    for (const auto & diag : array_.status) {
+      if (removePrefix(diag.name) == name) {
+        status = diag;
+        return true;
+      }
+    }
+    return false;
+  }
+
+private:
+  diagnostic_msgs::msg::DiagnosticArray array_;
+  const std::string prefix_ = std::string(this->get_name()) + ": ";
+};
+
+// Fixture: initialises rclcpp, creates the monitor under test and routes /diagnostics to its
+// diagCallback before each test; shuts rclcpp down afterwards.
+class NetMonitorTestSuite : public ::testing::Test
+{
+protected:
+  std::unique_ptr<TestNetMonitor> monitor_;
+  rclcpp::Subscription<diagnostic_msgs::msg::DiagnosticArray>::SharedPtr sub_;
+
+  void SetUp() override
+  {
+    using std::placeholders::_1;
+    rclcpp::init(0, nullptr);
+    rclcpp::NodeOptions node_options;
+    monitor_ = std::make_unique<TestNetMonitor>("test_net_monitor", node_options);
+    sub_ = monitor_->create_subscription<diagnostic_msgs::msg::DiagnosticArray>(
+      "/diagnostics", 1000, std::bind(&TestNetMonitor::diagCallback, monitor_.get(), _1));
+  }
+
+  void TearDown() override { rclcpp::shutdown(); }
+
+  bool findValue(const DiagStatus & status, const std::string & key, std::string & value)  // NOLINT
+  {
+    for (const auto & kv : status.values) {
+      if (kv.key == key) {
+        value = kv.value;
+        return true;
+      }
+    }
+    return false;
+  }
+};
+
+TEST_F(NetMonitorTestSuite, usageWarnTest)
+{
+  // Verify normal behavior
+  {
+    // Publish topic
+    monitor_->update();
+
+    // Give time to publish
+    rclcpp::WallRate(2).sleep();
+    rclcpp::spin_some(monitor_->get_node_base_interface());
+
+    // Verify
+    DiagStatus status;
+    ASSERT_TRUE(monitor_->findDiagStatus("Network Usage", status));
+    ASSERT_EQ(status.level, DiagStatus::OK);
+  }
+
+  // Verify warning
+  {
+    // Change warning level
+    monitor_->changeUsageWarn(0.0);
+
+    // Publish topic
+    monitor_->update();
+
+    // Give time to publish
+    rclcpp::WallRate(2).sleep();
+    rclcpp::spin_some(monitor_->get_node_base_interface());
+
+    // Verify
+    DiagStatus status;
+    ASSERT_TRUE(monitor_->findDiagStatus("Network Usage", status));
+    // Skip test if process runs inside docker
+    // Don't know what interface should be monitored.
+    if (!fs::exists(DOCKER_ENV)) {
+      ASSERT_EQ(status.level, DiagStatus::WARN);
+    }
+  }
+
+  // Verify normal behavior
+  {
+    // Change back to normal
+    monitor_->changeUsageWarn(0.95);
+
+    // Publish topic
+    monitor_->update();
+
+    // Give time to publish
+    rclcpp::WallRate(2).sleep();
+    rclcpp::spin_some(monitor_->get_node_base_interface());
+
+    // Verify
+    DiagStatus status;
+    ASSERT_TRUE(monitor_->findDiagStatus("Network Usage", status));
+    ASSERT_EQ(status.level, DiagStatus::OK);
+  }
+}
+
+TEST_F(NetMonitorTestSuite, usageInvalidDeviceParameterTest)
+{
+  // Clear list
+  monitor_->clearDeviceParams();
+
+  // Publish topic
+  monitor_->update();
+
+  // Give time to publish
+  rclcpp::WallRate(2).sleep();
+  rclcpp::spin_some(monitor_->get_node_base_interface());
+
+  // Verify
+  DiagStatus status;
+  ASSERT_TRUE(monitor_->findDiagStatus("Network Usage", status));
+  ASSERT_EQ(status.level, DiagStatus::ERROR);
+  ASSERT_STREQ(status.message.c_str(), "invalid device parameter");
+}
+
+int main(int argc, char ** argv)
+{
+  testing::InitGoogleTest(&argc, argv);
+
+  return RUN_ALL_TESTS();
+}
diff --git a/system/system_monitor/test/src/ntp_monitor/ntpdate1.cpp b/system/system_monitor/test/src/ntp_monitor/ntpdate1.cpp
new file mode 100644
index 0000000000000..0f79345c1e6c0
--- /dev/null
+++ b/system/system_monitor/test/src/ntp_monitor/ntpdate1.cpp
@@ -0,0 +1,20 @@
+// Copyright 2020 Autoware Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+ * @file ntpdate1.cpp
+ * @brief dummy ntpdate executable to return error
+ */
+
+int main() { return -1; }  // takes no arguments: unused parameters fail under -Wextra -Werror
diff --git a/system/system_monitor/test/src/ntp_monitor/test_ntp_monitor.cpp b/system/system_monitor/test/src/ntp_monitor/test_ntp_monitor.cpp
new file mode 100644
index 0000000000000..0b0b98127eeee
--- /dev/null
+++ b/system/system_monitor/test/src/ntp_monitor/test_ntp_monitor.cpp
@@ -0,0 +1,303 @@
+// Copyright 2020 Tier IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "system_monitor/ntp_monitor/ntp_monitor.hpp"
+
+#include <rclcpp/rclcpp.hpp>
+
+#include <boost/algorithm/string.hpp>
+#include <boost/filesystem.hpp>
+#include <boost/process.hpp>
+
+#include <fmt/format.h>
+#include <gtest/gtest.h>
+
+#include <memory>
+#include <string>
+
+namespace fs = boost::filesystem;
+using DiagStatus = diagnostic_msgs::msg::DiagnosticStatus;
+
+char ** argv_;
+
+// Test double: lets the tests tweak NTPMonitor's offset thresholds and ntpdate flag, and keeps
+// the most recent DiagnosticArray passed to diagCallback so that statuses can be inspected.
+class TestNTPMonitor : public NTPMonitor
+{
+  friend class NTPMonitorTestSuite;
+
+public:
+  TestNTPMonitor(const std::string & node_name, const rclcpp::NodeOptions & options)
+  : NTPMonitor(node_name, options)
+  {
+  }
+
+  void diagCallback(const diagnostic_msgs::msg::DiagnosticArray::ConstSharedPtr diag_msg)
+  {
+    array_ = *diag_msg;
+  }
+
+  void changeOffsetWarn(float offset_warn) { offset_warn_ = offset_warn; }
+  void changeOffsetError(float offset_error) { offset_error_ = offset_error; }
+
+  void setNtpdateExists(bool ntpdate_exists) { ntpdate_exists_ = ntpdate_exists; }
+
+  void update() { updater_.force_update(); }
+
+  const std::string removePrefix(const std::string & name)
+  {
+    return boost::algorithm::erase_all_copy(name, prefix_);
+  }
+
+  bool findDiagStatus(const std::string & name, DiagStatus & status)  // NOLINT
+  {
+    for (const auto & diag : array_.status) {
+      if (removePrefix(diag.name) == name) {
+        status = diag;
+        return true;
+      }
+    }
+    return false;
+  }
+
+private:
+  diagnostic_msgs::msg::DiagnosticArray array_;
+  const std::string prefix_ = std::string(this->get_name()) + ": ";
+};
+
+// Fixture: initialises rclcpp, creates the monitor under test and routes /diagnostics to its
+// diagCallback before each test; the dummy ntpdate symlink is cleaned up around every test.
+class NTPMonitorTestSuite : public ::testing::Test
+{
+public:
+  NTPMonitorTestSuite()
+  {
+    // Get directory of executable
+    const fs::path exe_path(argv_[0]);
+    exe_dir_ = exe_path.parent_path().generic_string();
+    // Get dummy executable path
+    ntpdate_ = exe_dir_ + "/ntpdate";
+  }
+
+protected:
+  std::unique_ptr<TestNTPMonitor> monitor_;
+  rclcpp::Subscription<diagnostic_msgs::msg::DiagnosticArray>::SharedPtr sub_;
+  std::string exe_dir_;
+  std::string ntpdate_;
+
+  void SetUp() override
+  {
+    using std::placeholders::_1;
+    rclcpp::init(0, nullptr);
+    rclcpp::NodeOptions node_options;
+    monitor_ = std::make_unique<TestNTPMonitor>("test_ntp_monitor", node_options);
+    sub_ = monitor_->create_subscription<diagnostic_msgs::msg::DiagnosticArray>(
+      "/diagnostics", 1000, std::bind(&TestNTPMonitor::diagCallback, monitor_.get(), _1));
+
+    // Remove dummy executable if exists; fs::remove() ignores a missing path and, unlike
+    // fs::exists(), does not follow symlinks, so a dangling link is removed as well
+    fs::remove(ntpdate_);
+  }
+
+  void TearDown() override
+  {
+    // Remove dummy executable if exists (also when the symlink is dangling)
+    fs::remove(ntpdate_);
+    rclcpp::shutdown();
+  }
+
+  bool findValue(const DiagStatus & status, const std::string & key, std::string & value)  // NOLINT
+  {
+    for (const auto & kv : status.values) {
+      if (kv.key == key) {
+        value = kv.value;
+        return true;
+      }
+    }
+    return false;
+  }
+
+  void modifyPath()
+  {
+    // Prepend the directory holding the dummy executables to PATH so that they shadow the
+    // real commands; the change is not undone and lasts for the rest of the test process
+    auto env = boost::this_process::environment();
+    std::string new_path = env["PATH"].to_string();
+    new_path.insert(0, fmt::format("{}:", exe_dir_));
+    env["PATH"] = new_path;
+  }
+};
+
+TEST_F(NTPMonitorTestSuite, offsetWarnTest)
+{
+  // Verify normal behavior
+  {
+    // Publish topic
+    monitor_->update();
+
+    // Give time to publish
+    rclcpp::WallRate(2).sleep();
+    rclcpp::spin_some(monitor_->get_node_base_interface());
+
+    // Verify
+    DiagStatus status;
+    ASSERT_TRUE(monitor_->findDiagStatus("NTP Offset", status));
+    ASSERT_EQ(status.level, DiagStatus::OK);
+  }
+
+  // Verify warning
+  {
+    // Change warning level
+    monitor_->changeOffsetWarn(0.0);
+
+    // Publish topic
+    monitor_->update();
+
+    // Give time to publish
+    rclcpp::WallRate(2).sleep();
+    rclcpp::spin_some(monitor_->get_node_base_interface());
+
+    // Verify
+    DiagStatus status;
+    ASSERT_TRUE(monitor_->findDiagStatus("NTP Offset", status));
+    ASSERT_EQ(status.level, DiagStatus::WARN);
+  }
+
+  // Verify normal behavior
+  {
+    // Change back to normal
+    monitor_->changeOffsetWarn(0.05);
+
+    // Publish topic
+    monitor_->update();
+
+    // Give time to publish
+    rclcpp::WallRate(2).sleep();
+    rclcpp::spin_some(monitor_->get_node_base_interface());
+
+    // Verify
+    DiagStatus status;
+    ASSERT_TRUE(monitor_->findDiagStatus("NTP Offset", status));
+    ASSERT_EQ(status.level, DiagStatus::OK);
+  }
+}
+
+TEST_F(NTPMonitorTestSuite, offsetErrorTest)
+{
+  // Verify normal behavior
+  {
+    // Publish topic
+    monitor_->update();
+
+    // Give time to publish
+    rclcpp::WallRate(2).sleep();
+    rclcpp::spin_some(monitor_->get_node_base_interface());
+
+    // Verify
+    DiagStatus status;
+    ASSERT_TRUE(monitor_->findDiagStatus("NTP Offset", status));
+    ASSERT_EQ(status.level, DiagStatus::OK);
+  }
+
+  // Verify error
+  {
+    // Change error level
+    monitor_->changeOffsetError(0.0);
+
+    // Publish topic
+    monitor_->update();
+
+    // Give time to publish
+    rclcpp::WallRate(2).sleep();
+    rclcpp::spin_some(monitor_->get_node_base_interface());
+
+    // Verify
+    DiagStatus status;
+    ASSERT_TRUE(monitor_->findDiagStatus("NTP Offset", status));
+    ASSERT_EQ(status.level, DiagStatus::ERROR);
+  }
+
+  // Verify normal behavior
+  {
+    // Change back to normal
+    monitor_->changeOffsetError(5.0);
+
+    // Publish topic
+    monitor_->update();
+
+    // Give time to publish
+    rclcpp::WallRate(2).sleep();
+    rclcpp::spin_some(monitor_->get_node_base_interface());
+
+    // Verify
+    DiagStatus status;
+    ASSERT_TRUE(monitor_->findDiagStatus("NTP Offset", status));
+    ASSERT_EQ(status.level, DiagStatus::OK);
+  }
+}
+
+TEST_F(NTPMonitorTestSuite, offsetNtpdateNotFoundTest)
+{
+  // Set flag false
+  monitor_->setNtpdateExists(false);
+
+  // Publish topic
+  monitor_->update();
+
+  // Give time to publish
+  rclcpp::WallRate(2).sleep();
+  rclcpp::spin_some(monitor_->get_node_base_interface());
+
+  // Verify
+  DiagStatus status;
+  std::string value;
+  ASSERT_TRUE(monitor_->findDiagStatus("NTP Offset", status));
+  ASSERT_EQ(status.level, DiagStatus::ERROR);
+  ASSERT_STREQ(status.message.c_str(), "ntpdate error");
+  ASSERT_TRUE(findValue(status, "ntpdate", value));
+  ASSERT_STREQ(
+    value.c_str(),
+    "Command 'ntpdate' not found, but can be installed with: sudo apt install ntpdate");
+}
+
+// A failing ntpdate (ntpdate1 symlinked in front of the real one on PATH) must yield "ntpdate error"
+TEST_F(NTPMonitorTestSuite, offsetNtpdateErrorTest)
+{
+  // Symlink ntpdate1 to ntpdate
+  fs::create_symlink(exe_dir_ + "/ntpdate1", ntpdate_);
+
+  // Modify PATH temporarily
+  modifyPath();
+
+  // Publish topic
+  monitor_->update();
+
+  // Give time to publish
+  rclcpp::WallRate(2).sleep();
+  rclcpp::spin_some(monitor_->get_node_base_interface());
+
+  // Verify
+  DiagStatus status;
+  ASSERT_TRUE(monitor_->findDiagStatus("NTP Offset", status));
+  ASSERT_EQ(status.level, DiagStatus::ERROR);
+  ASSERT_STREQ(status.message.c_str(), "ntpdate error");
+}
+
+int main(int argc, char ** argv)
+{
+  argv_ = argv;
+  testing::InitGoogleTest(&argc, argv);
+
+  return RUN_ALL_TESTS();
+}
diff --git a/system/system_monitor/test/src/process_monitor/echo1.cpp b/system/system_monitor/test/src/process_monitor/echo1.cpp
new file mode 100644
index 0000000000000..e029bc48c6ed2
--- /dev/null
+++ b/system/system_monitor/test/src/process_monitor/echo1.cpp
@@ -0,0 +1,20 @@
+// Copyright 2020 Autoware Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+ * @file echo1.cpp
+ * @brief dummy echo executable to return error
+ */
+
+int main() { return -1; }  // takes no arguments: unused parameters fail under -Wextra -Werror
diff --git a/system/system_monitor/test/src/process_monitor/sed1.cpp b/system/system_monitor/test/src/process_monitor/sed1.cpp
new file mode 100644
index 0000000000000..5e7d953cf244a
--- /dev/null
+++ b/system/system_monitor/test/src/process_monitor/sed1.cpp
@@ -0,0 +1,20 @@
+// Copyright 2020 Autoware Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+ * @file sed1.cpp
+ * @brief dummy sed executable to return error
+ */
+
+int main() { return -1; }  // takes no arguments: unused parameters fail under -Wextra -Werror
diff --git a/system/system_monitor/test/src/process_monitor/sort1.cpp b/system/system_monitor/test/src/process_monitor/sort1.cpp
new file mode 100644
index 0000000000000..ba48b612f05b7
--- /dev/null
+++ b/system/system_monitor/test/src/process_monitor/sort1.cpp
@@ -0,0 +1,20 @@
+// Copyright 2020 Autoware Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+ * @file sort1.cpp
+ * @brief dummy sort executable to return error
+ */
+
+int main() { return -1; }  // takes no arguments: unused parameters fail under -Wextra -Werror
diff --git a/system/system_monitor/test/src/process_monitor/test_process_monitor.cpp b/system/system_monitor/test/src/process_monitor/test_process_monitor.cpp
new file mode 100644
index 0000000000000..d1cecf9cecd24
--- /dev/null
+++ b/system/system_monitor/test/src/process_monitor/test_process_monitor.cpp
@@ -0,0 +1,379 @@
+// Copyright 2020 Tier IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "system_monitor/process_monitor/process_monitor.hpp"
+
+#include <rclcpp/rclcpp.hpp>
+
+#include <boost/algorithm/string.hpp>
+#include <boost/filesystem.hpp>
+#include <boost/process.hpp>
+
+#include <fmt/format.h>
+#include <gtest/gtest.h>
+
+#include <memory>
+#include <string>
+
+namespace bp = boost::process;
+namespace fs = boost::filesystem;
+using DiagStatus = diagnostic_msgs::msg::DiagnosticStatus;
+
+char ** argv_;
+
+// Test double: exposes the ProcessMonitor internals needed by the tests and keeps the most
+// recent DiagnosticArray passed to diagCallback so that individual statuses can be inspected.
+class TestProcessMonitor : public ProcessMonitor
+{
+  friend class ProcessMonitorTestSuite;
+
+public:
+  TestProcessMonitor(const std::string & node_name, const rclcpp::NodeOptions & options)
+  : ProcessMonitor(node_name, options)
+  {
+  }
+
+  // Subscription callback: remember the latest diagnostics message
+  void diagCallback(const diagnostic_msgs::msg::DiagnosticArray::ConstSharedPtr diag_msg)
+  {
+    array_ = *diag_msg;
+  }
+
+  // Value of the monitor's num_of_procs_ member
+  int getNumOfProcs() const { return num_of_procs_; }
+
+  // Ask the updater for an immediate update (force_update)
+  void update() { updater_.force_update(); }
+
+  // Return name with every occurrence of the "<node name>: " prefix erased
+  const std::string removePrefix(const std::string & name)
+  {
+    return boost::algorithm::erase_all_copy(name, prefix_);
+  }
+
+  // Copy the status called name (prefix stripped) from the last array; false if there is none
+  bool findDiagStatus(const std::string & name, DiagStatus & status)  // NOLINT
+  {
+    for (const auto & diag : array_.status) {
+      if (removePrefix(diag.name) == name) {
+        status = diag;
+        return true;
+      }
+    }
+    return false;
+  }
+
+private:
+  diagnostic_msgs::msg::DiagnosticArray array_;
+  const std::string prefix_ = std::string(this->get_name()) + ": ";
+};
+
+// Fixture: initialises rclcpp, creates the monitor under test and routes /diagnostics to its
+// diagCallback before each test. Dummy executables are symlinked next to the test binary
+// (directory taken from argv_[0]) and cleaned up around every test.
+class ProcessMonitorTestSuite : public ::testing::Test
+{
+public:
+  ProcessMonitorTestSuite()
+  {
+    // Get directory of executable
+    const fs::path exe_path(argv_[0]);
+    exe_dir_ = exe_path.parent_path().generic_string();
+    // Get dummy executable path
+    top_ = exe_dir_ + "/top";
+    echo_ = exe_dir_ + "/echo";
+    sed_ = exe_dir_ + "/sed";
+    sort_ = exe_dir_ + "/sort";
+  }
+
+protected:
+  std::unique_ptr<TestProcessMonitor> monitor_;
+  rclcpp::Subscription<diagnostic_msgs::msg::DiagnosticArray>::SharedPtr sub_;
+  std::string exe_dir_;
+  std::string top_;
+  std::string echo_;
+  std::string sed_;
+  std::string sort_;
+
+  // Start rclcpp, build the monitor and make sure no stale dummy executable is in the way
+  void SetUp() override
+  {
+    using std::placeholders::_1;
+    rclcpp::init(0, nullptr);
+    rclcpp::NodeOptions node_options;
+    monitor_ = std::make_unique<TestProcessMonitor>("test_process_monitor", node_options);
+    sub_ = monitor_->create_subscription<diagnostic_msgs::msg::DiagnosticArray>(
+      "/diagnostics", 1000, std::bind(&TestProcessMonitor::diagCallback, monitor_.get(), _1));
+
+    // Remove dummy executables if they exist
+    removeDummyExecutables();
+  }
+
+  // Drop the symlinks created by the test, then shut rclcpp down
+  void TearDown() override
+  {
+    // Remove dummy executables if they exist
+    removeDummyExecutables();
+    rclcpp::shutdown();
+  }
+
+  void removeDummyExecutables()
+  {
+    // fs::remove() ignores a missing path and, unlike fs::exists(), does not follow symlinks,
+    // so a dangling link left behind by an earlier run is removed as well
+    for (const auto & dummy : {top_, echo_, sed_, sort_}) {
+      fs::remove(dummy);
+    }
+  }
+
+  // Copy the value stored under key into value; returns false if status has no such key
+  bool findValue(const DiagStatus & status, const std::string & key, std::string & value)  // NOLINT
+  {
+    for (const auto & kv : status.values) {
+      if (kv.key == key) {
+        value = kv.value;
+        return true;
+      }
+    }
+    return false;
+  }
+
+  void modifyPath()
+  {
+    // Prepend the directory holding the dummy executables to PATH so that they shadow the
+    // real commands; the change is not undone and lasts for the rest of the test process
+    auto env = boost::this_process::environment();
+    std::string new_path = env["PATH"].to_string();
+    new_path.insert(0, fmt::format("{}:", exe_dir_));
+    env["PATH"] = new_path;
+  }
+};
+
+TEST_F(ProcessMonitorTestSuite, tasksSummaryTest)
+{
+  // Publish topic
+  monitor_->update();
+
+  // Give time to publish
+  rclcpp::WallRate(2).sleep();
+  rclcpp::spin_some(monitor_->get_node_base_interface());
+
+  // Verify
+  DiagStatus status;
+  ASSERT_TRUE(monitor_->findDiagStatus("Tasks Summary", status));
+  ASSERT_EQ(status.level, DiagStatus::OK);
+}
+
+TEST_F(ProcessMonitorTestSuite, highLoadProcTest)
+{
+  // Publish topic
+  monitor_->update();
+
+  // Give time to publish
+  rclcpp::WallRate(2).sleep();
+  rclcpp::spin_some(monitor_->get_node_base_interface());
+
+  // Verify
+  DiagStatus status;
+
+  for (int i = 0; i < monitor_->getNumOfProcs(); ++i) {
+    ASSERT_TRUE(monitor_->findDiagStatus(fmt::format("High-load Proc[{}]", i), status));
+    ASSERT_EQ(status.level, DiagStatus::OK);
+  }
+}
+
+TEST_F(ProcessMonitorTestSuite, highMemProcTest)
+{
+  // Publish topic
+  monitor_->update();
+
+  // Give time to publish
+  rclcpp::WallRate(2).sleep();
+  rclcpp::spin_some(monitor_->get_node_base_interface());
+
+  // Verify
+  DiagStatus status;
+
+  for (int i = 0; i < monitor_->getNumOfProcs(); ++i) {
+    ASSERT_TRUE(monitor_->findDiagStatus(fmt::format("High-mem Proc[{}]", i), status));
+    ASSERT_EQ(status.level, DiagStatus::OK);
+  }
+}
+
+// A top that exits with an error must be reported as "top error" by every status
+TEST_F(ProcessMonitorTestSuite, topErrorTest)
+{
+  // Symlink top1 to top
+  fs::create_symlink(exe_dir_ + "/top1", top_);
+
+  // Modify PATH temporarily
+  modifyPath();
+
+  // Publish topic
+  monitor_->update();
+
+  // Give time to publish
+  rclcpp::WallRate(2).sleep();
+  rclcpp::spin_some(monitor_->get_node_base_interface());
+
+  // Verify
+  DiagStatus status;
+  std::string value;
+
+  ASSERT_TRUE(monitor_->findDiagStatus("Tasks Summary", status));
+  ASSERT_EQ(status.level, DiagStatus::ERROR);
+  ASSERT_STREQ(status.message.c_str(), "top error");
+  ASSERT_TRUE(findValue(status, "top", value));
+  ASSERT_STREQ(value.c_str(), "");
+
+  for (int i = 0; i < monitor_->getNumOfProcs(); ++i) {
+    ASSERT_TRUE(monitor_->findDiagStatus(fmt::format("High-load Proc[{}]", i), status));
+    ASSERT_EQ(status.level, DiagStatus::ERROR);
+    ASSERT_STREQ(status.message.c_str(), "top error");
+
+    ASSERT_TRUE(monitor_->findDiagStatus(fmt::format("High-mem Proc[{}]", i), status));
+    ASSERT_EQ(status.level, DiagStatus::ERROR);
+    ASSERT_STREQ(status.message.c_str(), "top error");
+  }
+}
+
+// A top that prints nothing must be reported as "matching pattern not found"
+TEST_F(ProcessMonitorTestSuite, matchingPatternNotFoundTest)
+{
+  // Symlink top2 to top
+  fs::create_symlink(exe_dir_ + "/top2", top_);
+
+  // Modify PATH temporarily
+  modifyPath();
+
+  // Publish topic
+  monitor_->update();
+
+  // Give time to publish
+  rclcpp::WallRate(2).sleep();
+  rclcpp::spin_some(monitor_->get_node_base_interface());
+
+  // Verify
+  DiagStatus status;
+
+  ASSERT_TRUE(monitor_->findDiagStatus("Tasks Summary", status));
+  ASSERT_EQ(status.level, DiagStatus::ERROR);
+  ASSERT_STREQ(status.message.c_str(), "matching pattern not found");
+}
+
+// A top that prints only "Tasks:" must be reported as "invalid format"
+TEST_F(ProcessMonitorTestSuite, invalidFormatTest)
+{
+  // Symlink top3 to top
+  fs::create_symlink(exe_dir_ + "/top3", top_);
+
+  // Modify PATH temporarily
+  modifyPath();
+
+  // Publish topic
+  monitor_->update();
+
+  // Give time to publish
+  rclcpp::WallRate(2).sleep();
+  rclcpp::spin_some(monitor_->get_node_base_interface());
+
+  // Verify
+  DiagStatus status;
+
+  ASSERT_TRUE(monitor_->findDiagStatus("Tasks Summary", status));
+  ASSERT_EQ(status.level, DiagStatus::ERROR);
+  ASSERT_STREQ(status.message.c_str(), "invalid format");
+}
+
+// A failing echo must be reported as "echo error" by "Tasks Summary"
+TEST_F(ProcessMonitorTestSuite, echoErrorTest)
+{
+  // Symlink echo1 to echo
+  fs::create_symlink(exe_dir_ + "/echo1", echo_);
+
+  // Modify PATH temporarily
+  modifyPath();
+
+  // Publish topic
+  monitor_->update();
+
+  // Give time to publish
+  rclcpp::WallRate(2).sleep();
+  rclcpp::spin_some(monitor_->get_node_base_interface());
+
+  // Verify
+  DiagStatus status;
+
+  ASSERT_TRUE(monitor_->findDiagStatus("Tasks Summary", status));
+  ASSERT_EQ(status.level, DiagStatus::ERROR);
+  ASSERT_STREQ(status.message.c_str(), "echo error");
+}
+
+// A failing sed must be reported as "sed error" by "Tasks Summary"
+TEST_F(ProcessMonitorTestSuite, sedErrorTest)
+{
+  // Symlink sed1 to sed
+  fs::create_symlink(exe_dir_ + "/sed1", sed_);
+
+  // Modify PATH temporarily
+  modifyPath();
+
+  // Publish topic
+  monitor_->update();
+
+  // Give time to publish
+  rclcpp::WallRate(2).sleep();
+  rclcpp::spin_some(monitor_->get_node_base_interface());
+
+  // Verify
+  DiagStatus status;
+
+  ASSERT_TRUE(monitor_->findDiagStatus("Tasks Summary", status));
+  ASSERT_EQ(status.level, DiagStatus::ERROR);
+  ASSERT_STREQ(status.message.c_str(), "sed error");
+}
+
+// A failing sort must be reported as "sort error" by every "High-mem Proc[i]" status
+TEST_F(ProcessMonitorTestSuite, sortErrorTest)
+{
+  // Symlink sort1 to sort
+  fs::create_symlink(exe_dir_ + "/sort1", sort_);
+
+  // Modify PATH temporarily
+  modifyPath();
+
+  // Publish topic
+  monitor_->update();
+
+  // Give time to publish
+  rclcpp::WallRate(2).sleep();
+  rclcpp::spin_some(monitor_->get_node_base_interface());
+
+  // Verify
+  DiagStatus status;
+
+  for (int i = 0; i < monitor_->getNumOfProcs(); ++i) {
+    ASSERT_TRUE(monitor_->findDiagStatus(fmt::format("High-mem Proc[{}]", i), status));
+    ASSERT_EQ(status.level, DiagStatus::ERROR);
+    ASSERT_STREQ(status.message.c_str(), "sort error");
+  }
+}
+
+int main(int argc, char ** argv)
+{
+  argv_ = argv;
+  testing::InitGoogleTest(&argc, argv);
+
+  return RUN_ALL_TESTS();
+}
diff --git a/system/system_monitor/test/src/process_monitor/top1.cpp b/system/system_monitor/test/src/process_monitor/top1.cpp
new file mode 100644
index 0000000000000..56f9243122f23
--- /dev/null
+++ b/system/system_monitor/test/src/process_monitor/top1.cpp
@@ -0,0 +1,20 @@
+// Copyright 2020 Autoware Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+ * @file top1.cpp
+ * @brief dummy top executable to return error
+ */
+
+int main() { return -1; }  // takes no arguments: unused parameters fail under -Wextra -Werror
diff --git a/system/system_monitor/test/src/process_monitor/top2.cpp b/system/system_monitor/test/src/process_monitor/top2.cpp
new file mode 100644
index 0000000000000..c759ed2c21441
--- /dev/null
+++ b/system/system_monitor/test/src/process_monitor/top2.cpp
@@ -0,0 +1,20 @@
+// Copyright 2020 Autoware Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+ * @file top2.cpp
+ * @brief dummy top executable to provide nothing
+ */
+
+int main() { return 0; }  // succeeds without printing anything; needs no headers or arguments
diff --git a/system/system_monitor/test/src/process_monitor/top3.cpp b/system/system_monitor/test/src/process_monitor/top3.cpp
new file mode 100644
index 0000000000000..2725ee2676064
--- /dev/null
+++ b/system/system_monitor/test/src/process_monitor/top3.cpp
@@ -0,0 +1,26 @@
+// Copyright 2020 Autoware Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+ * @file top3.cpp
+ * @brief dummy top executable to provide invalid output
+ */
+
+#include <cstdio>
+
+int main()
+{
+  std::printf("Tasks:");  // only the "Tasks:" label, with no counts after it
+  return 0;
+}