From 1cc2a07df328aff0f0fbc917349bea5144b376aa Mon Sep 17 00:00:00 2001 From: v-nakayama7440-esol <97144416+v-nakayama7440-esol@users.noreply.github.com> Date: Tue, 20 Sep 2022 21:23:58 +0900 Subject: [PATCH] feat(system_monitor): add IP packet reassembles failed monitoring to net_monitor (#1427) * feat(system_monitor): add IP packet reassembles failed monitoring to net_monitor Signed-off-by: v-nakayama7440-esol * fix build errors caused by merge mistakes Signed-off-by: v-nakayama7440-esol * fix(system_monitor): chang word Reasm and fix deep nesting Signed-off-by: v-nakayama7440-esol * fix(system_monitor): fix deep nesting Signed-off-by: v-nakayama7440-esol * fix(system_monitor): lightweight /proc/net/snmp reading Signed-off-by: v-nakayama7440-esol * fix(system_monitor): fix index variable type to unsigned, add log output, and make index evaluation expression easier to understand Signed-off-by: v-nakayama7440-esol * fix(system_monitor): remove unnecessary static_cast Signed-off-by: v-nakayama7440-esol * fix(system_monitor): typo fix Signed-off-by: v-nakayama7440-esol Signed-off-by: v-nakayama7440-esol Co-authored-by: ito-san <57388357+ito-san@users.noreply.github.com> --- .../system_monitor/net_monitor.param.yaml | 2 + .../diagnostic_aggregator/system.param.yaml | 6 + system/system_monitor/README.md | 47 +++--- .../config/net_monitor.param.yaml | 2 + system/system_monitor/docs/ros_parameters.md | 14 +- .../system_monitor/docs/topics_net_monitor.md | 18 +++ .../net_monitor/net_monitor.hpp | 32 ++++ .../src/net_monitor/net_monitor.cpp | 143 +++++++++++++++++- 8 files changed, 234 insertions(+), 30 deletions(-) diff --git a/launch/tier4_system_launch/config/system_monitor/net_monitor.param.yaml b/launch/tier4_system_launch/config/system_monitor/net_monitor.param.yaml index e91aa4ca3fbb3..d72b8d1334c17 100644 --- a/launch/tier4_system_launch/config/system_monitor/net_monitor.param.yaml +++ b/launch/tier4_system_launch/config/system_monitor/net_monitor.param.yaml @@ -5,3 +5,5 @@ monitor_program: "greengrass" crc_error_check_duration: 1 crc_error_count_threshold: 1 + reassembles_failed_check_duration: 1 + reassembles_failed_check_count: 1 diff --git a/system/system_error_monitor/config/diagnostic_aggregator/system.param.yaml b/system/system_error_monitor/config/diagnostic_aggregator/system.param.yaml index 68ef4b47552c8..8cf97cc95553c 100644 --- a/system/system_error_monitor/config/diagnostic_aggregator/system.param.yaml +++ b/system/system_error_monitor/config/diagnostic_aggregator/system.param.yaml @@ -134,6 +134,12 @@ contains: [": Network CRC Error"] timeout: 3.0 + ip_packet_reassembles_failed: + type: diagnostic_aggregator/GenericAnalyzer + path: ip_packet_reassembles_failed + contains: [": IP Packet Reassembles Failed"] + timeout: 3.0 + storage: type: diagnostic_aggregator/AnalyzerGroup path: storage diff --git a/system/system_monitor/README.md b/system/system_monitor/README.md index 1eeffa58154dd..5c786e9762875 100644 --- a/system/system_monitor/README.md +++ b/system/system_monitor/README.md @@ -53,29 +53,30 @@ Every topic is published in 1 minute interval. [Usage] ✓:Supported, -:Not supported -| Node | Message | Intel | arm64(tegra) | arm64(raspi) | Notes | -| --------------- | ---------------------- | :---: | :----------: | :----------: | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| CPU Monitor | CPU Temperature | ✓ | ✓ | ✓ | | -| | CPU Usage | ✓ | ✓ | ✓ | | -| | CPU Load Average | ✓ | ✓ | ✓ | | -| | CPU Thermal Throttling | ✓ | - | ✓ | | -| | CPU Frequency | ✓ | ✓ | ✓ | Notification of frequency only, normally error not generated. | -| HDD Monitor | HDD Temperature | ✓ | ✓ | ✓ | | -| | HDD PowerOnHours | ✓ | ✓ | ✓ | | -| | HDD TotalDataWritten | ✓ | ✓ | ✓ | | -| | HDD Usage | ✓ | ✓ | ✓ | | -| Memory Monitor | Memory Usage | ✓ | ✓ | ✓ | | -| Net Monitor | Network Usage | ✓ | ✓ | ✓ | | -| | Network CRC Error | ✓ | ✓ | ✓ | Warning occurs when the number of CRC errors in the period reaches the threshold value. The number of CRC errors that occur is the same as the value that can be confirmed with the ip command. | -| NTP Monitor | NTP Offset | ✓ | ✓ | ✓ | | -| Process Monitor | Tasks Summary | ✓ | ✓ | ✓ | | -| | High-load Proc[0-9] | ✓ | ✓ | ✓ | | -| | High-mem Proc[0-9] | ✓ | ✓ | ✓ | | -| GPU Monitor | GPU Temperature | ✓ | ✓ | - | | -| | GPU Usage | ✓ | ✓ | - | | -| | GPU Memory Usage | ✓ | - | - | | -| | GPU Thermal Throttling | ✓ | - | - | | -| | GPU Frequency | ✓ | ✓ | - | For Intel platform, monitor whether current GPU clock is supported by the GPU. | +| Node | Message | Intel | arm64(tegra) | arm64(raspi) | Notes | +| --------------- | ---------------------------- | :---: | :----------: | :----------: | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| CPU Monitor | CPU Temperature | ✓ | ✓ | ✓ | | +| | CPU Usage | ✓ | ✓ | ✓ | | +| | CPU Load Average | ✓ | ✓ | ✓ | | +| | CPU Thermal Throttling | ✓ | - | ✓ | | +| | CPU Frequency | ✓ | ✓ | ✓ | Notification of frequency only, normally error not generated. | +| HDD Monitor | HDD Temperature | ✓ | ✓ | ✓ | | +| | HDD PowerOnHours | ✓ | ✓ | ✓ | | +| | HDD TotalDataWritten | ✓ | ✓ | ✓ | | +| | HDD Usage | ✓ | ✓ | ✓ | | +| Memory Monitor | Memory Usage | ✓ | ✓ | ✓ | | +| Net Monitor | Network Usage | ✓ | ✓ | ✓ | | +| | Network CRC Error | ✓ | ✓ | ✓ | Warning occurs when the number of CRC errors in the period reaches the threshold value. The number of CRC errors that occur is the same as the value that can be confirmed with the ip command. | +| | IP Packet Reassembles Failed | ✓ | ✓ | ✓ | | +| NTP Monitor | NTP Offset | ✓ | ✓ | ✓ | | +| Process Monitor | Tasks Summary | ✓ | ✓ | ✓ | | +| | High-load Proc[0-9] | ✓ | ✓ | ✓ | | +| | High-mem Proc[0-9] | ✓ | ✓ | ✓ | | +| GPU Monitor | GPU Temperature | ✓ | ✓ | - | | +| | GPU Usage | ✓ | ✓ | - | | +| | GPU Memory Usage | ✓ | - | - | | +| | GPU Thermal Throttling | ✓ | - | - | | +| | GPU Frequency | ✓ | ✓ | - | For Intel platform, monitor whether current GPU clock is supported by the GPU. | ## ROS parameters diff --git a/system/system_monitor/config/net_monitor.param.yaml b/system/system_monitor/config/net_monitor.param.yaml index 686ee349b0765..cb7e1b48380a3 100644 --- a/system/system_monitor/config/net_monitor.param.yaml +++ b/system/system_monitor/config/net_monitor.param.yaml @@ -5,3 +5,5 @@ monitor_program: "greengrass" crc_error_check_duration: 1 crc_error_count_threshold: 1 + reassembles_failed_check_duration: 1 + reassembles_failed_check_count: 1 diff --git a/system/system_monitor/docs/ros_parameters.md b/system/system_monitor/docs/ros_parameters.md index 044c1eb10a5d0..64ebe5e6a64e3 100644 --- a/system/system_monitor/docs/ros_parameters.md +++ b/system/system_monitor/docs/ros_parameters.md @@ -53,12 +53,14 @@ mem_monitor: net_monitor: -| Name | Type | Unit | Default | Notes | -| :------------------------ | :----------: | :-----: | :-----: | :-------------------------------------------------------------------------------------------------------------- | -| devices | list[string] | n/a | none | The name of network interface to monitor. (e.g. eth0, \* for all network interfaces) | -| usage_warn | float | %(1e-2) | 0.95 | Generates warning when network usage reaches a specified value or higher. | -| crc_error_check_duration | int | sec | 1 | CRC error check duration. | -| crc_error_count_threshold | int | n/a | 1 | Generates warning when count of CRC errors during CRC error check duration reaches a specified value or higher. | +| Name | Type | Unit | Default | Notes | +| :-------------------------------- | :----------: | :-----: | :-----: | :--------------------------------------------------------------------------------------------------------------------------------------------------- | +| devices | list[string] | n/a | none | The name of network interface to monitor. (e.g. eth0, \* for all network interfaces) | +| usage_warn | float | %(1e-2) | 0.95 | Generates warning when network usage reaches a specified value or higher. | +| crc_error_check_duration | int | sec | 1 | CRC error check duration. | +| crc_error_count_threshold | int | n/a | 1 | Generates warning when count of CRC errors during CRC error check duration reaches a specified value or higher. | +| reassembles_failed_check_duration | int | sec | 1 | IP packet reassembles failed check duration. | +| reassembles_failed_check_count | int | n/a | 1 | Generates warning when count of IP packet reassembles failed during IP packet reassembles failed check duration reaches a specified value or higher. | ## NTP Monitor diff --git a/system/system_monitor/docs/topics_net_monitor.md b/system/system_monitor/docs/topics_net_monitor.md index 261cede53de21..d223b359cdb08 100644 --- a/system/system_monitor/docs/topics_net_monitor.md +++ b/system/system_monitor/docs/topics_net_monitor.md @@ -81,3 +81,21 @@ | Network [0-9]: interface name | wlp82s0 | | Network [0-9]: total rx_crc_errors | 0 | | Network [0-9]: rx_crc_errors per unit time | 0 | + +## IP Packet Reassembles Failed + +/diagnostics/net_monitor: IP Packet Reassembles Failed + +[summary] + +| level | message | +| ----- | ------------------ | +| OK | OK | +| WARN | reassembles failed | + +[values] + +| key | value (example) | +| --------------------------------------- | --------------- | +| total packet reassembles failed | 0 | +| packet reassembles failed per unit time | 0 | diff --git a/system/system_monitor/include/system_monitor/net_monitor/net_monitor.hpp b/system/system_monitor/include/system_monitor/net_monitor/net_monitor.hpp index 53f3e1250c9f0..3ddb078fa00e1 100644 --- a/system/system_monitor/include/system_monitor/net_monitor/net_monitor.hpp +++ b/system/system_monitor/include/system_monitor/net_monitor/net_monitor.hpp @@ -91,6 +91,15 @@ class NetMonitor : public rclcpp::Node void checkCrcError( diagnostic_updater::DiagnosticStatusWrapper & stat); // NOLINT(runtime/references) + /** + * @brief check IP packet reassembles failed + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference + * to pass diagnostic message updated in this function to diagnostic publish calls. + */ + void checkReassemblesFailed( + diagnostic_updater::DiagnosticStatusWrapper & stat); // NOLINT(runtime/references) + /** * @brief get wireless speed * @param [in] ifa_name interface name @@ -168,6 +177,18 @@ class NetMonitor : public rclcpp::Node const NetworkInfo & net_info, int index, diagnostic_updater::DiagnosticStatusWrapper & stat, std::string & error_str); + /** + * @brief search column index of IP packet reassembles failed in /proc/net/snmp + */ + void searchReassemblesFailedColumnIndex(); + + /** + * @brief get IP packet reassembles failed + * @param [out] reassembles_failed IP packet reassembles failed + * @return execution result + */ + bool getReassemblesFailed(uint64_t & reassembles_failed); + diagnostic_updater::Updater updater_; //!< @brief Updater class which advertises to /diagnostics rclcpp::TimerBase::SharedPtr timer_; //!< @brief timer to get Network information @@ -191,11 +212,22 @@ class NetMonitor : public rclcpp::Node } crc_errors; std::map crc_errors_; //!< @brief list of CRC errors + std::deque + reassembles_failed_queue_; //!< @brief queue that holds count of IP packet reassembles failed + uint64_t last_reassembles_failed_; //!< @brief IP packet reassembles failed at the time of the + //!< last monitoring + std::string monitor_program_; //!< @brief nethogs monitor program name bool nethogs_all_; //!< @brief nethogs result all mode int traffic_reader_port_; //!< @brief port number to connect to traffic_reader unsigned int crc_error_check_duration_; //!< @brief CRC error check duration unsigned int crc_error_count_threshold_; //!< @brief CRC error count threshold + unsigned int + reassembles_failed_check_duration_; //!< @brief IP packet reassembles failed check duration + unsigned int + reassembles_failed_check_count_; //!< @brief IP packet reassembles failed check count threshold + unsigned int reassembles_failed_column_index_; //!< @brief column index of IP Reassembles failed + //!< in /proc/net/snmp /** * @brief Network usage status messages diff --git a/system/system_monitor/src/net_monitor/net_monitor.cpp b/system/system_monitor/src/net_monitor/net_monitor.cpp index 983af5668a082..c213e864db93b 100644 --- a/system/system_monitor/src/net_monitor/net_monitor.cpp +++ b/system/system_monitor/src/net_monitor/net_monitor.cpp @@ -37,6 +37,8 @@ #include #include +#include +#include #include #include @@ -46,10 +48,15 @@ NetMonitor::NetMonitor(const rclcpp::NodeOptions & options) last_update_time_{0, 0, this->get_clock()->get_clock_type()}, device_params_( declare_parameter>("devices", std::vector())), + last_reassembles_failed_(0), monitor_program_(declare_parameter("monitor_program", "greengrass")), traffic_reader_port_(declare_parameter("traffic_reader_port", TRAFFIC_READER_PORT)), crc_error_check_duration_(declare_parameter("crc_error_check_duration", 1)), - crc_error_count_threshold_(declare_parameter("crc_error_count_threshold", 1)) + crc_error_count_threshold_(declare_parameter("crc_error_count_threshold", 1)), + reassembles_failed_check_duration_( + declare_parameter("reassembles_failed_check_duration", 1)), + reassembles_failed_check_count_(declare_parameter("reassembles_failed_check_count", 1)), + reassembles_failed_column_index_(0) { using namespace std::literals::chrono_literals; @@ -65,9 +72,12 @@ NetMonitor::NetMonitor(const rclcpp::NodeOptions & options) updater_.add("Network Usage", this, &NetMonitor::checkUsage); updater_.add("Network Traffic", this, &NetMonitor::monitorTraffic); updater_.add("Network CRC Error", this, &NetMonitor::checkCrcError); + updater_.add("IP Packet Reassembles Failed", this, &NetMonitor::checkReassemblesFailed); nl80211_.init(); + searchReassemblesFailedColumnIndex(); + // get Network information for the first time updateNetworkInfoList(); @@ -492,5 +502,136 @@ void NetMonitor::monitorTraffic(diagnostic_updater::DiagnosticStatusWrapper & st SystemMonitorUtility::stopMeasurement(t_start, stat); } +void NetMonitor::checkReassemblesFailed(diagnostic_updater::DiagnosticStatusWrapper & stat) +{ + // Remember start time to measure elapsed time + const auto t_start = SystemMonitorUtility::startMeasurement(); + + int whole_level = DiagStatus::OK; + std::string error_str; + uint64_t total_reassembles_failed = 0; + uint64_t unit_reassembles_failed = 0; + + if (getReassemblesFailed(total_reassembles_failed)) { + reassembles_failed_queue_.push_back(total_reassembles_failed - last_reassembles_failed_); + while (reassembles_failed_queue_.size() > reassembles_failed_check_duration_) { + reassembles_failed_queue_.pop_front(); + } + + for (auto reassembles_failed : reassembles_failed_queue_) { + unit_reassembles_failed += reassembles_failed; + } + + stat.add(fmt::format("total packet reassembles failed"), total_reassembles_failed); + stat.add(fmt::format("packet reassembles failed per unit time"), unit_reassembles_failed); + + if (unit_reassembles_failed >= reassembles_failed_check_count_) { + whole_level = std::max(whole_level, static_cast(DiagStatus::WARN)); + error_str = "reassembles failed"; + } + + last_reassembles_failed_ = total_reassembles_failed; + } else { + reassembles_failed_queue_.push_back(0); + whole_level = std::max(whole_level, static_cast(DiagStatus::ERROR)); + error_str = "failed to read /proc/net/snmp"; + } + + if (!error_str.empty()) { + stat.summary(whole_level, error_str); + } else { + stat.summary(whole_level, "OK"); + } + + // Measure elapsed time since start time and report + SystemMonitorUtility::stopMeasurement(t_start, stat); +} + +void NetMonitor::searchReassemblesFailedColumnIndex() +{ + std::ifstream ifs("/proc/net/snmp"); + if (!ifs) { + RCLCPP_WARN(get_logger(), "Failed to open /proc/net/snmp."); + return; + } + + // /proc/net/snmp + // Ip: Forwarding DefaultTTL InReceives ... ReasmTimeout ReasmReqds ReasmOKs ReasmFails ... + // Ip: 2 64 5636471397 ... 135 2303339 216166 270 ... + std::string line; + + // Find column index of 'ReasmFails' + if (!std::getline(ifs, line)) { + RCLCPP_WARN(get_logger(), "Failed to get /proc/net/snmp first line."); + return; + } + + std::vector title_list; + boost::split(title_list, line, boost::is_space()); + + if (title_list.empty()) { + RCLCPP_WARN(get_logger(), "/proc/net/snmp first line is empty."); + return; + } + if (title_list[0] != "Ip:") { + RCLCPP_WARN( + get_logger(), "/proc/net/snmp line title column is invalid. : %s", title_list[0].c_str()); + return; + } + + int index = 0; + for (auto itr = title_list.begin(); itr != title_list.end(); ++itr, ++index) { + if (*itr == "ReasmFails") { + reassembles_failed_column_index_ = index; + break; + } + } +} + +bool NetMonitor::getReassemblesFailed(uint64_t & reassembles_failed) +{ + if (reassembles_failed_column_index_ == 0) { + RCLCPP_WARN( + get_logger(), "reassembles failed column index is invalid. : %d", + reassembles_failed_column_index_); + return false; + } + + std::ifstream ifs("/proc/net/snmp"); + if (!ifs) { + RCLCPP_WARN(get_logger(), "Failed to open /proc/net/snmp."); + return false; + } + + std::string line; + + // Skip title row + if (!std::getline(ifs, line)) { + RCLCPP_WARN(get_logger(), "Failed to get /proc/net/snmp first line."); + return false; + } + + // Find a value of 'ReasmFails' + if (!std::getline(ifs, line)) { + RCLCPP_WARN(get_logger(), "Failed to get /proc/net/snmp second line."); + return false; + } + + std::vector value_list; + boost::split(value_list, line, boost::is_space()); + + if (reassembles_failed_column_index_ >= value_list.size()) { + RCLCPP_WARN( + get_logger(), + "There are not enough columns for reassembles failed column index. : columns=%d index=%d", + static_cast(value_list.size()), reassembles_failed_column_index_); + return false; + } + + reassembles_failed = std::stoull(value_list[reassembles_failed_column_index_]); + + return true; +} + #include RCLCPP_COMPONENTS_REGISTER_NODE(NetMonitor)