From c5ea04d5d8c3c4b7a45bd3c9b61792a8a8175a54 Mon Sep 17 00:00:00 2001 From: v-nakayama7440-esol Date: Mon, 18 Apr 2022 09:57:44 +0900 Subject: [PATCH 01/10] feat: add HDD monitoring items to hdd_monitor Signed-off-by: v-nakayama7440-esol --- .../system_monitor/hdd_monitor.param.yaml | 5 + .../diagnostic_aggregator/system.param.yaml | 30 +++ system/system_monitor/README.md | 5 + .../config/hdd_monitor.param.yaml | 5 + system/system_monitor/docs/ros_parameters.md | 5 + .../system_monitor/docs/topics_hdd_monitor.md | 97 +++++++ .../include/hdd_reader/hdd_reader.hpp | 7 +- .../hdd_monitor/hdd_monitor.hpp | 174 ++++++++++++- .../reader/hdd_reader/hdd_reader.cpp | 9 + .../src/hdd_monitor/hdd_monitor.cpp | 237 +++++++++++++++++- 10 files changed, 567 insertions(+), 7 deletions(-) diff --git a/launch/tier4_system_launch/config/system_monitor/hdd_monitor.param.yaml b/launch/tier4_system_launch/config/system_monitor/hdd_monitor.param.yaml index 32d3a425b1898..98e11bee3b61b 100644 --- a/launch/tier4_system_launch/config/system_monitor/hdd_monitor.param.yaml +++ b/launch/tier4_system_launch/config/system_monitor/hdd_monitor.param.yaml @@ -7,5 +7,10 @@ name: /dev/sda3 temp_warn: 55.0 temp_error: 70.0 + recovered_error_warn: 1 free_warn: 5120 # MB(8hour) free_error: 100 # MB(last 1 minute) + read_data_rate_warn: 360.0 # MB/s + write_data_rate_warn: 103.5 # MB/s + read_iops_warn: 63360.0 # IOPS + write_iops_warn: 24120.0 # IOPS diff --git a/system/system_error_monitor/config/diagnostic_aggregator/system.param.yaml b/system/system_error_monitor/config/diagnostic_aggregator/system.param.yaml index 2cc4bca27724a..11575764f17c1 100644 --- a/system/system_error_monitor/config/diagnostic_aggregator/system.param.yaml +++ b/system/system_error_monitor/config/diagnostic_aggregator/system.param.yaml @@ -132,6 +132,36 @@ contains: [": HDD Temperature"] timeout: 3.0 + recovered_error: + type: diagnostic_aggregator/GenericAnalyzer + path: recovered_error + contains: [": HDD RecoveredError"] + timeout: 3.0 + + read_data_rate: + type: diagnostic_aggregator/GenericAnalyzer + path: read_data_rate + contains: [": HDD ReadDataRate"] + timeout: 3.0 + + write_data_rate: + type: diagnostic_aggregator/GenericAnalyzer + path: write_data_rate + contains: [": HDD WriteDataRate"] + timeout: 3.0 + + read_iops: + type: diagnostic_aggregator/GenericAnalyzer + path: read_iops + contains: [": HDD ReadIOPS"] + timeout: 3.0 + + write_iops: + type: diagnostic_aggregator/GenericAnalyzer + path: write_iops + contains: [": HDD WriteIOPS"] + timeout: 3.0 + usage: type: diagnostic_aggregator/GenericAnalyzer path: usage diff --git a/system/system_monitor/README.md b/system/system_monitor/README.md index bd4eb3fb863c3..01c4276e46d79 100644 --- a/system/system_monitor/README.md +++ b/system/system_monitor/README.md @@ -61,7 +61,12 @@ Every topic is published in 1 minute interval. | | CPU Thermal Throttling | ✓ | - | ✓ | | | | CPU Frequency | ✓ | ✓ | ✓ | Notification of frequency only, normally error not generated. | | HDD Monitor | HDD Temperature | ✓ | ✓ | ✓ | | +| | HDD RecoveredError | ✓ | ✓ | ✓ | | | | HDD Usage | ✓ | ✓ | ✓ | | +| | HDD ReadDataRate | ✓ | ✓ | ✓ | | +| | HDD WriteDataRate | ✓ | ✓ | ✓ | | +| | HDD ReadIOPS | ✓ | ✓ | ✓ | | +| | HDD WriteIOPS | ✓ | ✓ | ✓ | | | Memory Monitor | Memory Usage | ✓ | ✓ | ✓ | | | Net Monitor | Network Usage | ✓ | ✓ | ✓ | | | NTP Monitor | NTP Offset | ✓ | ✓ | ✓ | | diff --git a/system/system_monitor/config/hdd_monitor.param.yaml b/system/system_monitor/config/hdd_monitor.param.yaml index 70f8dc5ffa13f..ee245df864839 100644 --- a/system/system_monitor/config/hdd_monitor.param.yaml +++ b/system/system_monitor/config/hdd_monitor.param.yaml @@ -10,5 +10,10 @@ power_on_hours_warn: 3000000 total_data_written_warn: 4915200 # =150TB (1unit=32MB) total_data_written_safety_factor: 0.05 + recovered_error_warn: 1 free_warn: 5120 # MB(8hour) free_error: 100 # MB(last 1 minute) + read_data_rate_warn: 360.0 # MB/s + write_data_rate_warn: 103.5 # MB/s + read_iops_warn: 63360.0 # IOPS + write_iops_warn: 24120.0 # IOPS diff --git a/system/system_monitor/docs/ros_parameters.md b/system/system_monitor/docs/ros_parameters.md index 779297492e00a..fe4aed9d421b0 100644 --- a/system/system_monitor/docs/ros_parameters.md +++ b/system/system_monitor/docs/ros_parameters.md @@ -31,6 +31,11 @@ hdd_monitor: | total_data_written_attribute_id | int | n/a | 0xF1 | S.M.A.R.T attribute ID of total data written. | | total_data_written_warn | int | depends on device | 4915200 | Generates warning when HDD total data written reaches a specified value or higher. | | total_data_written_safety_factor | int | %(1e-2) | 0.05 | Safety factor of HDD total data written. | +| recovered_error_warn | int | n/a | 1 | Generates warning when HDD recovered error reaches a specified value or higher. | +| read_data_rate_warn | float | MB/s | 360.0 | Generates warning when HDD read data rate reaches a specified value or higher. | +| write_data_rate_warn | float | MB/s | 103.5 | Generates warning when HDD write data rate reaches a specified value or higher. | +| read_iops_warn | float | IOPS | 63360.0 | Generates warning when HDD read IOPS reaches a specified value or higher. | +| write_iops_warn | float | IOPS | 24120.0 | Generates warning when HDD write IOPS reaches a specified value or higher. | hdd_monitor: diff --git a/system/system_monitor/docs/topics_hdd_monitor.md b/system/system_monitor/docs/topics_hdd_monitor.md index 4ef37b6ae5207..0d3dc0ba13633 100644 --- a/system/system_monitor/docs/topics_hdd_monitor.md +++ b/system/system_monitor/docs/topics_hdd_monitor.md @@ -64,6 +64,27 @@ | HDD [0-9]: serial | FB590709182505050767 | | HDD [0-9]: total data written | 146295330
not available | +## HDD RecoveredError + +/diagnostics/hdd_monitor: HDD RecoveredError + +[summary] + +| level | message | +| ----- | -------------------- | +| OK | OK | +| WARN | high soft error rate | + +[values] + +| key | value (example) | +| -------------------------- | ------------------------- | +| HDD [0-9]: status | OK / high soft error rate | +| HDD [0-9]: name | /dev/nvme0 | +| HDD [0-9]: model | PHISON PS5012-E12S-512G | +| HDD [0-9]: serial | FB590709182505050767 | +| HDD [0-9]: recovered error | 0
not available | + ## HDD Usage /diagnostics/hdd_monitor: HDD Usage @@ -87,3 +108,79 @@ | HDD [0-9]: avail | 749G | | HDD [0-9]: use | 69% | | HDD [0-9]: mounted on | / | + +## HDD ReadDataRate + +/diagnostics/hdd_monitor: HDD ReadDataRate + +[summary] + +| level | message | +| ----- | ---------------------- | +| OK | OK | +| WARN | high data rate of read | + +[values] + +| key | value (example) | +| ---------------------------- | --------------------------- | +| HDD [0-9]: status | OK / high data rate of read | +| HDD [0-9]: name | /dev/nvme0 | +| HDD [0-9]: data rate of read | 0.00 MB/s | + +## HDD WriteDataRate + +/diagnostics/hdd_monitor: HDD WriteDataRate + +[summary] + +| level | message | +| ----- | ----------------------- | +| OK | OK | +| WARN | high data rate of write | + +[values] + +| key | value (example) | +| ----------------------------- | ---------------------------- | +| HDD [0-9]: status | OK / high data rate of write | +| HDD [0-9]: name | /dev/nvme0 | +| HDD [0-9]: data rate of write | 0.00 MB/s | + +## HDD ReadIOPS + +/diagnostics/hdd_monitor: HDD ReadIOPS + +[summary] + +| level | message | +| ----- | ----------------- | +| OK | OK | +| WARN | high IOPS of read | + +[values] + +| key | value (example) | +| ----------------------- | ---------------------- | +| HDD [0-9]: status | OK / high IOPS of read | +| HDD [0-9]: name | /dev/nvme0 | +| HDD [0-9]: IOPS of read | 0.00 IOPS | + +## HDD WriteIOPS + +/diagnostics/hdd_monitor: HDD WriteIOPS + +[summary] + +| level | message | +| ----- | ------------------ | +| OK | OK | +| WARN | high IOPS of write | + +[values] + +| key | value (example) | +| ------------------------ | ----------------------- | +| HDD [0-9]: status | OK / high IOPS of write | +| HDD [0-9]: name | /dev/nvme0 | +| HDD [0-9]: IOPS of write | 0.00 IOPS | diff --git a/system/system_monitor/include/hdd_reader/hdd_reader.hpp b/system/system_monitor/include/hdd_reader/hdd_reader.hpp index 44c52a491b9fe..1a830311978f7 100644 --- a/system/system_monitor/include/hdd_reader/hdd_reader.hpp +++ b/system/system_monitor/include/hdd_reader/hdd_reader.hpp @@ -40,7 +40,8 @@ struct HDDDevice { std::string name_; //!< @brief Device name uint8_t - total_data_written_attribute_id_; //!< @brief S.M.A.R.T attribute ID of total data written + total_data_written_attribute_id_; //!< @brief S.M.A.R.T attribute ID of total data written + uint8_t recovered_error_attribute_id_; //!< @brief S.M.A.R.T attribute ID of recovered error /** * @brief Load or save data members. @@ -54,6 +55,7 @@ struct HDDDevice { ar & name_; ar & total_data_written_attribute_id_; + ar & recovered_error_attribute_id_; } }; @@ -70,7 +72,9 @@ struct HDDInfo // in S.M.A.R.T. information. uint64_t power_on_hours_; //!< @brief power on hours count uint64_t total_data_written_; //!< @brief total data written + uint32_t recovered_error_; //!< @brief recovered error count bool is_valid_total_data_written_; //!< @brief whether total_data_written_ is valid value + bool is_valid_recovered_error_; //!< @brief whether recovered_error_ is valid value /** * @brief Load or save data members. @@ -89,6 +93,7 @@ struct HDDInfo ar & power_on_hours_; ar & total_data_written_; ar & is_valid_total_data_written_; + ar & is_valid_recovered_error_; } }; diff --git a/system/system_monitor/include/system_monitor/hdd_monitor/hdd_monitor.hpp b/system/system_monitor/include/system_monitor/hdd_monitor/hdd_monitor.hpp index c739f466e95e0..4c55ac704dff6 100644 --- a/system/system_monitor/include/system_monitor/hdd_monitor/hdd_monitor.hpp +++ b/system/system_monitor/include/system_monitor/hdd_monitor/hdd_monitor.hpp @@ -39,8 +39,13 @@ struct HDDParam int power_on_hours_warn_; //!< @brief HDD power on hours to generate warning uint64_t total_data_written_warn_; //!< @brief HDD total data written to generate warning float total_data_written_safety_factor_; //!< @brief safety factor of HDD total data written - int free_warn_; //!< @brief HDD free space(MB) to generate warning - int free_error_; //!< @brief HDD free space(MB) to generate error + int recovered_error_warn_; //!< @brief HDD recovered error count to generate warning + int free_warn_; //!< @brief HDD free space(MB) to generate warning + int free_error_; //!< @brief HDD free space(MB) to generate error + float read_data_rate_warn_; //!< @brief HDD data rate(MB/s) of read to generate warning + float write_data_rate_warn_; //!< @brief HDD data rate(MB/s) of write to generate warning + float read_iops_warn_; //!< @brief HDD IOPS of read to generate warning + float write_iops_warn_; //!< @brief HDD IOPS of write to generate warning HDDParam() : temp_warn_(55.0), @@ -48,8 +53,44 @@ struct HDDParam power_on_hours_warn_(3000000), total_data_written_warn_(4915200), total_data_written_safety_factor_(0.05), + recovered_error_warn_(1), free_warn_(5120), - free_error_(100) + free_error_(100), + read_data_rate_warn_(360.0), + write_data_rate_warn_(103.5), + read_iops_warn_(63360.0), + write_iops_warn_(24120.0) + { + } +}; + +/** + * @brief statistics of sysfs device + */ +struct SysfsDevStat +{ + unsigned long rd_ios_; //!< @brief number of read operations issued to the device + unsigned long rd_sectors_; //!< @brief number of sectors read + unsigned long wr_ios_; //!< @brief number of write operations issued to the device + unsigned long wr_sectors_; //!< @brief number of sectors written + + SysfsDevStat() : rd_ios_(0), rd_sectors_(0), wr_ios_(0), wr_sectors_(0) {} +}; + +/** + * @brief statistics of HDD + */ +struct HDDStat +{ + std::string device_; //!< @brief device + std::string error_str_; //!< @brief error string + float read_data_rate_MBs_; //!< @brief data rate of read (MB/s) + float write_data_rate_MBs_; //!< @brief data rate of write (MB/s) + float read_iops_; //!< @brief IOPS of read + float write_iops_; //!< @brief IOPS of write + SysfsDevStat last_sfdevstat_; //!< @brief last statistics of sysfs device + + HDDStat() : read_data_rate_MBs_(0.0), write_data_rate_MBs_(0.0), read_iops_(0.0), write_iops_(0.0) { } }; @@ -61,6 +102,18 @@ enum class HDDSMARTInfoItem : uint32_t { TEMPERATURE = 0, POWER_ON_HOURS = 1, TOTAL_DATA_WRITTEN = 2, + RECOVERED_ERROR = 3, + SIZE +}; + +/** + * @brief HDD statistics items to check + */ +enum class HDDStatItem : uint32_t { + READ_DATA_RATE = 0, + WRITE_DATA_RATE = 1, + READ_IOPS = 2, + WRITE_IOPS = 3, SIZE }; @@ -103,9 +156,19 @@ class HDDMonitor : public rclcpp::Node void checkSMARTTotalDataWritten( diagnostic_updater::DiagnosticStatusWrapper & stat); // NOLINT(runtime/references) + /** + * @brief check HDD recovered error count + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference + * to pass diagnostic message updated in this function to diagnostic publish calls. + */ + void checkSMARTRecoveredError( + diagnostic_updater::DiagnosticStatusWrapper & stat); // NOLINT(runtime/references) + /** * @brief check S.M.A.R.T. information * @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @param [in] item S.M.A.R.T information item to be checked * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference * to pass diagnostic message updated in this function to diagnostic publish calls. */ @@ -122,6 +185,53 @@ class HDDMonitor : public rclcpp::Node void checkUsage( diagnostic_updater::DiagnosticStatusWrapper & stat); // NOLINT(runtime/references) + /** + * @brief check HDD data rate of read + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference + * to pass diagnostic message updated in this function to diagnostic publish calls. + */ + void checkReadDataRate( + diagnostic_updater::DiagnosticStatusWrapper & stat); // NOLINT(runtime/references) + + /** + * @brief check HDD data rate of write + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference + * to pass diagnostic message updated in this function to diagnostic publish calls. + */ + void checkWriteDataRate( + diagnostic_updater::DiagnosticStatusWrapper & stat); // NOLINT(runtime/references) + + /** + * @brief check HDD IOPS of read + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference + * to pass diagnostic message updated in this function to diagnostic publish calls. + */ + void checkReadIOPS( + diagnostic_updater::DiagnosticStatusWrapper & stat); // NOLINT(runtime/references) + + /** + * @brief check HDD IOPS of write + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference + * to pass diagnostic message updated in this function to diagnostic publish calls. + */ + void checkWriteIOPS( + diagnostic_updater::DiagnosticStatusWrapper & stat); // NOLINT(runtime/references) + + /** + * @brief check HDD statistics + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @param [in] item statistic item to be checked + * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference + * to pass diagnostic message updated in this function to diagnostic publish calls. + */ + void checkStatistics( + diagnostic_updater::DiagnosticStatusWrapper & stat, + HDDStatItem item); // NOLINT(runtime/references) + /** * @brief human readable size string to MB * @param [in] human readable size string @@ -151,6 +261,34 @@ class HDDMonitor : public rclcpp::Node */ void updateHDDInfoList(); + /** + * @brief start HDD transfer measurement + */ + void startHDDTransferMeasurement(); + + /** + * @brief update HDD statistics + */ + void updateHDDStatistics(); + + /** + * @brief get increment value of sysfs device stats per second + * @param [in] cur_val current value + * @param [in] last_val last value + * @param [in] duration_sec duration in seconds + * @return increment value + */ + double getIncreaseSysfsDeviceStatValuePerSec( + unsigned long cur_val, unsigned long last_val, double duration_sec); + + /** + * @brief read stats for current whole device using /sys/block/ directory + * @param [in] device device name + * @param [out] sfdevstat statistics of sysfs device + * @return result of success or failure + */ + int readSysfsDeviceStat(const std::string & device, SysfsDevStat & sfdevstat); + diagnostic_updater::Updater updater_; //!< @brief Updater class which advertises to /diagnostics rclcpp::TimerBase::SharedPtr timer_; //!< @brief timer to get HDD information from HDDReader @@ -159,9 +297,11 @@ class HDDMonitor : public rclcpp::Node int hdd_reader_port_; //!< @brief port number to connect to hdd_reader std::map hdd_params_; //!< @brief list of error and warning levels std::vector hdd_devices_; //!< @brief list of devices + std::map hdd_stats_; //!< @brief diagnostic of connection diagnostic_updater::DiagnosticStatusWrapper connect_diag_; - HDDInfoList hdd_info_list_; //!< @brief list of HDD information + HDDInfoList hdd_info_list_; //!< @brief list of HDD information + rclcpp::Time last_hdd_stat_update_time_; //!< @brief last HDD statistics update time /** * @brief HDD SMART status messages @@ -173,6 +313,10 @@ class HDDMonitor : public rclcpp::Node {{DiagStatus::OK, "OK"}, {DiagStatus::WARN, "lifetime limit"}, {DiagStatus::ERROR, "unused"}}, // total data written {{DiagStatus::OK, "OK"}, {DiagStatus::WARN, "warranty period"}, {DiagStatus::ERROR, "unused"}}, + // recovered error count + {{DiagStatus::OK, "OK"}, + {DiagStatus::WARN, "high soft error rate"}, + {DiagStatus::ERROR, "unused"}}, }; /** @@ -182,6 +326,28 @@ class HDDMonitor : public rclcpp::Node {DiagStatus::OK, "OK"}, {DiagStatus::WARN, "low disk space"}, {DiagStatus::ERROR, "very low disk space"}}; + + /** + * @brief HDD statistics status messages + */ + const std::map stat_dicts_[static_cast(HDDStatItem::SIZE)] = { + // data rate of read + {{DiagStatus::OK, "OK"}, + {DiagStatus::WARN, "high data rate of read"}, + {DiagStatus::ERROR, "unused"}}, + // data rate of write + {{DiagStatus::OK, "OK"}, + {DiagStatus::WARN, "high data rate of write"}, + {DiagStatus::ERROR, "unused"}}, + // IOPS of read + {{DiagStatus::OK, "OK"}, + {DiagStatus::WARN, "high IOPS of read"}, + {DiagStatus::ERROR, "unused"}}, + // IOPS of write + {{DiagStatus::OK, "OK"}, + {DiagStatus::WARN, "high IOPS of write"}, + {DiagStatus::ERROR, "unused"}}, + }; }; #endif // SYSTEM_MONITOR__HDD_MONITOR__HDD_MONITOR_HPP_ diff --git a/system/system_monitor/reader/hdd_reader/hdd_reader.cpp b/system/system_monitor/reader/hdd_reader/hdd_reader.cpp index 08476f4e0c638..fa54bcd9c6550 100644 --- a/system/system_monitor/reader/hdd_reader/hdd_reader.cpp +++ b/system/system_monitor/reader/hdd_reader/hdd_reader.cpp @@ -285,6 +285,7 @@ int get_ata_SMARTData(int fd, HDDInfo * info, const HDDDevice & device) std::bitset(ATAAttributeIDs::SIZE)> found_flag; info->is_valid_total_data_written_ = false; + info->is_valid_recovered_error_ = false; // Retrieve S.M.A.R.T. Informations for (int i = 0; i < 30; ++i) { if (data.attribute_entry_[i].attribute_id_ == 0xC2) { // Temperature - Device Internal @@ -300,6 +301,11 @@ int get_ata_SMARTData(int fd, HDDInfo * info, const HDDDevice & device) (data.attribute_entry_[i].data_ | (static_cast(data.attribute_entry_[i].attribute_specific_) << 32)); info->is_valid_total_data_written_ = true; + } else if ( + data.attribute_entry_[i].attribute_id_ == + device.recovered_error_attribute_id_) { // Hardware ECC Recovered + info->recovered_error_ = data.attribute_entry_[i].data_; + info->is_valid_recovered_error_ = true; } } @@ -399,6 +405,9 @@ int get_nvme_SMARTData(int fd, HDDInfo * info) // Bytes 143:128 Power On Hours info->power_on_hours_ = *(reinterpret_cast(&data[128])); + // NVMe S.M.A.R.T has no information of recovered error count + info->is_valid_recovered_error_ = false; + return EXIT_SUCCESS; } diff --git a/system/system_monitor/src/hdd_monitor/hdd_monitor.cpp b/system/system_monitor/src/hdd_monitor/hdd_monitor.cpp index d1b15f1231987..d5560e8036db3 100644 --- a/system/system_monitor/src/hdd_monitor/hdd_monitor.cpp +++ b/system/system_monitor/src/hdd_monitor/hdd_monitor.cpp @@ -30,6 +30,7 @@ #include #include +#include #include #include @@ -40,7 +41,8 @@ namespace bp = boost::process; HDDMonitor::HDDMonitor(const rclcpp::NodeOptions & options) : Node("hdd_monitor", options), updater_(this), - hdd_reader_port_(declare_parameter("hdd_reader_port", 7635)) + hdd_reader_port_(declare_parameter("hdd_reader_port", 7635)), + last_hdd_stat_update_time_{0, 0, this->get_clock()->get_clock_type()} { using namespace std::literals::chrono_literals; @@ -52,11 +54,19 @@ HDDMonitor::HDDMonitor(const rclcpp::NodeOptions & options) updater_.add("HDD Temperature", this, &HDDMonitor::checkSMARTTemperature); updater_.add("HDD PowerOnHours", this, &HDDMonitor::checkSMARTPowerOnHours); updater_.add("HDD TotalDataWritten", this, &HDDMonitor::checkSMARTTotalDataWritten); + updater_.add("HDD RecoveredError", this, &HDDMonitor::checkSMARTRecoveredError); updater_.add("HDD Usage", this, &HDDMonitor::checkUsage); + updater_.add("HDD ReadDataRate", this, &HDDMonitor::checkReadDataRate); + updater_.add("HDD WriteDataRate", this, &HDDMonitor::checkWriteDataRate); + updater_.add("HDD ReadIOPS", this, &HDDMonitor::checkReadIOPS); + updater_.add("HDD WriteIOPS", this, &HDDMonitor::checkWriteIOPS); // get HDD information from HDD reader for the first time updateHDDInfoList(); + // start HDD transfer measurement + startHDDTransferMeasurement(); + timer_ = rclcpp::create_timer(this, get_clock(), 1s, std::bind(&HDDMonitor::onTimer, this)); } @@ -75,6 +85,11 @@ void HDDMonitor::checkSMARTTotalDataWritten(diagnostic_updater::DiagnosticStatus checkSMART(stat, HDDSMARTInfoItem::TOTAL_DATA_WRITTEN); } +void HDDMonitor::checkSMARTRecoveredError(diagnostic_updater::DiagnosticStatusWrapper & stat) +{ + checkSMART(stat, HDDSMARTInfoItem::RECOVERED_ERROR); +} + void HDDMonitor::checkSMART( diagnostic_updater::DiagnosticStatusWrapper & stat, HDDSMARTInfoItem item) { @@ -158,6 +173,20 @@ void HDDMonitor::checkSMART( val_str = "not available"; } } break; + case HDDSMARTInfoItem::RECOVERED_ERROR: { + int32_t recovered_error = static_cast(hdd_itr->second.recovered_error_); + + level = DiagStatus::OK; + if (recovered_error >= itr->second.recovered_error_warn_) { + level = DiagStatus::WARN; + } + key_str = fmt::format("HDD {}: recovered error", index); + if (hdd_itr->second.is_valid_recovered_error_) { + val_str = fmt::format("{}", hdd_itr->second.recovered_error_); + } else { + val_str = "not available"; + } + } break; default: break; } @@ -277,6 +306,111 @@ void HDDMonitor::checkUsage(diagnostic_updater::DiagnosticStatusWrapper & stat) SystemMonitorUtility::stopMeasurement(t_start, stat); } +void HDDMonitor::checkReadDataRate(diagnostic_updater::DiagnosticStatusWrapper & stat) +{ + checkStatistics(stat, HDDStatItem::READ_DATA_RATE); +} + +void HDDMonitor::checkWriteDataRate(diagnostic_updater::DiagnosticStatusWrapper & stat) +{ + checkStatistics(stat, HDDStatItem::WRITE_DATA_RATE); +} + +void HDDMonitor::checkReadIOPS(diagnostic_updater::DiagnosticStatusWrapper & stat) +{ + checkStatistics(stat, HDDStatItem::READ_IOPS); +} + +void HDDMonitor::checkWriteIOPS(diagnostic_updater::DiagnosticStatusWrapper & stat) +{ + checkStatistics(stat, HDDStatItem::WRITE_IOPS); +} + +void HDDMonitor::checkStatistics( + diagnostic_updater::DiagnosticStatusWrapper & stat, HDDStatItem item) +{ + // Remember start time to measure elapsed time + const auto t_start = SystemMonitorUtility::startMeasurement(); + + if (hdd_params_.empty()) { + stat.summary(DiagStatus::ERROR, "invalid disk parameter"); + return; + } + + int hdd_index = 0; + int whole_level = DiagStatus::OK; + std::string error_str = ""; + std::string key_str = ""; + std::string val_str = ""; + + for (auto itr = hdd_params_.begin(); itr != hdd_params_.end(); ++itr, ++hdd_index) { + int level = DiagStatus::OK; + + switch (item) { + case HDDStatItem::READ_DATA_RATE: { + float read_data_rate = hdd_stats_[itr->first].read_data_rate_MBs_; + + if (read_data_rate >= itr->second.read_data_rate_warn_) { + level = DiagStatus::WARN; + } + key_str = fmt::format("HDD {}: data rate of read", hdd_index); + val_str = fmt::format("{:.2f} MB/s", read_data_rate); + } break; + case HDDStatItem::WRITE_DATA_RATE: { + float write_data_rate = hdd_stats_[itr->first].write_data_rate_MBs_; + + if (write_data_rate >= itr->second.write_data_rate_warn_) { + level = DiagStatus::WARN; + } + key_str = fmt::format("HDD {}: data rate of write", hdd_index); + val_str = fmt::format("{:.2f} MB/s", write_data_rate); + } break; + case HDDStatItem::READ_IOPS: { + float read_iops = hdd_stats_[itr->first].read_iops_; + + if (read_iops >= itr->second.read_iops_warn_) { + level = DiagStatus::WARN; + } + key_str = fmt::format("HDD {}: IOPS of read", hdd_index); + val_str = fmt::format("{:.2f} IOPS", read_iops); + } break; + case HDDStatItem::WRITE_IOPS: { + float write_iops = hdd_stats_[itr->first].write_iops_; + + if (write_iops >= itr->second.write_iops_warn_) { + level = DiagStatus::WARN; + } + key_str = fmt::format("HDD {}: IOPS of write", hdd_index); + val_str = fmt::format("{:.2f} IOPS", write_iops); + } break; + default: + break; + } + + stat.add(fmt::format("HDD {}: name", hdd_index), itr->second.device_.c_str()); + if (!hdd_stats_[itr->first].error_str_.empty()) { + error_str = hdd_stats_[itr->first].error_str_; + stat.add(fmt::format("HDD {}: status", hdd_index), error_str); + } else { + stat.add( + fmt::format("HDD {}: status", hdd_index), + stat_dicts_[static_cast(item)].at(level)); + stat.add(key_str, val_str.c_str()); + } + + whole_level = std::max(whole_level, level); + } + + if (!error_str.empty()) { + stat.summary(DiagStatus::ERROR, error_str); + } else { + stat.summary(whole_level, stat_dicts_[static_cast(item)].at(whole_level)); + } + + // Measure elapsed time since start time and report + SystemMonitorUtility::stopMeasurement(t_start, stat); +} + void HDDMonitor::getHDDParams() { const auto num_disks = this->declare_parameter("num_disks", 0); @@ -300,8 +434,13 @@ void HDDMonitor::getHDDParams() declare_parameter(prefix + ".total_data_written_warn", 4915200); param.total_data_written_warn_ = static_cast( total_data_written_warn_org * (1.0f - param.total_data_written_safety_factor_)); + param.recovered_error_warn_ = declare_parameter(prefix + ".recovered_error_warn", 1); param.free_warn_ = declare_parameter(prefix + ".free_warn", 5120); param.free_error_ = declare_parameter(prefix + ".free_error", 100); + param.read_data_rate_warn_ = declare_parameter(prefix + ".read_data_rate_warn", 360.0); + param.write_data_rate_warn_ = declare_parameter(prefix + ".write_data_rate_warn", 103.5); + param.read_iops_warn_ = declare_parameter(prefix + ".read_iops_warn", 63360.0); + param.write_iops_warn_ = declare_parameter(prefix + ".write_iops_warn", 24120.0); // Remove index number of partition for passing device name to hdd-reader if (boost::starts_with(device_name, "/dev/sd")) { @@ -317,7 +456,14 @@ void HDDMonitor::getHDDParams() device.name_ = param.device_; device.total_data_written_attribute_id_ = static_cast( declare_parameter(prefix + ".total_data_written_attribute_id", 0xF1)); + device.recovered_error_attribute_id_ = + static_cast(declare_parameter(prefix + ".recovered_error_attribute_id", 0xC3)); hdd_devices_.push_back(device); + + HDDStat stat; + const std::regex raw_pattern(".*/"); + stat.device_ = std::regex_replace(param.device_, raw_pattern, ""); + hdd_stats_[device_name] = stat; } } @@ -345,7 +491,11 @@ std::string HDDMonitor::getDeviceFromMountPoint(const std::string & mount_point) return ret; } -void HDDMonitor::onTimer() { updateHDDInfoList(); } +void HDDMonitor::onTimer() +{ + updateHDDInfoList(); + updateHDDStatistics(); +} void HDDMonitor::updateHDDInfoList() { @@ -438,5 +588,88 @@ void HDDMonitor::updateHDDInfoList() } } +void HDDMonitor::startHDDTransferMeasurement() +{ + for (auto & hdd_stat : hdd_stats_) { + SysfsDevStat sfdevstat; + if (readSysfsDeviceStat(hdd_stat.second.device_, sfdevstat)) { + continue; + } + hdd_stat.second.last_sfdevstat_ = sfdevstat; + } + + last_hdd_stat_update_time_ = this->now(); +} + +void HDDMonitor::updateHDDStatistics() +{ + double duration_sec = (this->now() - last_hdd_stat_update_time_).seconds(); + + for (auto & hdd_stat : hdd_stats_) { + SysfsDevStat sfdevstat; + if (readSysfsDeviceStat(hdd_stat.second.device_, sfdevstat)) { + continue; + } + + SysfsDevStat & last_sfdevstat = hdd_stat.second.last_sfdevstat_; + + hdd_stat.second.read_data_rate_MBs_ = getIncreaseSysfsDeviceStatValuePerSec( + sfdevstat.rd_sectors_, last_sfdevstat.rd_sectors_, duration_sec); + hdd_stat.second.read_data_rate_MBs_ /= 2048; + hdd_stat.second.write_data_rate_MBs_ = getIncreaseSysfsDeviceStatValuePerSec( + sfdevstat.wr_sectors_, last_sfdevstat.wr_sectors_, duration_sec); + hdd_stat.second.write_data_rate_MBs_ /= 2048; + hdd_stat.second.read_iops_ = getIncreaseSysfsDeviceStatValuePerSec( + sfdevstat.rd_ios_, last_sfdevstat.rd_ios_, duration_sec); + hdd_stat.second.write_iops_ = getIncreaseSysfsDeviceStatValuePerSec( + sfdevstat.wr_ios_, last_sfdevstat.wr_ios_, duration_sec); + + hdd_stat.second.last_sfdevstat_ = sfdevstat; + } + + last_hdd_stat_update_time_ = this->now(); +} + +double HDDMonitor::getIncreaseSysfsDeviceStatValuePerSec( + unsigned long cur_val, unsigned long last_val, double duration_sec) +{ + if (cur_val > last_val && duration_sec > 0.0) { + return static_cast(cur_val - last_val) / duration_sec; + } + return 0.0; +} + +int HDDMonitor::readSysfsDeviceStat(const std::string & device, SysfsDevStat & sfdevstat) +{ + int ret = -1; + unsigned int ios_pgr, tot_ticks, rq_ticks, wr_ticks; + unsigned long rd_ios, rd_merges_or_rd_sec, wr_ios, wr_merges; + unsigned long rd_sec_or_wr_ios, wr_sec, rd_ticks_or_wr_sec; + unsigned long dc_ios, dc_merges, dc_sec, dc_ticks; + + std::string filename("/sys/block/"); + filename += device + "/stat"; + FILE * fp = fopen(filename.c_str(), "r"); + if (fp == NULL) { + return ret; + } + + int i = fscanf( + fp, "%lu %lu %lu %lu %lu %lu %lu %u %u %u %u %lu %lu %lu %lu", &rd_ios, &rd_merges_or_rd_sec, + &rd_sec_or_wr_ios, &rd_ticks_or_wr_sec, &wr_ios, &wr_merges, &wr_sec, &wr_ticks, &ios_pgr, + &tot_ticks, &rq_ticks, &dc_ios, &dc_merges, &dc_sec, &dc_ticks); + + if (i >= 7) { + sfdevstat.rd_ios_ = rd_ios; + sfdevstat.rd_sectors_ = rd_sec_or_wr_ios; + sfdevstat.wr_ios_ = wr_ios; + sfdevstat.wr_sectors_ = wr_sec; + ret = 0; + } + + fclose(fp); + return ret; +} + #include RCLCPP_COMPONENTS_REGISTER_NODE(HDDMonitor) From fb6ca488bc89749adbfe494235f4e5446db93483 Mon Sep 17 00:00:00 2001 From: v-nakayama7440-esol Date: Mon, 18 Apr 2022 14:08:35 +0900 Subject: [PATCH 02/10] fix pre-commit C long type error Signed-off-by: v-nakayama7440-esol --- .../include/system_monitor/hdd_monitor/hdd_monitor.hpp | 10 +++++----- system/system_monitor/src/hdd_monitor/hdd_monitor.cpp | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/system/system_monitor/include/system_monitor/hdd_monitor/hdd_monitor.hpp b/system/system_monitor/include/system_monitor/hdd_monitor/hdd_monitor.hpp index 4c55ac704dff6..9610a7b7019be 100644 --- a/system/system_monitor/include/system_monitor/hdd_monitor/hdd_monitor.hpp +++ b/system/system_monitor/include/system_monitor/hdd_monitor/hdd_monitor.hpp @@ -69,10 +69,10 @@ struct HDDParam */ struct SysfsDevStat { - unsigned long rd_ios_; //!< @brief number of read operations issued to the device - unsigned long rd_sectors_; //!< @brief number of sectors read - unsigned long wr_ios_; //!< @brief number of write operations issued to the device - unsigned long wr_sectors_; //!< @brief number of sectors written + uint64_t rd_ios_; //!< @brief number of read operations issued to the device + uint64_t rd_sectors_; //!< @brief number of sectors read + uint64_t wr_ios_; //!< @brief number of write operations issued to the device + uint64_t wr_sectors_; //!< @brief number of sectors written SysfsDevStat() : rd_ios_(0), rd_sectors_(0), wr_ios_(0), wr_sectors_(0) {} }; @@ -279,7 +279,7 @@ class HDDMonitor : public rclcpp::Node * @return increment value */ double getIncreaseSysfsDeviceStatValuePerSec( - unsigned long cur_val, unsigned long last_val, double duration_sec); + uint64_t cur_val, uint64_t last_val, double duration_sec); /** * @brief read stats for current whole device using /sys/block/ directory diff --git a/system/system_monitor/src/hdd_monitor/hdd_monitor.cpp b/system/system_monitor/src/hdd_monitor/hdd_monitor.cpp index d5560e8036db3..54fd973cfebd9 100644 --- a/system/system_monitor/src/hdd_monitor/hdd_monitor.cpp +++ b/system/system_monitor/src/hdd_monitor/hdd_monitor.cpp @@ -631,7 +631,7 @@ void HDDMonitor::updateHDDStatistics() } double HDDMonitor::getIncreaseSysfsDeviceStatValuePerSec( - unsigned long cur_val, unsigned long last_val, double duration_sec) + uint64_t cur_val, uint64_t last_val, double duration_sec) { if (cur_val > last_val && duration_sec > 0.0) { return static_cast(cur_val - last_val) / duration_sec; @@ -643,9 +643,9 @@ int HDDMonitor::readSysfsDeviceStat(const std::string & device, SysfsDevStat & s { int ret = -1; unsigned int ios_pgr, tot_ticks, rq_ticks, wr_ticks; - unsigned long rd_ios, rd_merges_or_rd_sec, wr_ios, wr_merges; - unsigned long rd_sec_or_wr_ios, wr_sec, rd_ticks_or_wr_sec; - unsigned long dc_ios, dc_merges, dc_sec, dc_ticks; + uint64_t rd_ios, rd_merges_or_rd_sec, wr_ios, wr_merges; + uint64_t rd_sec_or_wr_ios, wr_sec, rd_ticks_or_wr_sec; + uint64_t dc_ios, dc_merges, dc_sec, dc_ticks; std::string filename("/sys/block/"); filename += device + "/stat"; From a312a40a0eae84aa6adc783a2730c7505ba78457 Mon Sep 17 00:00:00 2001 From: v-nakayama7440-esol Date: Mon, 25 Apr 2022 13:38:15 +0900 Subject: [PATCH 03/10] fixed the monitoring method of RecoveredError Signed-off-by: v-nakayama7440-esol --- .../include/system_monitor/hdd_monitor/hdd_monitor.hpp | 4 +++- system/system_monitor/src/hdd_monitor/hdd_monitor.cpp | 4 ++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/system/system_monitor/include/system_monitor/hdd_monitor/hdd_monitor.hpp b/system/system_monitor/include/system_monitor/hdd_monitor/hdd_monitor.hpp index 9610a7b7019be..6deb7f02c5728 100644 --- a/system/system_monitor/include/system_monitor/hdd_monitor/hdd_monitor.hpp +++ b/system/system_monitor/include/system_monitor/hdd_monitor/hdd_monitor.hpp @@ -297,7 +297,9 @@ class HDDMonitor : public rclcpp::Node int hdd_reader_port_; //!< @brief port number to connect to hdd_reader std::map hdd_params_; //!< @brief list of error and warning levels std::vector hdd_devices_; //!< @brief list of devices - std::map hdd_stats_; + std::map + initial_recovered_errors_; //!< @brief list of initial recovered error count + std::map hdd_stats_; //!< @brief list of HDD statistics //!< @brief diagnostic of connection diagnostic_updater::DiagnosticStatusWrapper connect_diag_; HDDInfoList hdd_info_list_; //!< @brief list of HDD information diff --git a/system/system_monitor/src/hdd_monitor/hdd_monitor.cpp b/system/system_monitor/src/hdd_monitor/hdd_monitor.cpp index 54fd973cfebd9..dd8415bd30e65 100644 --- a/system/system_monitor/src/hdd_monitor/hdd_monitor.cpp +++ b/system/system_monitor/src/hdd_monitor/hdd_monitor.cpp @@ -175,6 +175,10 @@ void HDDMonitor::checkSMART( } break; case HDDSMARTInfoItem::RECOVERED_ERROR: { int32_t recovered_error = static_cast(hdd_itr->second.recovered_error_); + if (initial_recovered_errors_.find(itr->first) == initial_recovered_errors_.end()) { + initial_recovered_errors_[itr->first] = recovered_error; + } + recovered_error -= initial_recovered_errors_[itr->first]; level = DiagStatus::OK; if (recovered_error >= itr->second.recovered_error_warn_) { From 969466eeedf6c1ec3a3b15e00598b740e97b9d55 Mon Sep 17 00:00:00 2001 From: v-nakayama7440-esol Date: Thu, 28 Apr 2022 10:04:09 +0900 Subject: [PATCH 04/10] additional support for storage health check Signed-off-by: v-nakayama7440-esol --- .../system_monitor/hdd_monitor.param.yaml | 7 ++ .../diagnostic_aggregator/system.param.yaml | 6 ++ system/system_monitor/README.md | 1 + .../config/hdd_monitor.param.yaml | 4 + system/system_monitor/docs/ros_parameters.md | 3 + .../system_monitor/docs/topics_hdd_monitor.md | 46 +++++++--- .../include/hdd_reader/hdd_reader.hpp | 15 ++-- .../hdd_monitor/hdd_monitor.hpp | 22 +++++ .../reader/hdd_reader/hdd_reader.cpp | 22 ++--- .../src/hdd_monitor/hdd_monitor.cpp | 88 +++++++++++++++++-- 10 files changed, 175 insertions(+), 39 deletions(-) diff --git a/launch/tier4_system_launch/config/system_monitor/hdd_monitor.param.yaml b/launch/tier4_system_launch/config/system_monitor/hdd_monitor.param.yaml index 98e11bee3b61b..04dd3afd09280 100644 --- a/launch/tier4_system_launch/config/system_monitor/hdd_monitor.param.yaml +++ b/launch/tier4_system_launch/config/system_monitor/hdd_monitor.param.yaml @@ -5,8 +5,15 @@ disks: # Until multi type lists are allowed, name N the disks as disk0...disk{N-1} disk0: name: /dev/sda3 + temp_attribute_id: 0xC2 temp_warn: 55.0 temp_error: 70.0 + power_on_hours_attribute_id: 0x09 + power_on_hours_warn: 3000000 + total_data_written_attribute_id: 0xF1 + total_data_written_warn: 4915200 # =150TB (1unit=32MB) + total_data_written_safety_factor: 0.05 + recovered_error_attribute_id: 0xC3 recovered_error_warn: 1 free_warn: 5120 # MB(8hour) free_error: 100 # MB(last 1 minute) diff --git a/system/system_error_monitor/config/diagnostic_aggregator/system.param.yaml b/system/system_error_monitor/config/diagnostic_aggregator/system.param.yaml index 11575764f17c1..56047e79c5fe9 100644 --- a/system/system_error_monitor/config/diagnostic_aggregator/system.param.yaml +++ b/system/system_error_monitor/config/diagnostic_aggregator/system.param.yaml @@ -168,6 +168,12 @@ contains: [": HDD Usage"] timeout: 3.0 + connection: + type: diagnostic_aggregator/GenericAnalyzer + path: connection + contains: [": HDD Connection"] + timeout: 3.0 + process: type: diagnostic_aggregator/AnalyzerGroup path: process diff --git a/system/system_monitor/README.md b/system/system_monitor/README.md index 01c4276e46d79..a6b43108bd9fc 100644 --- a/system/system_monitor/README.md +++ b/system/system_monitor/README.md @@ -67,6 +67,7 @@ Every topic is published in 1 minute interval. | | HDD WriteDataRate | ✓ | ✓ | ✓ | | | | HDD ReadIOPS | ✓ | ✓ | ✓ | | | | HDD WriteIOPS | ✓ | ✓ | ✓ | | +| | HDD Connection | ✓ | ✓ | ✓ | | | Memory Monitor | Memory Usage | ✓ | ✓ | ✓ | | | Net Monitor | Network Usage | ✓ | ✓ | ✓ | | | NTP Monitor | NTP Offset | ✓ | ✓ | ✓ | | diff --git a/system/system_monitor/config/hdd_monitor.param.yaml b/system/system_monitor/config/hdd_monitor.param.yaml index ee245df864839..d818d848be801 100644 --- a/system/system_monitor/config/hdd_monitor.param.yaml +++ b/system/system_monitor/config/hdd_monitor.param.yaml @@ -5,11 +5,15 @@ disks: # Until multi type lists are allowed, name N the disks as disk0...disk{N-1} disk0: name: / + temp_attribute_id: 0xC2 temp_warn: 55.0 temp_error: 70.0 + power_on_hours_attribute_id: 0x09 power_on_hours_warn: 3000000 + total_data_written_attribute_id: 0xF1 total_data_written_warn: 4915200 # =150TB (1unit=32MB) total_data_written_safety_factor: 0.05 + recovered_error_attribute_id: 0xC3 recovered_error_warn: 1 free_warn: 5120 # MB(8hour) free_error: 100 # MB(last 1 minute) diff --git a/system/system_monitor/docs/ros_parameters.md b/system/system_monitor/docs/ros_parameters.md index fe4aed9d421b0..988f1bb194751 100644 --- a/system/system_monitor/docs/ros_parameters.md +++ b/system/system_monitor/docs/ros_parameters.md @@ -25,12 +25,15 @@ hdd_monitor: | Name | Type | Unit | Default | Notes | | :------------------------------- | :----: | :---------------: | :-----: | :--------------------------------------------------------------------------------- | | name | string | n/a | none | The disk name to monitor temperature. (e.g. /dev/sda) | +| temp_attribute_id | int | n/a | 0xC2 | S.M.A.R.T attribute ID of temperature. | | temp_warn | float | DegC | 55.0 | Generates warning when HDD temperature reaches a specified value or higher. | | temp_error | float | DegC | 70.0 | Generates error when HDD temperature reaches a specified value or higher. | +| power_on_hours_attribute_id | int | n/a | 0x09 | S.M.A.R.T attribute ID of power-on hours. | | power_on_hours_warn | int | Hour | 3000000 | Generates warning when HDD power-on hours reaches a specified value or higher. | | total_data_written_attribute_id | int | n/a | 0xF1 | S.M.A.R.T attribute ID of total data written. | | total_data_written_warn | int | depends on device | 4915200 | Generates warning when HDD total data written reaches a specified value or higher. | | total_data_written_safety_factor | int | %(1e-2) | 0.05 | Safety factor of HDD total data written. | +| recovered_error_attribute_id | int | n/a | 0xC3 | S.M.A.R.T attribute ID of recovered error. | | recovered_error_warn | int | n/a | 1 | Generates warning when HDD recovered error reaches a specified value or higher. | | read_data_rate_warn | float | MB/s | 360.0 | Generates warning when HDD read data rate reaches a specified value or higher. | | write_data_rate_warn | float | MB/s | 103.5 | Generates warning when HDD write data rate reaches a specified value or higher. | diff --git a/system/system_monitor/docs/topics_hdd_monitor.md b/system/system_monitor/docs/topics_hdd_monitor.md index 0d3dc0ba13633..76db061fa744c 100644 --- a/system/system_monitor/docs/topics_hdd_monitor.md +++ b/system/system_monitor/docs/topics_hdd_monitor.md @@ -14,13 +14,13 @@ [values] -| key | value (example) | -| ---------------------- | -------------------------- | -| HDD [0-9]: status | OK / hot / critical hot | -| HDD [0-9]: name | /dev/nvme0 | -| HDD [0-9]: model | SAMSUNG MZVLB1T0HBLR-000L7 | -| HDD [0-9]: serial | S4EMNF0M820682 | -| HDD [0-9]: temperature | 37.0 DegC | +| key | value (example) | +| ---------------------- | ---------------------------- | +| HDD [0-9]: status | OK / hot / critical hot | +| HDD [0-9]: name | /dev/nvme0 | +| HDD [0-9]: model | SAMSUNG MZVLB1T0HBLR-000L7 | +| HDD [0-9]: serial | S4EMNF0M820682 | +| HDD [0-9]: temperature | 37.0 DegC
not available | ## HDD PowerOnHours @@ -35,13 +35,13 @@ [values] -| key | value (example) | -| ------------------------- | ----------------------- | -| HDD [0-9]: status | OK / lifetime limit | -| HDD [0-9]: name | /dev/nvme0 | -| HDD [0-9]: model | PHISON PS5012-E12S-512G | -| HDD [0-9]: serial | FB590709182505050767 | -| HDD [0-9]: power on hours | 4834 Hours | +| key | value (example) | +| ------------------------- | ----------------------------- | +| HDD [0-9]: status | OK / lifetime limit | +| HDD [0-9]: name | /dev/nvme0 | +| HDD [0-9]: model | PHISON PS5012-E12S-512G | +| HDD [0-9]: serial | FB590709182505050767 | +| HDD [0-9]: power on hours | 4834 Hours
not available | ## HDD TotalDataWritten @@ -184,3 +184,21 @@ | HDD [0-9]: status | OK / high IOPS of write | | HDD [0-9]: name | /dev/nvme0 | | HDD [0-9]: IOPS of write | 0.00 IOPS | + +## HDD Connection + +/diagnostics/hdd_monitor: HDD Connection + +[summary] + +| level | message | +| ----- | ------------- | +| OK | OK | +| ERROR | not connected | + +[values] + +| key | value (example) | +| ----------------- | ------------------ | +| HDD [0-9]: status | OK / not connected | +| HDD [0-9]: name | /dev/nvme0 | diff --git a/system/system_monitor/include/hdd_reader/hdd_reader.hpp b/system/system_monitor/include/hdd_reader/hdd_reader.hpp index 1a830311978f7..cf8001fbb59b7 100644 --- a/system/system_monitor/include/hdd_reader/hdd_reader.hpp +++ b/system/system_monitor/include/hdd_reader/hdd_reader.hpp @@ -28,17 +28,14 @@ #include #include -/** - * @brief ATA attribute IDs - */ -enum class ATAAttributeIDs : uint8_t { TEMPERATURE = 0, POWER_ON_HOURS = 1, SIZE }; - /** * @brief HDD device */ struct HDDDevice { - std::string name_; //!< @brief Device name + std::string name_; //!< @brief Device name + uint8_t temp_attribute_id_; //!< @brief S.M.A.R.T attribute ID of temperature + uint8_t power_on_hours_attribute_id_; //!< @brief S.M.A.R.T attribute ID of power on hours uint8_t total_data_written_attribute_id_; //!< @brief S.M.A.R.T attribute ID of total data written uint8_t recovered_error_attribute_id_; //!< @brief S.M.A.R.T attribute ID of recovered error @@ -54,6 +51,8 @@ struct HDDDevice void serialize(archive & ar, const unsigned /*version*/) // NOLINT(runtime/references) { ar & name_; + ar & temp_attribute_id_; + ar & power_on_hours_attribute_id_; ar & total_data_written_attribute_id_; ar & recovered_error_attribute_id_; } @@ -73,6 +72,8 @@ struct HDDInfo uint64_t power_on_hours_; //!< @brief power on hours count uint64_t total_data_written_; //!< @brief total data written uint32_t recovered_error_; //!< @brief recovered error count + bool is_valid_temp_; //!< @brief whether temp_ is valid value + bool is_valid_power_on_hours_; //!< @brief whether power_on_hours_ is valid value bool is_valid_total_data_written_; //!< @brief whether total_data_written_ is valid value bool is_valid_recovered_error_; //!< @brief whether recovered_error_ is valid value @@ -92,6 +93,8 @@ struct HDDInfo ar & temp_; ar & power_on_hours_; ar & total_data_written_; + ar & is_valid_temp_; + ar & is_valid_power_on_hours_; ar & is_valid_total_data_written_; ar & is_valid_recovered_error_; } diff --git a/system/system_monitor/include/system_monitor/hdd_monitor/hdd_monitor.hpp b/system/system_monitor/include/system_monitor/hdd_monitor/hdd_monitor.hpp index 6deb7f02c5728..ac62ab6eb234a 100644 --- a/system/system_monitor/include/system_monitor/hdd_monitor/hdd_monitor.hpp +++ b/system/system_monitor/include/system_monitor/hdd_monitor/hdd_monitor.hpp @@ -232,6 +232,15 @@ class HDDMonitor : public rclcpp::Node diagnostic_updater::DiagnosticStatusWrapper & stat, HDDStatItem item); // NOLINT(runtime/references) + /** + * @brief check HDD connection + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference + * to pass diagnostic message updated in this function to diagnostic publish calls. + */ + void checkConnection( + diagnostic_updater::DiagnosticStatusWrapper & stat); // NOLINT(runtime/references) + /** * @brief human readable size string to MB * @param [in] human readable size string @@ -289,6 +298,11 @@ class HDDMonitor : public rclcpp::Node */ int readSysfsDeviceStat(const std::string & device, SysfsDevStat & sfdevstat); + /** + * @brief update HDD connections + */ + void updateHDDConnections(); + diagnostic_updater::Updater updater_; //!< @brief Updater class which advertises to /diagnostics rclcpp::TimerBase::SharedPtr timer_; //!< @brief timer to get HDD information from HDDReader @@ -297,6 +311,8 @@ class HDDMonitor : public rclcpp::Node int hdd_reader_port_; //!< @brief port number to connect to hdd_reader std::map hdd_params_; //!< @brief list of error and warning levels std::vector hdd_devices_; //!< @brief list of devices + std::map + hdd_connected_flags_; //!< @brief list of flag whether HDD is connected std::map initial_recovered_errors_; //!< @brief list of initial recovered error count std::map hdd_stats_; //!< @brief list of HDD statistics @@ -350,6 +366,12 @@ class HDDMonitor : public rclcpp::Node {DiagStatus::WARN, "high IOPS of write"}, {DiagStatus::ERROR, "unused"}}, }; + + /** + * @brief HDD connection status messages + */ + const std::map connection_dict_ = { + {DiagStatus::OK, "OK"}, {DiagStatus::WARN, "unused"}, {DiagStatus::ERROR, "not connected"}}; }; #endif // SYSTEM_MONITOR__HDD_MONITOR__HDD_MONITOR_HPP_ diff --git a/system/system_monitor/reader/hdd_reader/hdd_reader.cpp b/system/system_monitor/reader/hdd_reader/hdd_reader.cpp index fa54bcd9c6550..55f7fe1668835 100644 --- a/system/system_monitor/reader/hdd_reader/hdd_reader.cpp +++ b/system/system_monitor/reader/hdd_reader/hdd_reader.cpp @@ -283,17 +283,21 @@ int get_ata_SMARTData(int fd, HDDInfo * info, const HDDDevice & device) return errno; } - std::bitset(ATAAttributeIDs::SIZE)> found_flag; + info->is_valid_temp_ = false; + info->is_valid_power_on_hours_ = false; info->is_valid_total_data_written_ = false; info->is_valid_recovered_error_ = false; // Retrieve S.M.A.R.T. Informations for (int i = 0; i < 30; ++i) { - if (data.attribute_entry_[i].attribute_id_ == 0xC2) { // Temperature - Device Internal + if (data.attribute_entry_[i].attribute_id_ == device.temp_attribute_id_) { // Temperature - + // Device Internal info->temp_ = static_cast(data.attribute_entry_[i].data_); - found_flag.set(static_cast(ATAAttributeIDs::TEMPERATURE)); - } else if (data.attribute_entry_[i].attribute_id_ == 0x09) { // Power-on Hours Count + info->is_valid_temp_ = true; + } else if ( + data.attribute_entry_[i].attribute_id_ == + device.power_on_hours_attribute_id_) { // Power-on Hours Count info->power_on_hours_ = data.attribute_entry_[i].data_; - found_flag.set(static_cast(ATAAttributeIDs::POWER_ON_HOURS)); + info->is_valid_power_on_hours_ = true; } else if ( data.attribute_entry_[i].attribute_id_ == device.total_data_written_attribute_id_) { // Total LBAs Written @@ -309,11 +313,7 @@ int get_ata_SMARTData(int fd, HDDInfo * info, const HDDDevice & device) } } - if (found_flag.all()) { - return EXIT_SUCCESS; - } - - return ENOENT; + return EXIT_SUCCESS; } /** @@ -390,6 +390,7 @@ int get_nvme_SMARTData(int fd, HDDInfo * info) // Bytes 2:1 Composite Temperature // Convert kelvin to celsius unsigned int temperature = ((data[2] << 8) | data[1]) - 273; + info->is_valid_temp_ = true; info->temp_ = static_cast(temperature); // Bytes 63:48 Data Units Written @@ -403,6 +404,7 @@ int get_nvme_SMARTData(int fd, HDDInfo * info) info->total_data_written_ = *(reinterpret_cast(&data[48])); // Bytes 143:128 Power On Hours + info->is_valid_power_on_hours_ = true; info->power_on_hours_ = *(reinterpret_cast(&data[128])); // NVMe S.M.A.R.T has no information of recovered error count diff --git a/system/system_monitor/src/hdd_monitor/hdd_monitor.cpp b/system/system_monitor/src/hdd_monitor/hdd_monitor.cpp index dd8415bd30e65..0bbefbef35b55 100644 --- a/system/system_monitor/src/hdd_monitor/hdd_monitor.cpp +++ b/system/system_monitor/src/hdd_monitor/hdd_monitor.cpp @@ -60,6 +60,10 @@ HDDMonitor::HDDMonitor(const rclcpp::NodeOptions & options) updater_.add("HDD WriteDataRate", this, &HDDMonitor::checkWriteDataRate); updater_.add("HDD ReadIOPS", this, &HDDMonitor::checkReadIOPS); updater_.add("HDD WriteIOPS", this, &HDDMonitor::checkWriteIOPS); + updater_.add("HDD Connection", this, &HDDMonitor::checkConnection); + + // get HDD connection status + updateHDDConnections(); // get HDD information from HDD reader for the first time updateHDDInfoList(); @@ -118,6 +122,10 @@ void HDDMonitor::checkSMART( std::string val_str = ""; for (auto itr = hdd_params_.begin(); itr != hdd_params_.end(); ++itr, ++index) { + if (!hdd_connected_flags_[itr->first]) { + continue; + } + // Retrieve HDD information auto hdd_itr = hdd_info_list_.find(itr->second.device_); if (hdd_itr == hdd_info_list_.end()) { @@ -147,7 +155,11 @@ void HDDMonitor::checkSMART( level = DiagStatus::WARN; } key_str = fmt::format("HDD {}: temperature", index); - val_str = fmt::format("{:.1f} DegC", temp); + if (hdd_itr->second.is_valid_temp_) { + val_str = fmt::format("{:.1f} DegC", temp); + } else { + val_str = "not available"; + } } break; case HDDSMARTInfoItem::POWER_ON_HOURS: { int64_t power_on_hours = static_cast(hdd_itr->second.power_on_hours_); @@ -157,7 +169,11 @@ void HDDMonitor::checkSMART( level = DiagStatus::WARN; } key_str = fmt::format("HDD {}: power on hours", index); - val_str = fmt::format("{} Hours", hdd_itr->second.power_on_hours_); + if (hdd_itr->second.is_valid_power_on_hours_) { + val_str = fmt::format("{} Hours", hdd_itr->second.power_on_hours_); + } else { + val_str = "not available"; + } } break; case HDDSMARTInfoItem::TOTAL_DATA_WRITTEN: { uint64_t total_data_written = static_cast(hdd_itr->second.total_data_written_); @@ -230,6 +246,10 @@ void HDDMonitor::checkUsage(diagnostic_updater::DiagnosticStatusWrapper & stat) std::string error_str = ""; for (auto itr = hdd_params_.begin(); itr != hdd_params_.end(); ++itr, ++hdd_index) { + if (!hdd_connected_flags_[itr->first]) { + continue; + } + // Get summary of disk space usage of ext4 bp::ipstream is_out; bp::ipstream is_err; @@ -348,6 +368,10 @@ void HDDMonitor::checkStatistics( std::string val_str = ""; for (auto itr = hdd_params_.begin(); itr != hdd_params_.end(); ++itr, ++hdd_index) { + if (!hdd_connected_flags_[itr->first]) { + continue; + } + int level = DiagStatus::OK; switch (item) { @@ -415,18 +439,44 @@ void HDDMonitor::checkStatistics( SystemMonitorUtility::stopMeasurement(t_start, stat); } +void HDDMonitor::checkConnection(diagnostic_updater::DiagnosticStatusWrapper & stat) +{ + // Remember start time to measure elapsed time + const auto t_start = SystemMonitorUtility::startMeasurement(); + + if (hdd_params_.empty()) { + stat.summary(DiagStatus::ERROR, "invalid disk parameter"); + return; + } + + int hdd_index = 0; + int whole_level = DiagStatus::OK; + + for (auto itr = hdd_params_.begin(); itr != hdd_params_.end(); ++itr, ++hdd_index) { + int level = DiagStatus::OK; + + if (!hdd_connected_flags_[itr->first]) { + level = DiagStatus::ERROR; + } + + stat.add(fmt::format("HDD {}: name", hdd_index), itr->second.device_.c_str()); + stat.add(fmt::format("HDD {}: status", hdd_index), connection_dict_.at(level)); + + whole_level = std::max(whole_level, level); + } + + stat.summary(whole_level, connection_dict_.at(whole_level)); + + // Measure elapsed time since start time and report + SystemMonitorUtility::stopMeasurement(t_start, stat); +} + void HDDMonitor::getHDDParams() { const auto num_disks = this->declare_parameter("num_disks", 0); for (auto i = 0; i < num_disks; ++i) { const auto prefix = "disks.disk" + std::to_string(i); - const auto name = declare_parameter(prefix + ".name", "/"); - - // Get device name from mount point - const auto device_name = getDeviceFromMountPoint(name); - if (device_name.empty()) { - continue; - } + const auto device_name = declare_parameter(prefix + ".name", "/"); HDDParam param; param.temp_warn_ = declare_parameter(prefix + ".temp_warn", 55.0f); @@ -458,6 +508,10 @@ void HDDMonitor::getHDDParams() HDDDevice device; device.name_ = param.device_; + device.temp_attribute_id_ = + static_cast(declare_parameter(prefix + ".temp_attribute_id", 0xC2)); + device.power_on_hours_attribute_id_ = + static_cast(declare_parameter(prefix + ".power_on_hours_attribute_id", 0x09)); device.total_data_written_attribute_id_ = static_cast( declare_parameter(prefix + ".total_data_written_attribute_id", 0xF1)); device.recovered_error_attribute_id_ = @@ -497,6 +551,7 @@ std::string HDDMonitor::getDeviceFromMountPoint(const std::string & mount_point) void HDDMonitor::onTimer() { + updateHDDConnections(); updateHDDInfoList(); updateHDDStatistics(); } @@ -595,8 +650,10 @@ void HDDMonitor::updateHDDInfoList() void HDDMonitor::startHDDTransferMeasurement() { for (auto & hdd_stat : hdd_stats_) { + hdd_stat.second.error_str_ = ""; SysfsDevStat sfdevstat; if (readSysfsDeviceStat(hdd_stat.second.device_, sfdevstat)) { + hdd_stat.second.error_str_ = "stat file read error"; continue; } hdd_stat.second.last_sfdevstat_ = sfdevstat; @@ -610,8 +667,10 @@ void HDDMonitor::updateHDDStatistics() double duration_sec = (this->now() - last_hdd_stat_update_time_).seconds(); for (auto & hdd_stat : hdd_stats_) { + hdd_stat.second.error_str_ = ""; SysfsDevStat sfdevstat; if (readSysfsDeviceStat(hdd_stat.second.device_, sfdevstat)) { + hdd_stat.second.error_str_ = "stat file read error"; continue; } @@ -675,5 +734,16 @@ int HDDMonitor::readSysfsDeviceStat(const std::string & device, SysfsDevStat & s return ret; } +void HDDMonitor::updateHDDConnections() +{ + for (const auto & hdd_param : hdd_params_) { + if (!getDeviceFromMountPoint(hdd_param.first).empty()) { + hdd_connected_flags_[hdd_param.first] = true; + } else { + hdd_connected_flags_[hdd_param.first] = false; + } + } +} + #include RCLCPP_COMPONENTS_REGISTER_NODE(HDDMonitor) From 4aeb4d4114f19e70483732cd7ed9f53911c9f211 Mon Sep 17 00:00:00 2001 From: v-nakayama7440-esol Date: Thu, 28 Apr 2022 10:25:21 +0900 Subject: [PATCH 05/10] resolve conflicts Signed-off-by: v-nakayama7440-esol --- .../config/diagnostic_aggregator/system.param.yaml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/system/system_error_monitor/config/diagnostic_aggregator/system.param.yaml b/system/system_error_monitor/config/diagnostic_aggregator/system.param.yaml index 56047e79c5fe9..19e08a1b41a4f 100644 --- a/system/system_error_monitor/config/diagnostic_aggregator/system.param.yaml +++ b/system/system_error_monitor/config/diagnostic_aggregator/system.param.yaml @@ -168,6 +168,18 @@ contains: [": HDD Usage"] timeout: 3.0 + power_on_hours: + type: diagnostic_aggregator/GenericAnalyzer + path: usage + contains: [": HDD PowerOnHours"] + timeout: 3.0 + + total_data_written: + type: diagnostic_aggregator/GenericAnalyzer + path: usage + contains: [": HDD TotalDataWritten"] + timeout: 3.0 + connection: type: diagnostic_aggregator/GenericAnalyzer path: connection From 283f885f1b686f16fdd15346cad147dda8a85a53 Mon Sep 17 00:00:00 2001 From: v-nakayama7440-esol Date: Mon, 11 Jul 2022 09:41:21 +0900 Subject: [PATCH 06/10] fix bug when setting mount point of HDD Monitor Signed-off-by: v-nakayama7440-esol --- .../system_monitor/docs/topics_hdd_monitor.md | 9 +- .../include/hdd_reader/hdd_reader.hpp | 1 + .../hdd_monitor/hdd_monitor.hpp | 15 ++- .../src/hdd_monitor/hdd_monitor.cpp | 98 ++++++++++++------- 4 files changed, 82 insertions(+), 41 deletions(-) diff --git a/system/system_monitor/docs/topics_hdd_monitor.md b/system/system_monitor/docs/topics_hdd_monitor.md index 76db061fa744c..7705fcea81091 100644 --- a/system/system_monitor/docs/topics_hdd_monitor.md +++ b/system/system_monitor/docs/topics_hdd_monitor.md @@ -198,7 +198,8 @@ [values] -| key | value (example) | -| ----------------- | ------------------ | -| HDD [0-9]: status | OK / not connected | -| HDD [0-9]: name | /dev/nvme0 | +| key | value (example) | +| ---------------------- | ------------------ | +| HDD [0-9]: status | OK / not connected | +| HDD [0-9]: name | /dev/nvme0 | +| HDD [0-9]: mount point | / | diff --git a/system/system_monitor/include/hdd_reader/hdd_reader.hpp b/system/system_monitor/include/hdd_reader/hdd_reader.hpp index cf8001fbb59b7..6913a2d167040 100644 --- a/system/system_monitor/include/hdd_reader/hdd_reader.hpp +++ b/system/system_monitor/include/hdd_reader/hdd_reader.hpp @@ -93,6 +93,7 @@ struct HDDInfo ar & temp_; ar & power_on_hours_; ar & total_data_written_; + ar & recovered_error_; ar & is_valid_temp_; ar & is_valid_power_on_hours_; ar & is_valid_total_data_written_; diff --git a/system/system_monitor/include/system_monitor/hdd_monitor/hdd_monitor.hpp b/system/system_monitor/include/system_monitor/hdd_monitor/hdd_monitor.hpp index ac62ab6eb234a..af8f18f8f62b5 100644 --- a/system/system_monitor/include/system_monitor/hdd_monitor/hdd_monitor.hpp +++ b/system/system_monitor/include/system_monitor/hdd_monitor/hdd_monitor.hpp @@ -33,7 +33,8 @@ */ struct HDDParam { - std::string device_; //!< @brief device + std::string part_device_; //!< @brief partition device + std::string disk_device_; //!< @brief disk device float temp_warn_; //!< @brief HDD temperature(DegC) to generate warning float temp_error_; //!< @brief HDD temperature(DegC) to generate error int power_on_hours_warn_; //!< @brief HDD power on hours to generate warning @@ -46,6 +47,11 @@ struct HDDParam float write_data_rate_warn_; //!< @brief HDD data rate(MB/s) of write to generate warning float read_iops_warn_; //!< @brief HDD IOPS of read to generate warning float write_iops_warn_; //!< @brief HDD IOPS of write to generate warning + uint8_t temp_attribute_id_; //!< @brief S.M.A.R.T attribute ID of temperature + uint8_t power_on_hours_attribute_id_; //!< @brief S.M.A.R.T attribute ID of power on hours + uint8_t + total_data_written_attribute_id_; //!< @brief S.M.A.R.T attribute ID of total data written + uint8_t recovered_error_attribute_id_; //!< @brief S.M.A.R.T attribute ID of recovered error HDDParam() : temp_warn_(55.0), @@ -59,7 +65,11 @@ struct HDDParam read_data_rate_warn_(360.0), write_data_rate_warn_(103.5), read_iops_warn_(63360.0), - write_iops_warn_(24120.0) + write_iops_warn_(24120.0), + temp_attribute_id_(0xC2), + power_on_hours_attribute_id_(0x09), + total_data_written_attribute_id_(0xF1), + recovered_error_attribute_id_(0xC3) { } }; @@ -310,7 +320,6 @@ class HDDMonitor : public rclcpp::Node int hdd_reader_port_; //!< @brief port number to connect to hdd_reader std::map hdd_params_; //!< @brief list of error and warning levels - std::vector hdd_devices_; //!< @brief list of devices std::map hdd_connected_flags_; //!< @brief list of flag whether HDD is connected std::map diff --git a/system/system_monitor/src/hdd_monitor/hdd_monitor.cpp b/system/system_monitor/src/hdd_monitor/hdd_monitor.cpp index 0bbefbef35b55..8185b81aa9787 100644 --- a/system/system_monitor/src/hdd_monitor/hdd_monitor.cpp +++ b/system/system_monitor/src/hdd_monitor/hdd_monitor.cpp @@ -127,10 +127,10 @@ void HDDMonitor::checkSMART( } // Retrieve HDD information - auto hdd_itr = hdd_info_list_.find(itr->second.device_); + auto hdd_itr = hdd_info_list_.find(itr->second.disk_device_); if (hdd_itr == hdd_info_list_.end()) { stat.add(fmt::format("HDD {}: status", index), "hdd_reader error"); - stat.add(fmt::format("HDD {}: name", index), itr->first.c_str()); + stat.add(fmt::format("HDD {}: name", index), itr->second.part_device_.c_str()); stat.add(fmt::format("HDD {}: hdd_reader", index), strerror(ENOENT)); error_str = "hdd_reader error"; continue; @@ -138,7 +138,7 @@ void HDDMonitor::checkSMART( if (hdd_itr->second.error_code_ != 0) { stat.add(fmt::format("HDD {}: status", index), "hdd_reader error"); - stat.add(fmt::format("HDD {}: name", index), itr->first.c_str()); + stat.add(fmt::format("HDD {}: name", index), itr->second.part_device_.c_str()); stat.add(fmt::format("HDD {}: hdd_reader", index), strerror(hdd_itr->second.error_code_)); error_str = "hdd_reader error"; continue; @@ -213,7 +213,7 @@ void HDDMonitor::checkSMART( stat.add( fmt::format("HDD {}: status", index), smart_dicts_[static_cast(item)].at(level)); - stat.add(fmt::format("HDD {}: name", index), itr->second.device_.c_str()); + stat.add(fmt::format("HDD {}: name", index), itr->second.disk_device_.c_str()); stat.add(fmt::format("HDD {}: model", index), hdd_itr->second.model_.c_str()); stat.add(fmt::format("HDD {}: serial", index), hdd_itr->second.serial_.c_str()); stat.addf(key_str, val_str.c_str()); @@ -255,8 +255,8 @@ void HDDMonitor::checkUsage(diagnostic_updater::DiagnosticStatusWrapper & stat) bp::ipstream is_err; // Invoke shell to use shell wildcard expansion bp::child c( - "/bin/sh", "-c", fmt::format("df -Pm {}*", itr->first.c_str()), bp::std_out > is_out, - bp::std_err > is_err); + "/bin/sh", "-c", fmt::format("df -Pm {}*", itr->second.part_device_.c_str()), + bp::std_out > is_out, bp::std_err > is_err); c.wait(); if (c.exit_code() != 0) { @@ -264,7 +264,7 @@ void HDDMonitor::checkUsage(diagnostic_updater::DiagnosticStatusWrapper & stat) is_err >> os.rdbuf(); error_str = "df error"; stat.add(fmt::format("HDD {}: status", hdd_index), "df error"); - stat.add(fmt::format("HDD {}: name", hdd_index), itr->first.c_str()); + stat.add(fmt::format("HDD {}: name", hdd_index), itr->second.part_device_.c_str()); stat.add(fmt::format("HDD {}: df", hdd_index), os.str().c_str()); continue; } @@ -415,14 +415,15 @@ void HDDMonitor::checkStatistics( break; } - stat.add(fmt::format("HDD {}: name", hdd_index), itr->second.device_.c_str()); if (!hdd_stats_[itr->first].error_str_.empty()) { error_str = hdd_stats_[itr->first].error_str_; stat.add(fmt::format("HDD {}: status", hdd_index), error_str); + stat.add(fmt::format("HDD {}: name", hdd_index), itr->second.disk_device_.c_str()); } else { stat.add( fmt::format("HDD {}: status", hdd_index), stat_dicts_[static_cast(item)].at(level)); + stat.add(fmt::format("HDD {}: name", hdd_index), itr->second.disk_device_.c_str()); stat.add(key_str, val_str.c_str()); } @@ -459,8 +460,9 @@ void HDDMonitor::checkConnection(diagnostic_updater::DiagnosticStatusWrapper & s level = DiagStatus::ERROR; } - stat.add(fmt::format("HDD {}: name", hdd_index), itr->second.device_.c_str()); stat.add(fmt::format("HDD {}: status", hdd_index), connection_dict_.at(level)); + stat.add(fmt::format("HDD {}: name", hdd_index), itr->second.disk_device_); + stat.add(fmt::format("HDD {}: mount point", hdd_index), itr->first.c_str()); whole_level = std::max(whole_level, level); } @@ -476,7 +478,7 @@ void HDDMonitor::getHDDParams() const auto num_disks = this->declare_parameter("num_disks", 0); for (auto i = 0; i < num_disks; ++i) { const auto prefix = "disks.disk" + std::to_string(i); - const auto device_name = declare_parameter(prefix + ".name", "/"); + const auto mount_point = declare_parameter(prefix + ".name", "/"); HDDParam param; param.temp_warn_ = declare_parameter(prefix + ".temp_warn", 55.0f); @@ -495,33 +497,19 @@ void HDDMonitor::getHDDParams() param.write_data_rate_warn_ = declare_parameter(prefix + ".write_data_rate_warn", 103.5); param.read_iops_warn_ = declare_parameter(prefix + ".read_iops_warn", 63360.0); param.write_iops_warn_ = declare_parameter(prefix + ".write_iops_warn", 24120.0); - - // Remove index number of partition for passing device name to hdd-reader - if (boost::starts_with(device_name, "/dev/sd")) { - const std::regex pattern("\\d+$"); - param.device_ = std::regex_replace(device_name, pattern, ""); - } else if (boost::starts_with(device_name, "/dev/nvme")) { - const std::regex pattern("p\\d+$"); - param.device_ = std::regex_replace(device_name, pattern, ""); - } - hdd_params_[device_name] = param; - - HDDDevice device; - device.name_ = param.device_; - device.temp_attribute_id_ = + param.temp_attribute_id_ = static_cast(declare_parameter(prefix + ".temp_attribute_id", 0xC2)); - device.power_on_hours_attribute_id_ = + param.power_on_hours_attribute_id_ = static_cast(declare_parameter(prefix + ".power_on_hours_attribute_id", 0x09)); - device.total_data_written_attribute_id_ = static_cast( + param.total_data_written_attribute_id_ = static_cast( declare_parameter(prefix + ".total_data_written_attribute_id", 0xF1)); - device.recovered_error_attribute_id_ = + param.recovered_error_attribute_id_ = static_cast(declare_parameter(prefix + ".recovered_error_attribute_id", 0xC3)); - hdd_devices_.push_back(device); + + hdd_params_[mount_point] = param; HDDStat stat; - const std::regex raw_pattern(".*/"); - stat.device_ = std::regex_replace(param.device_, raw_pattern, ""); - hdd_stats_[device_name] = stat; + hdd_stats_[mount_point] = stat; } } @@ -596,9 +584,24 @@ void HDDMonitor::updateHDDInfoList() return; } + std::vector hdd_devices; + for (auto itr = hdd_params_.begin(); itr != hdd_params_.end(); ++itr) { + if (!hdd_connected_flags_[itr->first]) { + continue; + } + + HDDDevice device; + device.name_ = itr->second.disk_device_; + device.temp_attribute_id_ = itr->second.temp_attribute_id_; + device.power_on_hours_attribute_id_ = itr->second.power_on_hours_attribute_id_; + device.total_data_written_attribute_id_ = itr->second.total_data_written_attribute_id_; + device.recovered_error_attribute_id_ = itr->second.recovered_error_attribute_id_; + hdd_devices.push_back(device); + } + std::ostringstream oss; boost::archive::text_oarchive oa(oss); - oa & hdd_devices_; + oa & hdd_devices; // Write list of devices to FD ret = write(sock, oss.str().c_str(), oss.str().length()); @@ -651,6 +654,11 @@ void HDDMonitor::startHDDTransferMeasurement() { for (auto & hdd_stat : hdd_stats_) { hdd_stat.second.error_str_ = ""; + + if (!hdd_connected_flags_[hdd_stat.first]) { + continue; + } + SysfsDevStat sfdevstat; if (readSysfsDeviceStat(hdd_stat.second.device_, sfdevstat)) { hdd_stat.second.error_str_ = "stat file read error"; @@ -668,6 +676,11 @@ void HDDMonitor::updateHDDStatistics() for (auto & hdd_stat : hdd_stats_) { hdd_stat.second.error_str_ = ""; + + if (!hdd_connected_flags_[hdd_stat.first]) { + continue; + } + SysfsDevStat sfdevstat; if (readSysfsDeviceStat(hdd_stat.second.device_, sfdevstat)) { hdd_stat.second.error_str_ = "stat file read error"; @@ -736,9 +749,26 @@ int HDDMonitor::readSysfsDeviceStat(const std::string & device, SysfsDevStat & s void HDDMonitor::updateHDDConnections() { - for (const auto & hdd_param : hdd_params_) { - if (!getDeviceFromMountPoint(hdd_param.first).empty()) { + for (auto & hdd_param : hdd_params_) { + // Get device name from mount point + hdd_param.second.part_device_ = getDeviceFromMountPoint(hdd_param.first); + if (!hdd_param.second.part_device_.empty()) { hdd_connected_flags_[hdd_param.first] = true; + + // Remove index number of partition for passing device name to hdd-reader + if (boost::starts_with(hdd_param.second.part_device_, "/dev/sd")) { + const std::regex pattern("\\d+$"); + hdd_param.second.disk_device_ = + std::regex_replace(hdd_param.second.part_device_, pattern, ""); + } else if (boost::starts_with(hdd_param.second.part_device_, "/dev/nvme")) { + const std::regex pattern("p\\d+$"); + hdd_param.second.disk_device_ = + std::regex_replace(hdd_param.second.part_device_, pattern, ""); + } + + const std::regex raw_pattern(".*/"); + hdd_stats_[hdd_param.first].device_ = + std::regex_replace(hdd_param.second.disk_device_, raw_pattern, ""); } else { hdd_connected_flags_[hdd_param.first] = false; } From bc3ff3527fb06373f89ce3972d33fba41abb95f3 Mon Sep 17 00:00:00 2001 From: v-nakayama7440-esol Date: Fri, 9 Sep 2022 09:26:21 +0900 Subject: [PATCH 07/10] fix(system_monitor): level change when not connected and unmount function added in HDD connection monitoring Signed-off-by: v-nakayama7440-esol --- .../hdd_monitor/hdd_monitor.hpp | 9 ++- .../src/hdd_monitor/hdd_monitor.cpp | 80 ++++++++++++++----- 2 files changed, 70 insertions(+), 19 deletions(-) diff --git a/system/system_monitor/include/system_monitor/hdd_monitor/hdd_monitor.hpp b/system/system_monitor/include/system_monitor/hdd_monitor/hdd_monitor.hpp index af8f18f8f62b5..a1de14fc28e73 100644 --- a/system/system_monitor/include/system_monitor/hdd_monitor/hdd_monitor.hpp +++ b/system/system_monitor/include/system_monitor/hdd_monitor/hdd_monitor.hpp @@ -313,6 +313,13 @@ class HDDMonitor : public rclcpp::Node */ void updateHDDConnections(); + /** + * @brief unmount device with lazy option + * @param [in] device device name + * @return result of success or failure + */ + int unmountDeviceWithLazy(std::string & device); + diagnostic_updater::Updater updater_; //!< @brief Updater class which advertises to /diagnostics rclcpp::TimerBase::SharedPtr timer_; //!< @brief timer to get HDD information from HDDReader @@ -380,7 +387,7 @@ class HDDMonitor : public rclcpp::Node * @brief HDD connection status messages */ const std::map connection_dict_ = { - {DiagStatus::OK, "OK"}, {DiagStatus::WARN, "unused"}, {DiagStatus::ERROR, "not connected"}}; + {DiagStatus::OK, "OK"}, {DiagStatus::WARN, "not connected"}, {DiagStatus::ERROR, "unused"}}; }; #endif // SYSTEM_MONITOR__HDD_MONITOR__HDD_MONITOR_HPP_ diff --git a/system/system_monitor/src/hdd_monitor/hdd_monitor.cpp b/system/system_monitor/src/hdd_monitor/hdd_monitor.cpp index 8185b81aa9787..1a18e7f76e7bd 100644 --- a/system/system_monitor/src/hdd_monitor/hdd_monitor.cpp +++ b/system/system_monitor/src/hdd_monitor/hdd_monitor.cpp @@ -33,6 +33,7 @@ #include #include +#include #include #include @@ -457,7 +458,7 @@ void HDDMonitor::checkConnection(diagnostic_updater::DiagnosticStatusWrapper & s int level = DiagStatus::OK; if (!hdd_connected_flags_[itr->first]) { - level = DiagStatus::ERROR; + level = DiagStatus::WARN; } stat.add(fmt::format("HDD {}: status", hdd_index), connection_dict_.at(level)); @@ -753,26 +754,69 @@ void HDDMonitor::updateHDDConnections() // Get device name from mount point hdd_param.second.part_device_ = getDeviceFromMountPoint(hdd_param.first); if (!hdd_param.second.part_device_.empty()) { - hdd_connected_flags_[hdd_param.first] = true; - - // Remove index number of partition for passing device name to hdd-reader - if (boost::starts_with(hdd_param.second.part_device_, "/dev/sd")) { - const std::regex pattern("\\d+$"); - hdd_param.second.disk_device_ = - std::regex_replace(hdd_param.second.part_device_, pattern, ""); - } else if (boost::starts_with(hdd_param.second.part_device_, "/dev/nvme")) { - const std::regex pattern("p\\d+$"); - hdd_param.second.disk_device_ = - std::regex_replace(hdd_param.second.part_device_, pattern, ""); - } + // Check the existence of device file + std::error_code ec; + if (std::filesystem::exists(hdd_param.second.part_device_, ec)) { + hdd_connected_flags_[hdd_param.first] = true; + + // Remove index number of partition for passing device name to hdd-reader + if (boost::starts_with(hdd_param.second.part_device_, "/dev/sd")) { + const std::regex pattern("\\d+$"); + hdd_param.second.disk_device_ = + std::regex_replace(hdd_param.second.part_device_, pattern, ""); + } else if (boost::starts_with(hdd_param.second.part_device_, "/dev/nvme")) { + const std::regex pattern("p\\d+$"); + hdd_param.second.disk_device_ = + std::regex_replace(hdd_param.second.part_device_, pattern, ""); + } - const std::regex raw_pattern(".*/"); - hdd_stats_[hdd_param.first].device_ = - std::regex_replace(hdd_param.second.disk_device_, raw_pattern, ""); - } else { - hdd_connected_flags_[hdd_param.first] = false; + const std::regex raw_pattern(".*/"); + hdd_stats_[hdd_param.first].device_ = + std::regex_replace(hdd_param.second.disk_device_, raw_pattern, ""); + + continue; + } else { + // Unmount to solve the state where the device is mounted without existing + if (unmountDeviceWithLazy(hdd_param.second.part_device_)) { + RCLCPP_ERROR( + get_logger(), "Failed to unmount. %s", hdd_param.second.part_device_.c_str()); + } + } } + hdd_connected_flags_[hdd_param.first] = false; + } +} + +int HDDMonitor::unmountDeviceWithLazy(std::string & device) +{ + // boost::process create file descriptor without O_CLOEXEC required for multithreading. + // So create file descriptor with O_CLOEXEC and pass it to boost::process. + int out_fd[2]; + if (pipe2(out_fd, O_CLOEXEC) != 0) { + RCLCPP_ERROR(get_logger(), "Failed to execute pipe2. %s", strerror(errno)); + return -1; + } + bp::pipe out_pipe{out_fd[0], out_fd[1]}; + bp::ipstream is_out{std::move(out_pipe)}; + + int err_fd[2]; + if (pipe2(err_fd, O_CLOEXEC) != 0) { + RCLCPP_ERROR(get_logger(), "Failed to execute pipe2. %s", strerror(errno)); + return -1; + } + bp::pipe err_pipe{err_fd[0], err_fd[1]}; + bp::ipstream is_err{std::move(err_pipe)}; + + bp::child c( + "/bin/sh", "-c", fmt::format("umount -l {}", device.c_str()), bp::std_out > is_out, + bp::std_err > is_err); + c.wait(); + + if (c.exit_code() != 0) { + RCLCPP_ERROR(get_logger(), "Failed to execute umount command. %s", device.c_str()); + return -1; } + return 0; } #include From 6a17a655cff428cf4080087019f4eb5ef3989cc2 Mon Sep 17 00:00:00 2001 From: v-nakayama7440-esol Date: Fri, 9 Sep 2022 09:38:59 +0900 Subject: [PATCH 08/10] fix(system_monitor): level change when not connected in HDD connection monitoring Signed-off-by: v-nakayama7440-esol --- system/system_monitor/docs/topics_hdd_monitor.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/system/system_monitor/docs/topics_hdd_monitor.md b/system/system_monitor/docs/topics_hdd_monitor.md index 7705fcea81091..31301177f38f1 100644 --- a/system/system_monitor/docs/topics_hdd_monitor.md +++ b/system/system_monitor/docs/topics_hdd_monitor.md @@ -194,7 +194,7 @@ | level | message | | ----- | ------------- | | OK | OK | -| ERROR | not connected | +| WARN | not connected | [values] From b679b8a57aff29f39d265a14a8892382e4d0bc14 Mon Sep 17 00:00:00 2001 From: v-nakayama7440-esol Date: Tue, 13 Sep 2022 17:27:42 +0900 Subject: [PATCH 09/10] fix(system_monitor): unmount function added in hdd_reader Signed-off-by: v-nakayama7440-esol --- .../include/hdd_reader/hdd_reader.hpp | 5 ++ .../hdd_monitor/hdd_monitor.hpp | 9 +-- .../reader/hdd_reader/hdd_reader.cpp | 26 ++++++++ .../src/hdd_monitor/hdd_monitor.cpp | 66 ++++++------------- 4 files changed, 52 insertions(+), 54 deletions(-) diff --git a/system/system_monitor/include/hdd_reader/hdd_reader.hpp b/system/system_monitor/include/hdd_reader/hdd_reader.hpp index 6913a2d167040..9b34a3ece413b 100644 --- a/system/system_monitor/include/hdd_reader/hdd_reader.hpp +++ b/system/system_monitor/include/hdd_reader/hdd_reader.hpp @@ -40,6 +40,9 @@ struct HDDDevice total_data_written_attribute_id_; //!< @brief S.M.A.R.T attribute ID of total data written uint8_t recovered_error_attribute_id_; //!< @brief S.M.A.R.T attribute ID of recovered error + uint8_t unmount_request_flag_; //!< @brief unmount request flag + std::string part_device_; //!< @brief partition device + /** * @brief Load or save data members. * @param [inout] ar archive reference to load or save the serialized data members @@ -55,6 +58,8 @@ struct HDDDevice ar & power_on_hours_attribute_id_; ar & total_data_written_attribute_id_; ar & recovered_error_attribute_id_; + ar & unmount_request_flag_; + ar & part_device_; } }; diff --git a/system/system_monitor/include/system_monitor/hdd_monitor/hdd_monitor.hpp b/system/system_monitor/include/system_monitor/hdd_monitor/hdd_monitor.hpp index a1de14fc28e73..8eac2c632bf16 100644 --- a/system/system_monitor/include/system_monitor/hdd_monitor/hdd_monitor.hpp +++ b/system/system_monitor/include/system_monitor/hdd_monitor/hdd_monitor.hpp @@ -313,13 +313,6 @@ class HDDMonitor : public rclcpp::Node */ void updateHDDConnections(); - /** - * @brief unmount device with lazy option - * @param [in] device device name - * @return result of success or failure - */ - int unmountDeviceWithLazy(std::string & device); - diagnostic_updater::Updater updater_; //!< @brief Updater class which advertises to /diagnostics rclcpp::TimerBase::SharedPtr timer_; //!< @brief timer to get HDD information from HDDReader @@ -329,6 +322,8 @@ class HDDMonitor : public rclcpp::Node std::map hdd_params_; //!< @brief list of error and warning levels std::map hdd_connected_flags_; //!< @brief list of flag whether HDD is connected + std::map + device_unmount_request_flags_; //!< @brief list of flag requesting device unmount std::map initial_recovered_errors_; //!< @brief list of initial recovered error count std::map hdd_stats_; //!< @brief list of HDD statistics diff --git a/system/system_monitor/reader/hdd_reader/hdd_reader.cpp b/system/system_monitor/reader/hdd_reader/hdd_reader.cpp index 706be5685f991..e07f66e25046e 100644 --- a/system/system_monitor/reader/hdd_reader/hdd_reader.cpp +++ b/system/system_monitor/reader/hdd_reader/hdd_reader.cpp @@ -24,9 +24,11 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -405,6 +407,25 @@ int get_nvme_SMARTData(int fd, HDDInfo * info) return EXIT_SUCCESS; } +/** + * @brief unmount device with lazy option + * @param [in] device device name + */ +void unmount_device_with_lazy(std::string & device) +{ + boost::process::ipstream is_out; + boost::process::ipstream is_err; + + boost::process::child c( + "/bin/sh", "-c", fmt::format("umount -l {}", device.c_str()), boost::process::std_out > is_out, + boost::process::std_err > is_err); + c.wait(); + + if (c.exit_code() != 0) { + syslog(LOG_ERR, "Failed to execute umount command. %s\n", device.c_str()); + } +} + /** * @brief check HDD temperature * @param [in] port port to listen @@ -499,6 +520,11 @@ void run(int port) boost::archive::text_oarchive oa(oss); for (auto & hdd_device : hdd_devices) { + if (hdd_device.unmount_request_flag_) { + unmount_device_with_lazy(hdd_device.part_device_); + continue; + } + HDDInfo info{}; // Open a file diff --git a/system/system_monitor/src/hdd_monitor/hdd_monitor.cpp b/system/system_monitor/src/hdd_monitor/hdd_monitor.cpp index 2e66471cb9f1e..624635028190a 100644 --- a/system/system_monitor/src/hdd_monitor/hdd_monitor.cpp +++ b/system/system_monitor/src/hdd_monitor/hdd_monitor.cpp @@ -626,16 +626,22 @@ void HDDMonitor::updateHDDInfoList() std::vector hdd_devices; for (auto itr = hdd_params_.begin(); itr != hdd_params_.end(); ++itr) { - if (!hdd_connected_flags_[itr->first]) { + HDDDevice device; + + if (hdd_connected_flags_[itr->first]) { + device.name_ = itr->second.disk_device_; + device.temp_attribute_id_ = itr->second.temp_attribute_id_; + device.power_on_hours_attribute_id_ = itr->second.power_on_hours_attribute_id_; + device.total_data_written_attribute_id_ = itr->second.total_data_written_attribute_id_; + device.recovered_error_attribute_id_ = itr->second.recovered_error_attribute_id_; + device.unmount_request_flag_ = 0; + } else if (device_unmount_request_flags_[itr->first]) { + device.part_device_ = itr->second.part_device_; + device.unmount_request_flag_ = 1; + } else { continue; } - HDDDevice device; - device.name_ = itr->second.disk_device_; - device.temp_attribute_id_ = itr->second.temp_attribute_id_; - device.power_on_hours_attribute_id_ = itr->second.power_on_hours_attribute_id_; - device.total_data_written_attribute_id_ = itr->second.total_data_written_attribute_id_; - device.recovered_error_attribute_id_ = itr->second.recovered_error_attribute_id_; hdd_devices.push_back(device); } @@ -790,6 +796,9 @@ int HDDMonitor::readSysfsDeviceStat(const std::string & device, SysfsDevStat & s void HDDMonitor::updateHDDConnections() { for (auto & hdd_param : hdd_params_) { + hdd_connected_flags_[hdd_param.first] = false; + device_unmount_request_flags_[hdd_param.first] = false; + // Get device name from mount point hdd_param.second.part_device_ = getDeviceFromMountPoint(hdd_param.first); if (!hdd_param.second.part_device_.empty()) { @@ -812,50 +821,13 @@ void HDDMonitor::updateHDDConnections() const std::regex raw_pattern(".*/"); hdd_stats_[hdd_param.first].device_ = std::regex_replace(hdd_param.second.disk_device_, raw_pattern, ""); - - continue; } else { - // Unmount to solve the state where the device is mounted without existing - if (unmountDeviceWithLazy(hdd_param.second.part_device_)) { - RCLCPP_ERROR( - get_logger(), "Failed to unmount. %s", hdd_param.second.part_device_.c_str()); - } + // Deal with the issue that file system remains mounted when a drive is actually + // disconnected. + device_unmount_request_flags_[hdd_param.first] = true; } } - hdd_connected_flags_[hdd_param.first] = false; - } -} - -int HDDMonitor::unmountDeviceWithLazy(std::string & device) -{ - // boost::process create file descriptor without O_CLOEXEC required for multithreading. - // So create file descriptor with O_CLOEXEC and pass it to boost::process. - int out_fd[2]; - if (pipe2(out_fd, O_CLOEXEC) != 0) { - RCLCPP_ERROR(get_logger(), "Failed to execute pipe2. %s", strerror(errno)); - return -1; - } - bp::pipe out_pipe{out_fd[0], out_fd[1]}; - bp::ipstream is_out{std::move(out_pipe)}; - - int err_fd[2]; - if (pipe2(err_fd, O_CLOEXEC) != 0) { - RCLCPP_ERROR(get_logger(), "Failed to execute pipe2. %s", strerror(errno)); - return -1; - } - bp::pipe err_pipe{err_fd[0], err_fd[1]}; - bp::ipstream is_err{std::move(err_pipe)}; - - bp::child c( - "/bin/sh", "-c", fmt::format("umount -l {}", device.c_str()), bp::std_out > is_out, - bp::std_err > is_err); - c.wait(); - - if (c.exit_code() != 0) { - RCLCPP_ERROR(get_logger(), "Failed to execute umount command. %s", device.c_str()); - return -1; } - return 0; } #include From c97b8bc4436976d2d76d0efc701bf5ba54404eec Mon Sep 17 00:00:00 2001 From: v-nakayama7440-esol Date: Fri, 16 Sep 2022 17:17:45 +0900 Subject: [PATCH 10/10] fix(system_monitor): separate S.M.A.R.T. request and lazy unmount request for hdd_reader Signed-off-by: v-nakayama7440-esol --- .../include/hdd_reader/hdd_reader.hpp | 34 ++- .../hdd_monitor/hdd_monitor.hpp | 9 +- .../reader/hdd_reader/hdd_reader.cpp | 212 +++++++++++------- .../src/hdd_monitor/hdd_monitor.cpp | 122 ++++++++-- 4 files changed, 274 insertions(+), 103 deletions(-) diff --git a/system/system_monitor/include/hdd_reader/hdd_reader.hpp b/system/system_monitor/include/hdd_reader/hdd_reader.hpp index 9b34a3ece413b..2762dd8995b00 100644 --- a/system/system_monitor/include/hdd_reader/hdd_reader.hpp +++ b/system/system_monitor/include/hdd_reader/hdd_reader.hpp @@ -28,6 +28,14 @@ #include #include +/** + * @brief Enumeration of Request ID to hdd_reader + */ +enum HDDReaderRequestID { + GetHDDInfo, + UnmountDevice, +}; + /** * @brief HDD device */ @@ -40,9 +48,6 @@ struct HDDDevice total_data_written_attribute_id_; //!< @brief S.M.A.R.T attribute ID of total data written uint8_t recovered_error_attribute_id_; //!< @brief S.M.A.R.T attribute ID of recovered error - uint8_t unmount_request_flag_; //!< @brief unmount request flag - std::string part_device_; //!< @brief partition device - /** * @brief Load or save data members. * @param [inout] ar archive reference to load or save the serialized data members @@ -58,8 +63,6 @@ struct HDDDevice ar & power_on_hours_attribute_id_; ar & total_data_written_attribute_id_; ar & recovered_error_attribute_id_; - ar & unmount_request_flag_; - ar & part_device_; } }; @@ -106,6 +109,27 @@ struct HDDInfo } }; +/** + * @brief unmount device information + */ +struct UnmountDeviceInfo +{ + std::string part_device_; //!< @brief partition device + + /** + * @brief Load or save data members. + * @param [inout] ar archive reference to load or save the serialized data members + * @param [in] version version for the archive + * @note NOLINT syntax is needed since this is an interface to serialization and + * used inside boost serialization. + */ + template + void serialize(archive & ar, const unsigned /*version*/) // NOLINT(runtime/references) + { + ar & part_device_; + } +}; + /** * @brief HDD information list */ diff --git a/system/system_monitor/include/system_monitor/hdd_monitor/hdd_monitor.hpp b/system/system_monitor/include/system_monitor/hdd_monitor/hdd_monitor.hpp index 8eac2c632bf16..6e7d010645fe6 100644 --- a/system/system_monitor/include/system_monitor/hdd_monitor/hdd_monitor.hpp +++ b/system/system_monitor/include/system_monitor/hdd_monitor/hdd_monitor.hpp @@ -313,6 +313,13 @@ class HDDMonitor : public rclcpp::Node */ void updateHDDConnections(); + /** + * @brief unmount device + * @param [in] device device name + * @return result of success or failure + */ + int unmountDevice(std::string & device); + diagnostic_updater::Updater updater_; //!< @brief Updater class which advertises to /diagnostics rclcpp::TimerBase::SharedPtr timer_; //!< @brief timer to get HDD information from HDDReader @@ -322,8 +329,6 @@ class HDDMonitor : public rclcpp::Node std::map hdd_params_; //!< @brief list of error and warning levels std::map hdd_connected_flags_; //!< @brief list of flag whether HDD is connected - std::map - device_unmount_request_flags_; //!< @brief list of flag requesting device unmount std::map initial_recovered_errors_; //!< @brief list of initial recovered error count std::map hdd_stats_; //!< @brief list of HDD statistics diff --git a/system/system_monitor/reader/hdd_reader/hdd_reader.cpp b/system/system_monitor/reader/hdd_reader/hdd_reader.cpp index e07f66e25046e..dc5581b3430f1 100644 --- a/system/system_monitor/reader/hdd_reader/hdd_reader.cpp +++ b/system/system_monitor/reader/hdd_reader/hdd_reader.cpp @@ -407,27 +407,127 @@ int get_nvme_SMARTData(int fd, HDDInfo * info) return EXIT_SUCCESS; } +/** + * @brief get HDD information + * @param [in] boost::archive::text_iarchive object + * @param [out] boost::archive::text_oarchive object + * @return 0 on success, otherwise error + */ +int get_hdd_info(boost::archive::text_iarchive & ia, boost::archive::text_oarchive & oa) +{ + std::vector hdd_devices; + HDDInfoList list; + + try { + ia & hdd_devices; + } catch (const std::exception & e) { + syslog(LOG_ERR, "exception. %s\n", e.what()); + return -1; + } + + for (auto & hdd_device : hdd_devices) { + HDDInfo info{}; + + // Open a file + int fd = open(hdd_device.name_.c_str(), O_RDONLY); + if (fd < 0) { + info.error_code_ = errno; + syslog(LOG_ERR, "Failed to open a file. %s\n", strerror(info.error_code_)); + continue; + } + + // AHCI device + if (boost::starts_with(hdd_device.name_.c_str(), "/dev/sd")) { + // Get IDENTIFY DEVICE for ATA drive + info.error_code_ = get_ata_identify(fd, &info); + if (info.error_code_ != 0) { + syslog( + LOG_ERR, "Failed to get IDENTIFY DEVICE for ATA drive. %s\n", strerror(info.error_code_)); + close(fd); + continue; + } + // Get SMART DATA for ATA drive + info.error_code_ = get_ata_SMARTData(fd, &info, hdd_device); + if (info.error_code_ != 0) { + syslog(LOG_ERR, "Failed to get SMART LOG for ATA drive. %s\n", strerror(info.error_code_)); + close(fd); + continue; + } + } else if (boost::starts_with(hdd_device.name_.c_str(), "/dev/nvme")) { // NVMe device + // Get Identify for NVMe drive + info.error_code_ = get_nvme_identify(fd, &info); + if (info.error_code_ != 0) { + syslog(LOG_ERR, "Failed to get Identify for NVMe drive. %s\n", strerror(info.error_code_)); + close(fd); + continue; + } + // Get SMART / Health Information for NVMe drive + info.error_code_ = get_nvme_SMARTData(fd, &info); + if (info.error_code_ != 0) { + syslog( + LOG_ERR, "Failed to get SMART / Health Information for NVMe drive. %s\n", + strerror(info.error_code_)); + close(fd); + continue; + } + } + + // Close the file descriptor FD + info.error_code_ = close(fd); + if (info.error_code_ < 0) { + info.error_code_ = errno; + syslog(LOG_ERR, "Failed to close the file descriptor FD. %s\n", strerror(info.error_code_)); + } + + list[hdd_device.name_] = info; + } + + oa << list; + return 0; +} + /** * @brief unmount device with lazy option - * @param [in] device device name + * @param [in] boost::archive::text_iarchive object + * @param [out] boost::archive::text_oarchive object + * @return 0 on success, otherwise error */ -void unmount_device_with_lazy(std::string & device) +int unmount_device_with_lazy(boost::archive::text_iarchive & ia, boost::archive::text_oarchive & oa) { - boost::process::ipstream is_out; - boost::process::ipstream is_err; + std::vector unmount_devices; + std::vector responses; + + try { + ia & unmount_devices; + } catch (const std::exception & e) { + syslog(LOG_ERR, "exception. %s\n", e.what()); + return -1; + } + + for (auto & unmount_device : unmount_devices) { + int ret = 0; + boost::process::ipstream is_out; + boost::process::ipstream is_err; - boost::process::child c( - "/bin/sh", "-c", fmt::format("umount -l {}", device.c_str()), boost::process::std_out > is_out, - boost::process::std_err > is_err); - c.wait(); + boost::process::child c( + "/bin/sh", "-c", fmt::format("umount -l {}", unmount_device.part_device_.c_str()), + boost::process::std_out > is_out, boost::process::std_err > is_err); + c.wait(); - if (c.exit_code() != 0) { - syslog(LOG_ERR, "Failed to execute umount command. %s\n", device.c_str()); + if (c.exit_code() != 0) { + syslog( + LOG_ERR, "Failed to execute umount command. %s\n", unmount_device.part_device_.c_str()); + ret = -1; + } + responses.push_back(ret); } + + oa << responses; + return 0; } /** - * @brief check HDD temperature + * @brief hdd_reader main procedure * @param [in] port port to listen */ void run(int port) @@ -500,14 +600,14 @@ void run(int port) return; } - // Restore list of devices - std::vector hdd_devices; + uint8_t request_id; + + buf[sizeof(buf) - 1] = '\0'; + std::istringstream iss(buf); + boost::archive::text_iarchive ia(iss); try { - buf[sizeof(buf) - 1] = '\0'; - std::istringstream iss(buf); - boost::archive::text_iarchive oa(iss); - oa & hdd_devices; + ia & request_id; } catch (const std::exception & e) { syslog(LOG_ERR, "exception. %s\n", e.what()); close(new_sock); @@ -515,76 +615,26 @@ void run(int port) return; } - HDDInfoList list; std::ostringstream oss; boost::archive::text_oarchive oa(oss); - for (auto & hdd_device : hdd_devices) { - if (hdd_device.unmount_request_flag_) { - unmount_device_with_lazy(hdd_device.part_device_); - continue; - } - - HDDInfo info{}; - - // Open a file - int fd = open(hdd_device.name_.c_str(), O_RDONLY); - if (fd < 0) { - info.error_code_ = errno; - syslog(LOG_ERR, "Failed to open a file. %s\n", strerror(info.error_code_)); + switch (request_id) { + case HDDReaderRequestID::GetHDDInfo: + ret = get_hdd_info(ia, oa); + break; + case HDDReaderRequestID::UnmountDevice: + ret = unmount_device_with_lazy(ia, oa); + break; + default: + syslog(LOG_ERR, "Request ID is invalid. %d\n", request_id); continue; - } - - // AHCI device - if (boost::starts_with(hdd_device.name_.c_str(), "/dev/sd")) { - // Get IDENTIFY DEVICE for ATA drive - info.error_code_ = get_ata_identify(fd, &info); - if (info.error_code_ != 0) { - syslog( - LOG_ERR, "Failed to get IDENTIFY DEVICE for ATA drive. %s\n", - strerror(info.error_code_)); - close(fd); - continue; - } - // Get SMART DATA for ATA drive - info.error_code_ = get_ata_SMARTData(fd, &info, hdd_device); - if (info.error_code_ != 0) { - syslog( - LOG_ERR, "Failed to get SMART LOG for ATA drive. %s\n", strerror(info.error_code_)); - close(fd); - continue; - } - } else if (boost::starts_with(hdd_device.name_.c_str(), "/dev/nvme")) { // NVMe device - // Get Identify for NVMe drive - info.error_code_ = get_nvme_identify(fd, &info); - if (info.error_code_ != 0) { - syslog( - LOG_ERR, "Failed to get Identify for NVMe drive. %s\n", strerror(info.error_code_)); - close(fd); - continue; - } - // Get SMART / Health Information for NVMe drive - info.error_code_ = get_nvme_SMARTData(fd, &info); - if (info.error_code_ != 0) { - syslog( - LOG_ERR, "Failed to get SMART / Health Information for NVMe drive. %s\n", - strerror(info.error_code_)); - close(fd); - continue; - } - } - - // Close the file descriptor FD - info.error_code_ = close(fd); - if (info.error_code_ < 0) { - info.error_code_ = errno; - syslog(LOG_ERR, "Failed to close the file descriptor FD. %s\n", strerror(info.error_code_)); - } - - list[hdd_device.name_] = info; + } + if (ret != 0) { + close(new_sock); + close(sock); + return; } - oa << list; // Write N bytes of BUF to FD ret = write(new_sock, oss.str().c_str(), oss.str().length()); if (ret < 0) { diff --git a/system/system_monitor/src/hdd_monitor/hdd_monitor.cpp b/system/system_monitor/src/hdd_monitor/hdd_monitor.cpp index 624635028190a..85a3ba7742cab 100644 --- a/system/system_monitor/src/hdd_monitor/hdd_monitor.cpp +++ b/system/system_monitor/src/hdd_monitor/hdd_monitor.cpp @@ -624,29 +624,26 @@ void HDDMonitor::updateHDDInfoList() return; } + uint8_t request_id = HDDReaderRequestID::GetHDDInfo; std::vector hdd_devices; for (auto itr = hdd_params_.begin(); itr != hdd_params_.end(); ++itr) { - HDDDevice device; - - if (hdd_connected_flags_[itr->first]) { - device.name_ = itr->second.disk_device_; - device.temp_attribute_id_ = itr->second.temp_attribute_id_; - device.power_on_hours_attribute_id_ = itr->second.power_on_hours_attribute_id_; - device.total_data_written_attribute_id_ = itr->second.total_data_written_attribute_id_; - device.recovered_error_attribute_id_ = itr->second.recovered_error_attribute_id_; - device.unmount_request_flag_ = 0; - } else if (device_unmount_request_flags_[itr->first]) { - device.part_device_ = itr->second.part_device_; - device.unmount_request_flag_ = 1; - } else { + if (!hdd_connected_flags_[itr->first]) { continue; } + HDDDevice device; + device.name_ = itr->second.disk_device_; + device.temp_attribute_id_ = itr->second.temp_attribute_id_; + device.power_on_hours_attribute_id_ = itr->second.power_on_hours_attribute_id_; + device.total_data_written_attribute_id_ = itr->second.total_data_written_attribute_id_; + device.recovered_error_attribute_id_ = itr->second.recovered_error_attribute_id_; + hdd_devices.push_back(device); } std::ostringstream oss; boost::archive::text_oarchive oa(oss); + oa & request_id; oa & hdd_devices; // Write list of devices to FD @@ -797,7 +794,6 @@ void HDDMonitor::updateHDDConnections() { for (auto & hdd_param : hdd_params_) { hdd_connected_flags_[hdd_param.first] = false; - device_unmount_request_flags_[hdd_param.first] = false; // Get device name from mount point hdd_param.second.part_device_ = getDeviceFromMountPoint(hdd_param.first); @@ -824,11 +820,107 @@ void HDDMonitor::updateHDDConnections() } else { // Deal with the issue that file system remains mounted when a drive is actually // disconnected. - device_unmount_request_flags_[hdd_param.first] = true; + if (unmountDevice(hdd_param.second.part_device_)) { + RCLCPP_ERROR( + get_logger(), "Failed to unmount device : %s", hdd_param.second.part_device_.c_str()); + } } } } } +int HDDMonitor::unmountDevice(std::string & device) +{ + // Create a new socket + int sock = socket(AF_INET, SOCK_STREAM, 0); + if (sock < 0) { + RCLCPP_ERROR(get_logger(), "socket create error. %s", strerror(errno)); + return -1; + } + + // Specify the receiving timeouts until reporting an error + struct timeval tv; + tv.tv_sec = 10; + tv.tv_usec = 0; + int ret = setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)); + if (ret < 0) { + RCLCPP_ERROR(get_logger(), "setsockopt error. %s", strerror(errno)); + close(sock); + return -1; + } + + // Connect the socket referred to by the file descriptor + sockaddr_in addr; + memset(&addr, 0, sizeof(sockaddr_in)); + addr.sin_family = AF_INET; + addr.sin_port = htons(hdd_reader_port_); + addr.sin_addr.s_addr = htonl(INADDR_ANY); + ret = connect(sock, (struct sockaddr *)&addr, sizeof(addr)); + if (ret < 0) { + RCLCPP_ERROR(get_logger(), "socket connect error. %s", strerror(errno)); + close(sock); + return -1; + } + + uint8_t request_id = HDDReaderRequestID::UnmountDevice; + std::vector umount_dev_infos; + UnmountDeviceInfo dev_info; + + dev_info.part_device_ = device; + umount_dev_infos.push_back(dev_info); + + std::ostringstream oss; + boost::archive::text_oarchive oa(oss); + oa & request_id; + oa & umount_dev_infos; + + // Write list of devices to FD + ret = write(sock, oss.str().c_str(), oss.str().length()); + if (ret < 0) { + RCLCPP_ERROR(get_logger(), "socket write error. %s", strerror(errno)); + close(sock); + return -1; + } + + // Receive messages from a socket + char buf[1024] = ""; + ret = recv(sock, buf, sizeof(buf) - 1, 0); + if (ret < 0) { + RCLCPP_ERROR(get_logger(), "socket recv error. %s", strerror(errno)); + close(sock); + return -1; + } + // No data received + if (ret == 0) { + RCLCPP_ERROR(get_logger(), "no data received from hdd_reader."); + close(sock); + return -1; + } + + // Close the file descriptor FD + ret = close(sock); + if (ret < 0) { + RCLCPP_ERROR(get_logger(), "socket close error. %s", strerror(errno)); + return -1; + } + + std::vector responses; + + // Restore responses + try { + std::istringstream iss(buf); + boost::archive::text_iarchive ia(iss); + ia >> responses; + } catch (const std::exception & e) { + RCLCPP_ERROR(get_logger(), "restore responses exception. %s", e.what()); + return -1; + } + if (responses.empty()) { + RCLCPP_ERROR(get_logger(), "responses from hdd_reader is empty."); + return -1; + } + return responses[0]; +} + #include RCLCPP_COMPONENTS_REGISTER_NODE(HDDMonitor)