diff --git a/launch/tier4_system_launch/config/system_monitor/hdd_monitor.param.yaml b/launch/tier4_system_launch/config/system_monitor/hdd_monitor.param.yaml index 77a23eb0f9aa8..04dd3afd09280 100644 --- a/launch/tier4_system_launch/config/system_monitor/hdd_monitor.param.yaml +++ b/launch/tier4_system_launch/config/system_monitor/hdd_monitor.param.yaml @@ -5,10 +5,19 @@ disks: # Until multi type lists are allowed, name N the disks as disk0...disk{N-1} disk0: name: /dev/sda3 + temp_attribute_id: 0xC2 temp_warn: 55.0 temp_error: 70.0 + power_on_hours_attribute_id: 0x09 power_on_hours_warn: 3000000 + total_data_written_attribute_id: 0xF1 total_data_written_warn: 4915200 # =150TB (1unit=32MB) total_data_written_safety_factor: 0.05 + recovered_error_attribute_id: 0xC3 + recovered_error_warn: 1 free_warn: 5120 # MB(8hour) free_error: 100 # MB(last 1 minute) + read_data_rate_warn: 360.0 # MB/s + write_data_rate_warn: 103.5 # MB/s + read_iops_warn: 63360.0 # IOPS + write_iops_warn: 24120.0 # IOPS diff --git a/system/system_error_monitor/config/diagnostic_aggregator/system.param.yaml b/system/system_error_monitor/config/diagnostic_aggregator/system.param.yaml index 8cf97cc95553c..bd39a73361b7c 100644 --- a/system/system_error_monitor/config/diagnostic_aggregator/system.param.yaml +++ b/system/system_error_monitor/config/diagnostic_aggregator/system.param.yaml @@ -150,6 +150,36 @@ contains: [": HDD Temperature"] timeout: 3.0 + recovered_error: + type: diagnostic_aggregator/GenericAnalyzer + path: recovered_error + contains: [": HDD RecoveredError"] + timeout: 3.0 + + read_data_rate: + type: diagnostic_aggregator/GenericAnalyzer + path: read_data_rate + contains: [": HDD ReadDataRate"] + timeout: 3.0 + + write_data_rate: + type: diagnostic_aggregator/GenericAnalyzer + path: write_data_rate + contains: [": HDD WriteDataRate"] + timeout: 3.0 + + read_iops: + type: diagnostic_aggregator/GenericAnalyzer + path: read_iops + contains: [": HDD ReadIOPS"] + timeout: 3.0 + + write_iops: + type: diagnostic_aggregator/GenericAnalyzer + path: write_iops + contains: [": HDD WriteIOPS"] + timeout: 3.0 + usage: type: diagnostic_aggregator/GenericAnalyzer path: usage @@ -168,6 +198,12 @@ contains: [": HDD TotalDataWritten"] timeout: 3.0 + connection: + type: diagnostic_aggregator/GenericAnalyzer + path: connection + contains: [": HDD Connection"] + timeout: 3.0 + process: type: diagnostic_aggregator/AnalyzerGroup path: process diff --git a/system/system_monitor/README.md b/system/system_monitor/README.md index 5c786e9762875..c30709d5cffea 100644 --- a/system/system_monitor/README.md +++ b/system/system_monitor/README.md @@ -63,7 +63,13 @@ Every topic is published in 1 minute interval. | HDD Monitor | HDD Temperature | ✓ | ✓ | ✓ | | | | HDD PowerOnHours | ✓ | ✓ | ✓ | | | | HDD TotalDataWritten | ✓ | ✓ | ✓ | | +| | HDD RecoveredError | ✓ | ✓ | ✓ | | | | HDD Usage | ✓ | ✓ | ✓ | | +| | HDD ReadDataRate | ✓ | ✓ | ✓ | | +| | HDD WriteDataRate | ✓ | ✓ | ✓ | | +| | HDD ReadIOPS | ✓ | ✓ | ✓ | | +| | HDD WriteIOPS | ✓ | ✓ | ✓ | | +| | HDD Connection | ✓ | ✓ | ✓ | | | Memory Monitor | Memory Usage | ✓ | ✓ | ✓ | | | Net Monitor | Network Usage | ✓ | ✓ | ✓ | | | | Network CRC Error | ✓ | ✓ | ✓ | Warning occurs when the number of CRC errors in the period reaches the threshold value. The number of CRC errors that occur is the same as the value that can be confirmed with the ip command. | diff --git a/system/system_monitor/config/hdd_monitor.param.yaml b/system/system_monitor/config/hdd_monitor.param.yaml index 70f8dc5ffa13f..d818d848be801 100644 --- a/system/system_monitor/config/hdd_monitor.param.yaml +++ b/system/system_monitor/config/hdd_monitor.param.yaml @@ -5,10 +5,19 @@ disks: # Until multi type lists are allowed, name N the disks as disk0...disk{N-1} disk0: name: / + temp_attribute_id: 0xC2 temp_warn: 55.0 temp_error: 70.0 + power_on_hours_attribute_id: 0x09 power_on_hours_warn: 3000000 + total_data_written_attribute_id: 0xF1 total_data_written_warn: 4915200 # =150TB (1unit=32MB) total_data_written_safety_factor: 0.05 + recovered_error_attribute_id: 0xC3 + recovered_error_warn: 1 free_warn: 5120 # MB(8hour) free_error: 100 # MB(last 1 minute) + read_data_rate_warn: 360.0 # MB/s + write_data_rate_warn: 103.5 # MB/s + read_iops_warn: 63360.0 # IOPS + write_iops_warn: 24120.0 # IOPS diff --git a/system/system_monitor/docs/ros_parameters.md b/system/system_monitor/docs/ros_parameters.md index 64ebe5e6a64e3..a5bb41cb7c769 100644 --- a/system/system_monitor/docs/ros_parameters.md +++ b/system/system_monitor/docs/ros_parameters.md @@ -25,12 +25,20 @@ hdd_monitor: | Name | Type | Unit | Default | Notes | | :------------------------------- | :----: | :---------------: | :-----: | :--------------------------------------------------------------------------------- | | name | string | n/a | none | The disk name to monitor temperature. (e.g. /dev/sda) | +| temp_attribute_id | int | n/a | 0xC2 | S.M.A.R.T attribute ID of temperature. | | temp_warn | float | DegC | 55.0 | Generates warning when HDD temperature reaches a specified value or higher. | | temp_error | float | DegC | 70.0 | Generates error when HDD temperature reaches a specified value or higher. | +| power_on_hours_attribute_id | int | n/a | 0x09 | S.M.A.R.T attribute ID of power-on hours. | | power_on_hours_warn | int | Hour | 3000000 | Generates warning when HDD power-on hours reaches a specified value or higher. | | total_data_written_attribute_id | int | n/a | 0xF1 | S.M.A.R.T attribute ID of total data written. | | total_data_written_warn | int | depends on device | 4915200 | Generates warning when HDD total data written reaches a specified value or higher. | | total_data_written_safety_factor | int | %(1e-2) | 0.05 | Safety factor of HDD total data written. | +| recovered_error_attribute_id | int | n/a | 0xC3 | S.M.A.R.T attribute ID of recovered error. | +| recovered_error_warn | int | n/a | 1 | Generates warning when HDD recovered error reaches a specified value or higher. | +| read_data_rate_warn | float | MB/s | 360.0 | Generates warning when HDD read data rate reaches a specified value or higher. | +| write_data_rate_warn | float | MB/s | 103.5 | Generates warning when HDD write data rate reaches a specified value or higher. | +| read_iops_warn | float | IOPS | 63360.0 | Generates warning when HDD read IOPS reaches a specified value or higher. | +| write_iops_warn | float | IOPS | 24120.0 | Generates warning when HDD write IOPS reaches a specified value or higher. | hdd_monitor: diff --git a/system/system_monitor/docs/topics_hdd_monitor.md b/system/system_monitor/docs/topics_hdd_monitor.md index 4ef37b6ae5207..31301177f38f1 100644 --- a/system/system_monitor/docs/topics_hdd_monitor.md +++ b/system/system_monitor/docs/topics_hdd_monitor.md @@ -14,13 +14,13 @@ [values] -| key | value (example) | -| ---------------------- | -------------------------- | -| HDD [0-9]: status | OK / hot / critical hot | -| HDD [0-9]: name | /dev/nvme0 | -| HDD [0-9]: model | SAMSUNG MZVLB1T0HBLR-000L7 | -| HDD [0-9]: serial | S4EMNF0M820682 | -| HDD [0-9]: temperature | 37.0 DegC | +| key | value (example) | +| ---------------------- | ---------------------------- | +| HDD [0-9]: status | OK / hot / critical hot | +| HDD [0-9]: name | /dev/nvme0 | +| HDD [0-9]: model | SAMSUNG MZVLB1T0HBLR-000L7 | +| HDD [0-9]: serial | S4EMNF0M820682 | +| HDD [0-9]: temperature | 37.0 DegC
not available | ## HDD PowerOnHours @@ -35,13 +35,13 @@ [values] -| key | value (example) | -| ------------------------- | ----------------------- | -| HDD [0-9]: status | OK / lifetime limit | -| HDD [0-9]: name | /dev/nvme0 | -| HDD [0-9]: model | PHISON PS5012-E12S-512G | -| HDD [0-9]: serial | FB590709182505050767 | -| HDD [0-9]: power on hours | 4834 Hours | +| key | value (example) | +| ------------------------- | ----------------------------- | +| HDD [0-9]: status | OK / lifetime limit | +| HDD [0-9]: name | /dev/nvme0 | +| HDD [0-9]: model | PHISON PS5012-E12S-512G | +| HDD [0-9]: serial | FB590709182505050767 | +| HDD [0-9]: power on hours | 4834 Hours
not available | ## HDD TotalDataWritten @@ -64,6 +64,27 @@ | HDD [0-9]: serial | FB590709182505050767 | | HDD [0-9]: total data written | 146295330
not available | +## HDD RecoveredError + +/diagnostics/hdd_monitor: HDD RecoveredError + +[summary] + +| level | message | +| ----- | -------------------- | +| OK | OK | +| WARN | high soft error rate | + +[values] + +| key | value (example) | +| -------------------------- | ------------------------- | +| HDD [0-9]: status | OK / high soft error rate | +| HDD [0-9]: name | /dev/nvme0 | +| HDD [0-9]: model | PHISON PS5012-E12S-512G | +| HDD [0-9]: serial | FB590709182505050767 | +| HDD [0-9]: recovered error | 0
not available | + ## HDD Usage /diagnostics/hdd_monitor: HDD Usage @@ -87,3 +108,98 @@ | HDD [0-9]: avail | 749G | | HDD [0-9]: use | 69% | | HDD [0-9]: mounted on | / | + +## HDD ReadDataRate + +/diagnostics/hdd_monitor: HDD ReadDataRate + +[summary] + +| level | message | +| ----- | ---------------------- | +| OK | OK | +| WARN | high data rate of read | + +[values] + +| key | value (example) | +| ---------------------------- | --------------------------- | +| HDD [0-9]: status | OK / high data rate of read | +| HDD [0-9]: name | /dev/nvme0 | +| HDD [0-9]: data rate of read | 0.00 MB/s | + +## HDD WriteDataRate + +/diagnostics/hdd_monitor: HDD WriteDataRate + +[summary] + +| level | message | +| ----- | ----------------------- | +| OK | OK | +| WARN | high data rate of write | + +[values] + +| key | value (example) | +| ----------------------------- | ---------------------------- | +| HDD [0-9]: status | OK / high data rate of write | +| HDD [0-9]: name | /dev/nvme0 | +| HDD [0-9]: data rate of write | 0.00 MB/s | + +## HDD ReadIOPS + +/diagnostics/hdd_monitor: HDD ReadIOPS + +[summary] + +| level | message | +| ----- | ----------------- | +| OK | OK | +| WARN | high IOPS of read | + +[values] + +| key | value (example) | +| ----------------------- | ---------------------- | +| HDD [0-9]: status | OK / high IOPS of read | +| HDD [0-9]: name | /dev/nvme0 | +| HDD [0-9]: IOPS of read | 0.00 IOPS | + +## HDD WriteIOPS + +/diagnostics/hdd_monitor: HDD WriteIOPS + +[summary] + +| level | message | +| ----- | ------------------ | +| OK | OK | +| WARN | high IOPS of write | + +[values] + +| key | value (example) | +| ------------------------ | ----------------------- | +| HDD [0-9]: status | OK / high IOPS of write | +| HDD [0-9]: name | /dev/nvme0 | +| HDD [0-9]: IOPS of write | 0.00 IOPS | + +## HDD Connection + +/diagnostics/hdd_monitor: HDD Connection + +[summary] + +| level | message | +| ----- | ------------- | +| OK | OK | +| WARN | not connected | + +[values] + +| key | value (example) | +| ---------------------- | ------------------ | +| HDD [0-9]: status | OK / not connected | +| HDD [0-9]: name | /dev/nvme0 | +| HDD [0-9]: mount point | / | diff --git a/system/system_monitor/include/hdd_reader/hdd_reader.hpp b/system/system_monitor/include/hdd_reader/hdd_reader.hpp index 44c52a491b9fe..2762dd8995b00 100644 --- a/system/system_monitor/include/hdd_reader/hdd_reader.hpp +++ b/system/system_monitor/include/hdd_reader/hdd_reader.hpp @@ -29,18 +29,24 @@ #include /** - * @brief ATA attribute IDs + * @brief Enumeration of Request ID to hdd_reader */ -enum class ATAAttributeIDs : uint8_t { TEMPERATURE = 0, POWER_ON_HOURS = 1, SIZE }; +enum HDDReaderRequestID { + GetHDDInfo, + UnmountDevice, +}; /** * @brief HDD device */ struct HDDDevice { - std::string name_; //!< @brief Device name + std::string name_; //!< @brief Device name + uint8_t temp_attribute_id_; //!< @brief S.M.A.R.T attribute ID of temperature + uint8_t power_on_hours_attribute_id_; //!< @brief S.M.A.R.T attribute ID of power on hours uint8_t - total_data_written_attribute_id_; //!< @brief S.M.A.R.T attribute ID of total data written + total_data_written_attribute_id_; //!< @brief S.M.A.R.T attribute ID of total data written + uint8_t recovered_error_attribute_id_; //!< @brief S.M.A.R.T attribute ID of recovered error /** * @brief Load or save data members. @@ -53,7 +59,10 @@ struct HDDDevice void serialize(archive & ar, const unsigned /*version*/) // NOLINT(runtime/references) { ar & name_; + ar & temp_attribute_id_; + ar & power_on_hours_attribute_id_; ar & total_data_written_attribute_id_; + ar & recovered_error_attribute_id_; } }; @@ -70,7 +79,11 @@ struct HDDInfo // in S.M.A.R.T. information. uint64_t power_on_hours_; //!< @brief power on hours count uint64_t total_data_written_; //!< @brief total data written + uint32_t recovered_error_; //!< @brief recovered error count + bool is_valid_temp_; //!< @brief whether temp_ is valid value + bool is_valid_power_on_hours_; //!< @brief whether power_on_hours_ is valid value bool is_valid_total_data_written_; //!< @brief whether total_data_written_ is valid value + bool is_valid_recovered_error_; //!< @brief whether recovered_error_ is valid value /** * @brief Load or save data members. @@ -88,7 +101,32 @@ struct HDDInfo ar & temp_; ar & power_on_hours_; ar & total_data_written_; + ar & recovered_error_; + ar & is_valid_temp_; + ar & is_valid_power_on_hours_; ar & is_valid_total_data_written_; + ar & is_valid_recovered_error_; + } +}; + +/** + * @brief unmount device information + */ +struct UnmountDeviceInfo +{ + std::string part_device_; //!< @brief partition device + + /** + * @brief Load or save data members. + * @param [inout] ar archive reference to load or save the serialized data members + * @param [in] version version for the archive + * @note NOLINT syntax is needed since this is an interface to serialization and + * used inside boost serialization. + */ + template + void serialize(archive & ar, const unsigned /*version*/) // NOLINT(runtime/references) + { + ar & part_device_; } }; diff --git a/system/system_monitor/include/system_monitor/hdd_monitor/hdd_monitor.hpp b/system/system_monitor/include/system_monitor/hdd_monitor/hdd_monitor.hpp index c739f466e95e0..6e7d010645fe6 100644 --- a/system/system_monitor/include/system_monitor/hdd_monitor/hdd_monitor.hpp +++ b/system/system_monitor/include/system_monitor/hdd_monitor/hdd_monitor.hpp @@ -33,14 +33,25 @@ */ struct HDDParam { - std::string device_; //!< @brief device + std::string part_device_; //!< @brief partition device + std::string disk_device_; //!< @brief disk device float temp_warn_; //!< @brief HDD temperature(DegC) to generate warning float temp_error_; //!< @brief HDD temperature(DegC) to generate error int power_on_hours_warn_; //!< @brief HDD power on hours to generate warning uint64_t total_data_written_warn_; //!< @brief HDD total data written to generate warning float total_data_written_safety_factor_; //!< @brief safety factor of HDD total data written - int free_warn_; //!< @brief HDD free space(MB) to generate warning - int free_error_; //!< @brief HDD free space(MB) to generate error + int recovered_error_warn_; //!< @brief HDD recovered error count to generate warning + int free_warn_; //!< @brief HDD free space(MB) to generate warning + int free_error_; //!< @brief HDD free space(MB) to generate error + float read_data_rate_warn_; //!< @brief HDD data rate(MB/s) of read to generate warning + float write_data_rate_warn_; //!< @brief HDD data rate(MB/s) of write to generate warning + float read_iops_warn_; //!< @brief HDD IOPS of read to generate warning + float write_iops_warn_; //!< @brief HDD IOPS of write to generate warning + uint8_t temp_attribute_id_; //!< @brief S.M.A.R.T attribute ID of temperature + uint8_t power_on_hours_attribute_id_; //!< @brief S.M.A.R.T attribute ID of power on hours + uint8_t + total_data_written_attribute_id_; //!< @brief S.M.A.R.T attribute ID of total data written + uint8_t recovered_error_attribute_id_; //!< @brief S.M.A.R.T attribute ID of recovered error HDDParam() : temp_warn_(55.0), @@ -48,8 +59,48 @@ struct HDDParam power_on_hours_warn_(3000000), total_data_written_warn_(4915200), total_data_written_safety_factor_(0.05), + recovered_error_warn_(1), free_warn_(5120), - free_error_(100) + free_error_(100), + read_data_rate_warn_(360.0), + write_data_rate_warn_(103.5), + read_iops_warn_(63360.0), + write_iops_warn_(24120.0), + temp_attribute_id_(0xC2), + power_on_hours_attribute_id_(0x09), + total_data_written_attribute_id_(0xF1), + recovered_error_attribute_id_(0xC3) + { + } +}; + +/** + * @brief statistics of sysfs device + */ +struct SysfsDevStat +{ + uint64_t rd_ios_; //!< @brief number of read operations issued to the device + uint64_t rd_sectors_; //!< @brief number of sectors read + uint64_t wr_ios_; //!< @brief number of write operations issued to the device + uint64_t wr_sectors_; //!< @brief number of sectors written + + SysfsDevStat() : rd_ios_(0), rd_sectors_(0), wr_ios_(0), wr_sectors_(0) {} +}; + +/** + * @brief statistics of HDD + */ +struct HDDStat +{ + std::string device_; //!< @brief device + std::string error_str_; //!< @brief error string + float read_data_rate_MBs_; //!< @brief data rate of read (MB/s) + float write_data_rate_MBs_; //!< @brief data rate of write (MB/s) + float read_iops_; //!< @brief IOPS of read + float write_iops_; //!< @brief IOPS of write + SysfsDevStat last_sfdevstat_; //!< @brief last statistics of sysfs device + + HDDStat() : read_data_rate_MBs_(0.0), write_data_rate_MBs_(0.0), read_iops_(0.0), write_iops_(0.0) { } }; @@ -61,6 +112,18 @@ enum class HDDSMARTInfoItem : uint32_t { TEMPERATURE = 0, POWER_ON_HOURS = 1, TOTAL_DATA_WRITTEN = 2, + RECOVERED_ERROR = 3, + SIZE +}; + +/** + * @brief HDD statistics items to check + */ +enum class HDDStatItem : uint32_t { + READ_DATA_RATE = 0, + WRITE_DATA_RATE = 1, + READ_IOPS = 2, + WRITE_IOPS = 3, SIZE }; @@ -103,9 +166,19 @@ class HDDMonitor : public rclcpp::Node void checkSMARTTotalDataWritten( diagnostic_updater::DiagnosticStatusWrapper & stat); // NOLINT(runtime/references) + /** + * @brief check HDD recovered error count + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference + * to pass diagnostic message updated in this function to diagnostic publish calls. + */ + void checkSMARTRecoveredError( + diagnostic_updater::DiagnosticStatusWrapper & stat); // NOLINT(runtime/references) + /** * @brief check S.M.A.R.T. information * @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @param [in] item S.M.A.R.T information item to be checked * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference * to pass diagnostic message updated in this function to diagnostic publish calls. */ @@ -122,6 +195,62 @@ class HDDMonitor : public rclcpp::Node void checkUsage( diagnostic_updater::DiagnosticStatusWrapper & stat); // NOLINT(runtime/references) + /** + * @brief check HDD data rate of read + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference + * to pass diagnostic message updated in this function to diagnostic publish calls. + */ + void checkReadDataRate( + diagnostic_updater::DiagnosticStatusWrapper & stat); // NOLINT(runtime/references) + + /** + * @brief check HDD data rate of write + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference + * to pass diagnostic message updated in this function to diagnostic publish calls. + */ + void checkWriteDataRate( + diagnostic_updater::DiagnosticStatusWrapper & stat); // NOLINT(runtime/references) + + /** + * @brief check HDD IOPS of read + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference + * to pass diagnostic message updated in this function to diagnostic publish calls. + */ + void checkReadIOPS( + diagnostic_updater::DiagnosticStatusWrapper & stat); // NOLINT(runtime/references) + + /** + * @brief check HDD IOPS of write + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference + * to pass diagnostic message updated in this function to diagnostic publish calls. + */ + void checkWriteIOPS( + diagnostic_updater::DiagnosticStatusWrapper & stat); // NOLINT(runtime/references) + + /** + * @brief check HDD statistics + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @param [in] item statistic item to be checked + * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference + * to pass diagnostic message updated in this function to diagnostic publish calls. + */ + void checkStatistics( + diagnostic_updater::DiagnosticStatusWrapper & stat, + HDDStatItem item); // NOLINT(runtime/references) + + /** + * @brief check HDD connection + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference + * to pass diagnostic message updated in this function to diagnostic publish calls. + */ + void checkConnection( + diagnostic_updater::DiagnosticStatusWrapper & stat); // NOLINT(runtime/references) + /** * @brief human readable size string to MB * @param [in] human readable size string @@ -151,6 +280,46 @@ class HDDMonitor : public rclcpp::Node */ void updateHDDInfoList(); + /** + * @brief start HDD transfer measurement + */ + void startHDDTransferMeasurement(); + + /** + * @brief update HDD statistics + */ + void updateHDDStatistics(); + + /** + * @brief get increment value of sysfs device stats per second + * @param [in] cur_val current value + * @param [in] last_val last value + * @param [in] duration_sec duration in seconds + * @return increment value + */ + double getIncreaseSysfsDeviceStatValuePerSec( + uint64_t cur_val, uint64_t last_val, double duration_sec); + + /** + * @brief read stats for current whole device using /sys/block/ directory + * @param [in] device device name + * @param [out] sfdevstat statistics of sysfs device + * @return result of success or failure + */ + int readSysfsDeviceStat(const std::string & device, SysfsDevStat & sfdevstat); + + /** + * @brief update HDD connections + */ + void updateHDDConnections(); + + /** + * @brief unmount device + * @param [in] device device name + * @return result of success or failure + */ + int unmountDevice(std::string & device); + diagnostic_updater::Updater updater_; //!< @brief Updater class which advertises to /diagnostics rclcpp::TimerBase::SharedPtr timer_; //!< @brief timer to get HDD information from HDDReader @@ -158,10 +327,15 @@ class HDDMonitor : public rclcpp::Node int hdd_reader_port_; //!< @brief port number to connect to hdd_reader std::map hdd_params_; //!< @brief list of error and warning levels - std::vector hdd_devices_; //!< @brief list of devices + std::map + hdd_connected_flags_; //!< @brief list of flag whether HDD is connected + std::map + initial_recovered_errors_; //!< @brief list of initial recovered error count + std::map hdd_stats_; //!< @brief list of HDD statistics //!< @brief diagnostic of connection diagnostic_updater::DiagnosticStatusWrapper connect_diag_; - HDDInfoList hdd_info_list_; //!< @brief list of HDD information + HDDInfoList hdd_info_list_; //!< @brief list of HDD information + rclcpp::Time last_hdd_stat_update_time_; //!< @brief last HDD statistics update time /** * @brief HDD SMART status messages @@ -173,6 +347,10 @@ class HDDMonitor : public rclcpp::Node {{DiagStatus::OK, "OK"}, {DiagStatus::WARN, "lifetime limit"}, {DiagStatus::ERROR, "unused"}}, // total data written {{DiagStatus::OK, "OK"}, {DiagStatus::WARN, "warranty period"}, {DiagStatus::ERROR, "unused"}}, + // recovered error count + {{DiagStatus::OK, "OK"}, + {DiagStatus::WARN, "high soft error rate"}, + {DiagStatus::ERROR, "unused"}}, }; /** @@ -182,6 +360,34 @@ class HDDMonitor : public rclcpp::Node {DiagStatus::OK, "OK"}, {DiagStatus::WARN, "low disk space"}, {DiagStatus::ERROR, "very low disk space"}}; + + /** + * @brief HDD statistics status messages + */ + const std::map stat_dicts_[static_cast(HDDStatItem::SIZE)] = { + // data rate of read + {{DiagStatus::OK, "OK"}, + {DiagStatus::WARN, "high data rate of read"}, + {DiagStatus::ERROR, "unused"}}, + // data rate of write + {{DiagStatus::OK, "OK"}, + {DiagStatus::WARN, "high data rate of write"}, + {DiagStatus::ERROR, "unused"}}, + // IOPS of read + {{DiagStatus::OK, "OK"}, + {DiagStatus::WARN, "high IOPS of read"}, + {DiagStatus::ERROR, "unused"}}, + // IOPS of write + {{DiagStatus::OK, "OK"}, + {DiagStatus::WARN, "high IOPS of write"}, + {DiagStatus::ERROR, "unused"}}, + }; + + /** + * @brief HDD connection status messages + */ + const std::map connection_dict_ = { + {DiagStatus::OK, "OK"}, {DiagStatus::WARN, "not connected"}, {DiagStatus::ERROR, "unused"}}; }; #endif // SYSTEM_MONITOR__HDD_MONITOR__HDD_MONITOR_HPP_ diff --git a/system/system_monitor/reader/hdd_reader/hdd_reader.cpp b/system/system_monitor/reader/hdd_reader/hdd_reader.cpp index f31efde2f5e01..dc5581b3430f1 100644 --- a/system/system_monitor/reader/hdd_reader/hdd_reader.cpp +++ b/system/system_monitor/reader/hdd_reader/hdd_reader.cpp @@ -24,9 +24,11 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -279,16 +281,21 @@ int get_ata_SMARTData(int fd, HDDInfo * info, const HDDDevice & device) return errno; } - std::bitset(ATAAttributeIDs::SIZE)> found_flag; + info->is_valid_temp_ = false; + info->is_valid_power_on_hours_ = false; info->is_valid_total_data_written_ = false; + info->is_valid_recovered_error_ = false; // Retrieve S.M.A.R.T. Informations for (int i = 0; i < 30; ++i) { - if (data.attribute_entry_[i].attribute_id_ == 0xC2) { // Temperature - Device Internal + if (data.attribute_entry_[i].attribute_id_ == device.temp_attribute_id_) { // Temperature - + // Device Internal info->temp_ = static_cast(data.attribute_entry_[i].data_); - found_flag.set(static_cast(ATAAttributeIDs::TEMPERATURE)); - } else if (data.attribute_entry_[i].attribute_id_ == 0x09) { // Power-on Hours Count + info->is_valid_temp_ = true; + } else if ( + data.attribute_entry_[i].attribute_id_ == + device.power_on_hours_attribute_id_) { // Power-on Hours Count info->power_on_hours_ = data.attribute_entry_[i].data_; - found_flag.set(static_cast(ATAAttributeIDs::POWER_ON_HOURS)); + info->is_valid_power_on_hours_ = true; } else if ( data.attribute_entry_[i].attribute_id_ == device.total_data_written_attribute_id_) { // Total LBAs Written @@ -296,14 +303,15 @@ int get_ata_SMARTData(int fd, HDDInfo * info, const HDDDevice & device) (data.attribute_entry_[i].data_ | (static_cast(data.attribute_entry_[i].attribute_specific_) << 32)); info->is_valid_total_data_written_ = true; + } else if ( + data.attribute_entry_[i].attribute_id_ == + device.recovered_error_attribute_id_) { // Hardware ECC Recovered + info->recovered_error_ = data.attribute_entry_[i].data_; + info->is_valid_recovered_error_ = true; } } - if (found_flag.all()) { - return EXIT_SUCCESS; - } - - return ENOENT; + return EXIT_SUCCESS; } /** @@ -376,6 +384,7 @@ int get_nvme_SMARTData(int fd, HDDInfo * info) // Bytes 2:1 Composite Temperature // Convert kelvin to celsius unsigned int temperature = ((data[2] << 8) | data[1]) - 273; + info->is_valid_temp_ = true; info->temp_ = static_cast(temperature); // Bytes 63:48 Data Units Written @@ -389,13 +398,136 @@ int get_nvme_SMARTData(int fd, HDDInfo * info) info->total_data_written_ = *(reinterpret_cast(&data[48])); // Bytes 143:128 Power On Hours + info->is_valid_power_on_hours_ = true; info->power_on_hours_ = *(reinterpret_cast(&data[128])); + // NVMe S.M.A.R.T has no information of recovered error count + info->is_valid_recovered_error_ = false; + return EXIT_SUCCESS; } /** - * @brief check HDD temperature + * @brief get HDD information + * @param [in] boost::archive::text_iarchive object + * @param [out] boost::archive::text_oarchive object + * @return 0 on success, otherwise error + */ +int get_hdd_info(boost::archive::text_iarchive & ia, boost::archive::text_oarchive & oa) +{ + std::vector hdd_devices; + HDDInfoList list; + + try { + ia & hdd_devices; + } catch (const std::exception & e) { + syslog(LOG_ERR, "exception. %s\n", e.what()); + return -1; + } + + for (auto & hdd_device : hdd_devices) { + HDDInfo info{}; + + // Open a file + int fd = open(hdd_device.name_.c_str(), O_RDONLY); + if (fd < 0) { + info.error_code_ = errno; + syslog(LOG_ERR, "Failed to open a file. %s\n", strerror(info.error_code_)); + continue; + } + + // AHCI device + if (boost::starts_with(hdd_device.name_.c_str(), "/dev/sd")) { + // Get IDENTIFY DEVICE for ATA drive + info.error_code_ = get_ata_identify(fd, &info); + if (info.error_code_ != 0) { + syslog( + LOG_ERR, "Failed to get IDENTIFY DEVICE for ATA drive. %s\n", strerror(info.error_code_)); + close(fd); + continue; + } + // Get SMART DATA for ATA drive + info.error_code_ = get_ata_SMARTData(fd, &info, hdd_device); + if (info.error_code_ != 0) { + syslog(LOG_ERR, "Failed to get SMART LOG for ATA drive. %s\n", strerror(info.error_code_)); + close(fd); + continue; + } + } else if (boost::starts_with(hdd_device.name_.c_str(), "/dev/nvme")) { // NVMe device + // Get Identify for NVMe drive + info.error_code_ = get_nvme_identify(fd, &info); + if (info.error_code_ != 0) { + syslog(LOG_ERR, "Failed to get Identify for NVMe drive. %s\n", strerror(info.error_code_)); + close(fd); + continue; + } + // Get SMART / Health Information for NVMe drive + info.error_code_ = get_nvme_SMARTData(fd, &info); + if (info.error_code_ != 0) { + syslog( + LOG_ERR, "Failed to get SMART / Health Information for NVMe drive. %s\n", + strerror(info.error_code_)); + close(fd); + continue; + } + } + + // Close the file descriptor FD + info.error_code_ = close(fd); + if (info.error_code_ < 0) { + info.error_code_ = errno; + syslog(LOG_ERR, "Failed to close the file descriptor FD. %s\n", strerror(info.error_code_)); + } + + list[hdd_device.name_] = info; + } + + oa << list; + return 0; +} + +/** + * @brief unmount device with lazy option + * @param [in] boost::archive::text_iarchive object + * @param [out] boost::archive::text_oarchive object + * @return 0 on success, otherwise error + */ +int unmount_device_with_lazy(boost::archive::text_iarchive & ia, boost::archive::text_oarchive & oa) +{ + std::vector unmount_devices; + std::vector responses; + + try { + ia & unmount_devices; + } catch (const std::exception & e) { + syslog(LOG_ERR, "exception. %s\n", e.what()); + return -1; + } + + for (auto & unmount_device : unmount_devices) { + int ret = 0; + boost::process::ipstream is_out; + boost::process::ipstream is_err; + + boost::process::child c( + "/bin/sh", "-c", fmt::format("umount -l {}", unmount_device.part_device_.c_str()), + boost::process::std_out > is_out, boost::process::std_err > is_err); + c.wait(); + + if (c.exit_code() != 0) { + syslog( + LOG_ERR, "Failed to execute umount command. %s\n", unmount_device.part_device_.c_str()); + ret = -1; + } + responses.push_back(ret); + } + + oa << responses; + return 0; +} + +/** + * @brief hdd_reader main procedure * @param [in] port port to listen */ void run(int port) @@ -468,14 +600,14 @@ void run(int port) return; } - // Restore list of devices - std::vector hdd_devices; + uint8_t request_id; + + buf[sizeof(buf) - 1] = '\0'; + std::istringstream iss(buf); + boost::archive::text_iarchive ia(iss); try { - buf[sizeof(buf) - 1] = '\0'; - std::istringstream iss(buf); - boost::archive::text_iarchive oa(iss); - oa & hdd_devices; + ia & request_id; } catch (const std::exception & e) { syslog(LOG_ERR, "exception. %s\n", e.what()); close(new_sock); @@ -483,71 +615,26 @@ void run(int port) return; } - HDDInfoList list; std::ostringstream oss; boost::archive::text_oarchive oa(oss); - for (auto & hdd_device : hdd_devices) { - HDDInfo info{}; - - // Open a file - int fd = open(hdd_device.name_.c_str(), O_RDONLY); - if (fd < 0) { - info.error_code_ = errno; - syslog(LOG_ERR, "Failed to open a file. %s\n", strerror(info.error_code_)); + switch (request_id) { + case HDDReaderRequestID::GetHDDInfo: + ret = get_hdd_info(ia, oa); + break; + case HDDReaderRequestID::UnmountDevice: + ret = unmount_device_with_lazy(ia, oa); + break; + default: + syslog(LOG_ERR, "Request ID is invalid. %d\n", request_id); continue; - } - - // AHCI device - if (boost::starts_with(hdd_device.name_.c_str(), "/dev/sd")) { - // Get IDENTIFY DEVICE for ATA drive - info.error_code_ = get_ata_identify(fd, &info); - if (info.error_code_ != 0) { - syslog( - LOG_ERR, "Failed to get IDENTIFY DEVICE for ATA drive. %s\n", - strerror(info.error_code_)); - close(fd); - continue; - } - // Get SMART DATA for ATA drive - info.error_code_ = get_ata_SMARTData(fd, &info, hdd_device); - if (info.error_code_ != 0) { - syslog( - LOG_ERR, "Failed to get SMART LOG for ATA drive. %s\n", strerror(info.error_code_)); - close(fd); - continue; - } - } else if (boost::starts_with(hdd_device.name_.c_str(), "/dev/nvme")) { // NVMe device - // Get Identify for NVMe drive - info.error_code_ = get_nvme_identify(fd, &info); - if (info.error_code_ != 0) { - syslog( - LOG_ERR, "Failed to get Identify for NVMe drive. %s\n", strerror(info.error_code_)); - close(fd); - continue; - } - // Get SMART / Health Information for NVMe drive - info.error_code_ = get_nvme_SMARTData(fd, &info); - if (info.error_code_ != 0) { - syslog( - LOG_ERR, "Failed to get SMART / Health Information for NVMe drive. %s\n", - strerror(info.error_code_)); - close(fd); - continue; - } - } - - // Close the file descriptor FD - info.error_code_ = close(fd); - if (info.error_code_ < 0) { - info.error_code_ = errno; - syslog(LOG_ERR, "Failed to close the file descriptor FD. %s\n", strerror(info.error_code_)); - } - - list[hdd_device.name_] = info; + } + if (ret != 0) { + close(new_sock); + close(sock); + return; } - oa << list; // Write N bytes of BUF to FD ret = write(new_sock, oss.str().c_str(), oss.str().length()); if (ret < 0) { diff --git a/system/system_monitor/src/hdd_monitor/hdd_monitor.cpp b/system/system_monitor/src/hdd_monitor/hdd_monitor.cpp index 3e05e76054f62..85a3ba7742cab 100644 --- a/system/system_monitor/src/hdd_monitor/hdd_monitor.cpp +++ b/system/system_monitor/src/hdd_monitor/hdd_monitor.cpp @@ -30,8 +30,10 @@ #include #include +#include #include +#include #include #include @@ -40,7 +42,8 @@ namespace bp = boost::process; HDDMonitor::HDDMonitor(const rclcpp::NodeOptions & options) : Node("hdd_monitor", options), updater_(this), - hdd_reader_port_(declare_parameter("hdd_reader_port", 7635)) + hdd_reader_port_(declare_parameter("hdd_reader_port", 7635)), + last_hdd_stat_update_time_{0, 0, this->get_clock()->get_clock_type()} { using namespace std::literals::chrono_literals; @@ -52,11 +55,23 @@ HDDMonitor::HDDMonitor(const rclcpp::NodeOptions & options) updater_.add("HDD Temperature", this, &HDDMonitor::checkSMARTTemperature); updater_.add("HDD PowerOnHours", this, &HDDMonitor::checkSMARTPowerOnHours); updater_.add("HDD TotalDataWritten", this, &HDDMonitor::checkSMARTTotalDataWritten); + updater_.add("HDD RecoveredError", this, &HDDMonitor::checkSMARTRecoveredError); updater_.add("HDD Usage", this, &HDDMonitor::checkUsage); + updater_.add("HDD ReadDataRate", this, &HDDMonitor::checkReadDataRate); + updater_.add("HDD WriteDataRate", this, &HDDMonitor::checkWriteDataRate); + updater_.add("HDD ReadIOPS", this, &HDDMonitor::checkReadIOPS); + updater_.add("HDD WriteIOPS", this, &HDDMonitor::checkWriteIOPS); + updater_.add("HDD Connection", this, &HDDMonitor::checkConnection); + + // get HDD connection status + updateHDDConnections(); // get HDD information from HDD reader for the first time updateHDDInfoList(); + // start HDD transfer measurement + startHDDTransferMeasurement(); + timer_ = rclcpp::create_timer(this, get_clock(), 1s, std::bind(&HDDMonitor::onTimer, this)); } @@ -75,6 +90,11 @@ void HDDMonitor::checkSMARTTotalDataWritten(diagnostic_updater::DiagnosticStatus checkSMART(stat, HDDSMARTInfoItem::TOTAL_DATA_WRITTEN); } +void HDDMonitor::checkSMARTRecoveredError(diagnostic_updater::DiagnosticStatusWrapper & stat) +{ + checkSMART(stat, HDDSMARTInfoItem::RECOVERED_ERROR); +} + void HDDMonitor::checkSMART( diagnostic_updater::DiagnosticStatusWrapper & stat, HDDSMARTInfoItem item) { @@ -103,11 +123,15 @@ void HDDMonitor::checkSMART( std::string val_str = ""; for (auto itr = hdd_params_.begin(); itr != hdd_params_.end(); ++itr, ++index) { + if (!hdd_connected_flags_[itr->first]) { + continue; + } + // Retrieve HDD information - auto hdd_itr = hdd_info_list_.find(itr->second.device_); + auto hdd_itr = hdd_info_list_.find(itr->second.disk_device_); if (hdd_itr == hdd_info_list_.end()) { stat.add(fmt::format("HDD {}: status", index), "hdd_reader error"); - stat.add(fmt::format("HDD {}: name", index), itr->first.c_str()); + stat.add(fmt::format("HDD {}: name", index), itr->second.part_device_.c_str()); stat.add(fmt::format("HDD {}: hdd_reader", index), strerror(ENOENT)); error_str = "hdd_reader error"; continue; @@ -115,7 +139,7 @@ void HDDMonitor::checkSMART( if (hdd_itr->second.error_code_ != 0) { stat.add(fmt::format("HDD {}: status", index), "hdd_reader error"); - stat.add(fmt::format("HDD {}: name", index), itr->first.c_str()); + stat.add(fmt::format("HDD {}: name", index), itr->second.part_device_.c_str()); stat.add(fmt::format("HDD {}: hdd_reader", index), strerror(hdd_itr->second.error_code_)); error_str = "hdd_reader error"; continue; @@ -132,7 +156,11 @@ void HDDMonitor::checkSMART( level = DiagStatus::WARN; } key_str = fmt::format("HDD {}: temperature", index); - val_str = fmt::format("{:.1f} DegC", temp); + if (hdd_itr->second.is_valid_temp_) { + val_str = fmt::format("{:.1f} DegC", temp); + } else { + val_str = "not available"; + } } break; case HDDSMARTInfoItem::POWER_ON_HOURS: { int64_t power_on_hours = static_cast(hdd_itr->second.power_on_hours_); @@ -142,7 +170,11 @@ void HDDMonitor::checkSMART( level = DiagStatus::WARN; } key_str = fmt::format("HDD {}: power on hours", index); - val_str = fmt::format("{} Hours", hdd_itr->second.power_on_hours_); + if (hdd_itr->second.is_valid_power_on_hours_) { + val_str = fmt::format("{} Hours", hdd_itr->second.power_on_hours_); + } else { + val_str = "not available"; + } } break; case HDDSMARTInfoItem::TOTAL_DATA_WRITTEN: { uint64_t total_data_written = static_cast(hdd_itr->second.total_data_written_); @@ -158,13 +190,31 @@ void HDDMonitor::checkSMART( val_str = "not available"; } } break; + case HDDSMARTInfoItem::RECOVERED_ERROR: { + int32_t recovered_error = static_cast(hdd_itr->second.recovered_error_); + if (initial_recovered_errors_.find(itr->first) == initial_recovered_errors_.end()) { + initial_recovered_errors_[itr->first] = recovered_error; + } + recovered_error -= initial_recovered_errors_[itr->first]; + + level = DiagStatus::OK; + if (recovered_error >= itr->second.recovered_error_warn_) { + level = DiagStatus::WARN; + } + key_str = fmt::format("HDD {}: recovered error", index); + if (hdd_itr->second.is_valid_recovered_error_) { + val_str = fmt::format("{}", hdd_itr->second.recovered_error_); + } else { + val_str = "not available"; + } + } break; default: break; } stat.add( fmt::format("HDD {}: status", index), smart_dicts_[static_cast(item)].at(level)); - stat.add(fmt::format("HDD {}: name", index), itr->second.device_.c_str()); + stat.add(fmt::format("HDD {}: name", index), itr->second.disk_device_.c_str()); stat.add(fmt::format("HDD {}: model", index), hdd_itr->second.model_.c_str()); stat.add(fmt::format("HDD {}: serial", index), hdd_itr->second.serial_.c_str()); stat.addf(key_str, val_str.c_str()); @@ -197,6 +247,10 @@ void HDDMonitor::checkUsage(diagnostic_updater::DiagnosticStatusWrapper & stat) std::string error_str = ""; for (auto itr = hdd_params_.begin(); itr != hdd_params_.end(); ++itr, ++hdd_index) { + if (!hdd_connected_flags_[itr->first]) { + continue; + } + // Get summary of disk space usage of ext4 // boost::process create file descriptor without O_CLOEXEC required for multithreading. @@ -225,8 +279,8 @@ void HDDMonitor::checkUsage(diagnostic_updater::DiagnosticStatusWrapper & stat) // Invoke shell to use shell wildcard expansion bp::child c( - "/bin/sh", "-c", fmt::format("df -Pm {}*", itr->first.c_str()), bp::std_out > is_out, - bp::std_err > is_err); + "/bin/sh", "-c", fmt::format("df -Pm {}*", itr->second.part_device_.c_str()), + bp::std_out > is_out, bp::std_err > is_err); c.wait(); if (c.exit_code() != 0) { @@ -234,7 +288,7 @@ void HDDMonitor::checkUsage(diagnostic_updater::DiagnosticStatusWrapper & stat) is_err >> os.rdbuf(); error_str = "df error"; stat.add(fmt::format("HDD {}: status", hdd_index), "df error"); - stat.add(fmt::format("HDD {}: name", hdd_index), itr->first.c_str()); + stat.add(fmt::format("HDD {}: name", hdd_index), itr->second.part_device_.c_str()); stat.add(fmt::format("HDD {}: df", hdd_index), os.str().c_str()); continue; } @@ -300,18 +354,155 @@ void HDDMonitor::checkUsage(diagnostic_updater::DiagnosticStatusWrapper & stat) SystemMonitorUtility::stopMeasurement(t_start, stat); } +void HDDMonitor::checkReadDataRate(diagnostic_updater::DiagnosticStatusWrapper & stat) +{ + checkStatistics(stat, HDDStatItem::READ_DATA_RATE); +} + +void HDDMonitor::checkWriteDataRate(diagnostic_updater::DiagnosticStatusWrapper & stat) +{ + checkStatistics(stat, HDDStatItem::WRITE_DATA_RATE); +} + +void HDDMonitor::checkReadIOPS(diagnostic_updater::DiagnosticStatusWrapper & stat) +{ + checkStatistics(stat, HDDStatItem::READ_IOPS); +} + +void HDDMonitor::checkWriteIOPS(diagnostic_updater::DiagnosticStatusWrapper & stat) +{ + checkStatistics(stat, HDDStatItem::WRITE_IOPS); +} + +void HDDMonitor::checkStatistics( + diagnostic_updater::DiagnosticStatusWrapper & stat, HDDStatItem item) +{ + // Remember start time to measure elapsed time + const auto t_start = SystemMonitorUtility::startMeasurement(); + + if (hdd_params_.empty()) { + stat.summary(DiagStatus::ERROR, "invalid disk parameter"); + return; + } + + int hdd_index = 0; + int whole_level = DiagStatus::OK; + std::string error_str = ""; + std::string key_str = ""; + std::string val_str = ""; + + for (auto itr = hdd_params_.begin(); itr != hdd_params_.end(); ++itr, ++hdd_index) { + if (!hdd_connected_flags_[itr->first]) { + continue; + } + + int level = DiagStatus::OK; + + switch (item) { + case HDDStatItem::READ_DATA_RATE: { + float read_data_rate = hdd_stats_[itr->first].read_data_rate_MBs_; + + if (read_data_rate >= itr->second.read_data_rate_warn_) { + level = DiagStatus::WARN; + } + key_str = fmt::format("HDD {}: data rate of read", hdd_index); + val_str = fmt::format("{:.2f} MB/s", read_data_rate); + } break; + case HDDStatItem::WRITE_DATA_RATE: { + float write_data_rate = hdd_stats_[itr->first].write_data_rate_MBs_; + + if (write_data_rate >= itr->second.write_data_rate_warn_) { + level = DiagStatus::WARN; + } + key_str = fmt::format("HDD {}: data rate of write", hdd_index); + val_str = fmt::format("{:.2f} MB/s", write_data_rate); + } break; + case HDDStatItem::READ_IOPS: { + float read_iops = hdd_stats_[itr->first].read_iops_; + + if (read_iops >= itr->second.read_iops_warn_) { + level = DiagStatus::WARN; + } + key_str = fmt::format("HDD {}: IOPS of read", hdd_index); + val_str = fmt::format("{:.2f} IOPS", read_iops); + } break; + case HDDStatItem::WRITE_IOPS: { + float write_iops = hdd_stats_[itr->first].write_iops_; + + if (write_iops >= itr->second.write_iops_warn_) { + level = DiagStatus::WARN; + } + key_str = fmt::format("HDD {}: IOPS of write", hdd_index); + val_str = fmt::format("{:.2f} IOPS", write_iops); + } break; + default: + break; + } + + if (!hdd_stats_[itr->first].error_str_.empty()) { + error_str = hdd_stats_[itr->first].error_str_; + stat.add(fmt::format("HDD {}: status", hdd_index), error_str); + stat.add(fmt::format("HDD {}: name", hdd_index), itr->second.disk_device_.c_str()); + } else { + stat.add( + fmt::format("HDD {}: status", hdd_index), + stat_dicts_[static_cast(item)].at(level)); + stat.add(fmt::format("HDD {}: name", hdd_index), itr->second.disk_device_.c_str()); + stat.add(key_str, val_str.c_str()); + } + + whole_level = std::max(whole_level, level); + } + + if (!error_str.empty()) { + stat.summary(DiagStatus::ERROR, error_str); + } else { + stat.summary(whole_level, stat_dicts_[static_cast(item)].at(whole_level)); + } + + // Measure elapsed time since start time and report + SystemMonitorUtility::stopMeasurement(t_start, stat); +} + +void HDDMonitor::checkConnection(diagnostic_updater::DiagnosticStatusWrapper & stat) +{ + // Remember start time to measure elapsed time + const auto t_start = SystemMonitorUtility::startMeasurement(); + + if (hdd_params_.empty()) { + stat.summary(DiagStatus::ERROR, "invalid disk parameter"); + return; + } + + int hdd_index = 0; + int whole_level = DiagStatus::OK; + + for (auto itr = hdd_params_.begin(); itr != hdd_params_.end(); ++itr, ++hdd_index) { + int level = DiagStatus::OK; + + if (!hdd_connected_flags_[itr->first]) { + level = DiagStatus::WARN; + } + + stat.add(fmt::format("HDD {}: status", hdd_index), connection_dict_.at(level)); + stat.add(fmt::format("HDD {}: name", hdd_index), itr->second.disk_device_); + stat.add(fmt::format("HDD {}: mount point", hdd_index), itr->first.c_str()); + + whole_level = std::max(whole_level, level); + } + + stat.summary(whole_level, connection_dict_.at(whole_level)); + + // Measure elapsed time since start time and report + SystemMonitorUtility::stopMeasurement(t_start, stat); +} + void HDDMonitor::getHDDParams() { const auto num_disks = this->declare_parameter("num_disks", 0); for (auto i = 0; i < num_disks; ++i) { const auto prefix = "disks.disk" + std::to_string(i); - const auto name = declare_parameter(prefix + ".name", "/"); - - // Get device name from mount point - const auto device_name = getDeviceFromMountPoint(name); - if (device_name.empty()) { - continue; - } + const auto mount_point = declare_parameter(prefix + ".name", "/"); HDDParam param; param.temp_warn_ = declare_parameter(prefix + ".temp_warn", 55.0f); @@ -323,24 +514,26 @@ void HDDMonitor::getHDDParams() declare_parameter(prefix + ".total_data_written_warn", 4915200); param.total_data_written_warn_ = static_cast( total_data_written_warn_org * (1.0f - param.total_data_written_safety_factor_)); + param.recovered_error_warn_ = declare_parameter(prefix + ".recovered_error_warn", 1); param.free_warn_ = declare_parameter(prefix + ".free_warn", 5120); param.free_error_ = declare_parameter(prefix + ".free_error", 100); + param.read_data_rate_warn_ = declare_parameter(prefix + ".read_data_rate_warn", 360.0); + param.write_data_rate_warn_ = declare_parameter(prefix + ".write_data_rate_warn", 103.5); + param.read_iops_warn_ = declare_parameter(prefix + ".read_iops_warn", 63360.0); + param.write_iops_warn_ = declare_parameter(prefix + ".write_iops_warn", 24120.0); + param.temp_attribute_id_ = + static_cast(declare_parameter(prefix + ".temp_attribute_id", 0xC2)); + param.power_on_hours_attribute_id_ = + static_cast(declare_parameter(prefix + ".power_on_hours_attribute_id", 0x09)); + param.total_data_written_attribute_id_ = static_cast( + declare_parameter(prefix + ".total_data_written_attribute_id", 0xF1)); + param.recovered_error_attribute_id_ = + static_cast(declare_parameter(prefix + ".recovered_error_attribute_id", 0xC3)); - // Remove index number of partition for passing device name to hdd-reader - if (boost::starts_with(device_name, "/dev/sd")) { - const std::regex pattern("\\d+$"); - param.device_ = std::regex_replace(device_name, pattern, ""); - } else if (boost::starts_with(device_name, "/dev/nvme")) { - const std::regex pattern("p\\d+$"); - param.device_ = std::regex_replace(device_name, pattern, ""); - } - hdd_params_[device_name] = param; + hdd_params_[mount_point] = param; - HDDDevice device; - device.name_ = param.device_; - device.total_data_written_attribute_id_ = static_cast( - declare_parameter(prefix + ".total_data_written_attribute_id", 0xF1)); - hdd_devices_.push_back(device); + HDDStat stat; + hdd_stats_[mount_point] = stat; } } @@ -384,7 +577,12 @@ std::string HDDMonitor::getDeviceFromMountPoint(const std::string & mount_point) return ret; } -void HDDMonitor::onTimer() { updateHDDInfoList(); } +void HDDMonitor::onTimer() +{ + updateHDDConnections(); + updateHDDInfoList(); + updateHDDStatistics(); +} void HDDMonitor::updateHDDInfoList() { @@ -426,9 +624,27 @@ void HDDMonitor::updateHDDInfoList() return; } + uint8_t request_id = HDDReaderRequestID::GetHDDInfo; + std::vector hdd_devices; + for (auto itr = hdd_params_.begin(); itr != hdd_params_.end(); ++itr) { + if (!hdd_connected_flags_[itr->first]) { + continue; + } + + HDDDevice device; + device.name_ = itr->second.disk_device_; + device.temp_attribute_id_ = itr->second.temp_attribute_id_; + device.power_on_hours_attribute_id_ = itr->second.power_on_hours_attribute_id_; + device.total_data_written_attribute_id_ = itr->second.total_data_written_attribute_id_; + device.recovered_error_attribute_id_ = itr->second.recovered_error_attribute_id_; + + hdd_devices.push_back(device); + } + std::ostringstream oss; boost::archive::text_oarchive oa(oss); - oa & hdd_devices_; + oa & request_id; + oa & hdd_devices; // Write list of devices to FD ret = write(sock, oss.str().c_str(), oss.str().length()); @@ -477,5 +693,234 @@ void HDDMonitor::updateHDDInfoList() } } +void HDDMonitor::startHDDTransferMeasurement() +{ + for (auto & hdd_stat : hdd_stats_) { + hdd_stat.second.error_str_ = ""; + + if (!hdd_connected_flags_[hdd_stat.first]) { + continue; + } + + SysfsDevStat sfdevstat; + if (readSysfsDeviceStat(hdd_stat.second.device_, sfdevstat)) { + hdd_stat.second.error_str_ = "stat file read error"; + continue; + } + hdd_stat.second.last_sfdevstat_ = sfdevstat; + } + + last_hdd_stat_update_time_ = this->now(); +} + +void HDDMonitor::updateHDDStatistics() +{ + double duration_sec = (this->now() - last_hdd_stat_update_time_).seconds(); + + for (auto & hdd_stat : hdd_stats_) { + hdd_stat.second.error_str_ = ""; + + if (!hdd_connected_flags_[hdd_stat.first]) { + continue; + } + + SysfsDevStat sfdevstat; + if (readSysfsDeviceStat(hdd_stat.second.device_, sfdevstat)) { + hdd_stat.second.error_str_ = "stat file read error"; + continue; + } + + SysfsDevStat & last_sfdevstat = hdd_stat.second.last_sfdevstat_; + + hdd_stat.second.read_data_rate_MBs_ = getIncreaseSysfsDeviceStatValuePerSec( + sfdevstat.rd_sectors_, last_sfdevstat.rd_sectors_, duration_sec); + hdd_stat.second.read_data_rate_MBs_ /= 2048; + hdd_stat.second.write_data_rate_MBs_ = getIncreaseSysfsDeviceStatValuePerSec( + sfdevstat.wr_sectors_, last_sfdevstat.wr_sectors_, duration_sec); + hdd_stat.second.write_data_rate_MBs_ /= 2048; + hdd_stat.second.read_iops_ = getIncreaseSysfsDeviceStatValuePerSec( + sfdevstat.rd_ios_, last_sfdevstat.rd_ios_, duration_sec); + hdd_stat.second.write_iops_ = getIncreaseSysfsDeviceStatValuePerSec( + sfdevstat.wr_ios_, last_sfdevstat.wr_ios_, duration_sec); + + hdd_stat.second.last_sfdevstat_ = sfdevstat; + } + + last_hdd_stat_update_time_ = this->now(); +} + +double HDDMonitor::getIncreaseSysfsDeviceStatValuePerSec( + uint64_t cur_val, uint64_t last_val, double duration_sec) +{ + if (cur_val > last_val && duration_sec > 0.0) { + return static_cast(cur_val - last_val) / duration_sec; + } + return 0.0; +} + +int HDDMonitor::readSysfsDeviceStat(const std::string & device, SysfsDevStat & sfdevstat) +{ + int ret = -1; + unsigned int ios_pgr, tot_ticks, rq_ticks, wr_ticks; + uint64_t rd_ios, rd_merges_or_rd_sec, wr_ios, wr_merges; + uint64_t rd_sec_or_wr_ios, wr_sec, rd_ticks_or_wr_sec; + uint64_t dc_ios, dc_merges, dc_sec, dc_ticks; + + std::string filename("/sys/block/"); + filename += device + "/stat"; + FILE * fp = fopen(filename.c_str(), "r"); + if (fp == NULL) { + return ret; + } + + int i = fscanf( + fp, "%lu %lu %lu %lu %lu %lu %lu %u %u %u %u %lu %lu %lu %lu", &rd_ios, &rd_merges_or_rd_sec, + &rd_sec_or_wr_ios, &rd_ticks_or_wr_sec, &wr_ios, &wr_merges, &wr_sec, &wr_ticks, &ios_pgr, + &tot_ticks, &rq_ticks, &dc_ios, &dc_merges, &dc_sec, &dc_ticks); + + if (i >= 7) { + sfdevstat.rd_ios_ = rd_ios; + sfdevstat.rd_sectors_ = rd_sec_or_wr_ios; + sfdevstat.wr_ios_ = wr_ios; + sfdevstat.wr_sectors_ = wr_sec; + ret = 0; + } + + fclose(fp); + return ret; +} + +void HDDMonitor::updateHDDConnections() +{ + for (auto & hdd_param : hdd_params_) { + hdd_connected_flags_[hdd_param.first] = false; + + // Get device name from mount point + hdd_param.second.part_device_ = getDeviceFromMountPoint(hdd_param.first); + if (!hdd_param.second.part_device_.empty()) { + // Check the existence of device file + std::error_code ec; + if (std::filesystem::exists(hdd_param.second.part_device_, ec)) { + hdd_connected_flags_[hdd_param.first] = true; + + // Remove index number of partition for passing device name to hdd-reader + if (boost::starts_with(hdd_param.second.part_device_, "/dev/sd")) { + const std::regex pattern("\\d+$"); + hdd_param.second.disk_device_ = + std::regex_replace(hdd_param.second.part_device_, pattern, ""); + } else if (boost::starts_with(hdd_param.second.part_device_, "/dev/nvme")) { + const std::regex pattern("p\\d+$"); + hdd_param.second.disk_device_ = + std::regex_replace(hdd_param.second.part_device_, pattern, ""); + } + + const std::regex raw_pattern(".*/"); + hdd_stats_[hdd_param.first].device_ = + std::regex_replace(hdd_param.second.disk_device_, raw_pattern, ""); + } else { + // Deal with the issue that file system remains mounted when a drive is actually + // disconnected. + if (unmountDevice(hdd_param.second.part_device_)) { + RCLCPP_ERROR( + get_logger(), "Failed to unmount device : %s", hdd_param.second.part_device_.c_str()); + } + } + } + } +} + +int HDDMonitor::unmountDevice(std::string & device) +{ + // Create a new socket + int sock = socket(AF_INET, SOCK_STREAM, 0); + if (sock < 0) { + RCLCPP_ERROR(get_logger(), "socket create error. %s", strerror(errno)); + return -1; + } + + // Specify the receiving timeouts until reporting an error + struct timeval tv; + tv.tv_sec = 10; + tv.tv_usec = 0; + int ret = setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)); + if (ret < 0) { + RCLCPP_ERROR(get_logger(), "setsockopt error. %s", strerror(errno)); + close(sock); + return -1; + } + + // Connect the socket referred to by the file descriptor + sockaddr_in addr; + memset(&addr, 0, sizeof(sockaddr_in)); + addr.sin_family = AF_INET; + addr.sin_port = htons(hdd_reader_port_); + addr.sin_addr.s_addr = htonl(INADDR_ANY); + ret = connect(sock, (struct sockaddr *)&addr, sizeof(addr)); + if (ret < 0) { + RCLCPP_ERROR(get_logger(), "socket connect error. %s", strerror(errno)); + close(sock); + return -1; + } + + uint8_t request_id = HDDReaderRequestID::UnmountDevice; + std::vector umount_dev_infos; + UnmountDeviceInfo dev_info; + + dev_info.part_device_ = device; + umount_dev_infos.push_back(dev_info); + + std::ostringstream oss; + boost::archive::text_oarchive oa(oss); + oa & request_id; + oa & umount_dev_infos; + + // Write list of devices to FD + ret = write(sock, oss.str().c_str(), oss.str().length()); + if (ret < 0) { + RCLCPP_ERROR(get_logger(), "socket write error. %s", strerror(errno)); + close(sock); + return -1; + } + + // Receive messages from a socket + char buf[1024] = ""; + ret = recv(sock, buf, sizeof(buf) - 1, 0); + if (ret < 0) { + RCLCPP_ERROR(get_logger(), "socket recv error. %s", strerror(errno)); + close(sock); + return -1; + } + // No data received + if (ret == 0) { + RCLCPP_ERROR(get_logger(), "no data received from hdd_reader."); + close(sock); + return -1; + } + + // Close the file descriptor FD + ret = close(sock); + if (ret < 0) { + RCLCPP_ERROR(get_logger(), "socket close error. %s", strerror(errno)); + return -1; + } + + std::vector responses; + + // Restore responses + try { + std::istringstream iss(buf); + boost::archive::text_iarchive ia(iss); + ia >> responses; + } catch (const std::exception & e) { + RCLCPP_ERROR(get_logger(), "restore responses exception. %s", e.what()); + return -1; + } + if (responses.empty()) { + RCLCPP_ERROR(get_logger(), "responses from hdd_reader is empty."); + return -1; + } + return responses[0]; +} + #include RCLCPP_COMPONENTS_REGISTER_NODE(HDDMonitor)