From c97b8bc4436976d2d76d0efc701bf5ba54404eec Mon Sep 17 00:00:00 2001 From: v-nakayama7440-esol Date: Fri, 16 Sep 2022 17:17:45 +0900 Subject: [PATCH] fix(system_monitor): separate S.M.A.R.T. request and lazy unmount request for hdd_reader Signed-off-by: v-nakayama7440-esol --- .../include/hdd_reader/hdd_reader.hpp | 34 ++- .../hdd_monitor/hdd_monitor.hpp | 9 +- .../reader/hdd_reader/hdd_reader.cpp | 212 +++++++++++------- .../src/hdd_monitor/hdd_monitor.cpp | 122 ++++++++-- 4 files changed, 274 insertions(+), 103 deletions(-) diff --git a/system/system_monitor/include/hdd_reader/hdd_reader.hpp b/system/system_monitor/include/hdd_reader/hdd_reader.hpp index 9b34a3ece413b..2762dd8995b00 100644 --- a/system/system_monitor/include/hdd_reader/hdd_reader.hpp +++ b/system/system_monitor/include/hdd_reader/hdd_reader.hpp @@ -28,6 +28,14 @@ #include #include +/** + * @brief Enumeration of Request ID to hdd_reader + */ +enum HDDReaderRequestID { + GetHDDInfo, + UnmountDevice, +}; + /** * @brief HDD device */ @@ -40,9 +48,6 @@ struct HDDDevice total_data_written_attribute_id_; //!< @brief S.M.A.R.T attribute ID of total data written uint8_t recovered_error_attribute_id_; //!< @brief S.M.A.R.T attribute ID of recovered error - uint8_t unmount_request_flag_; //!< @brief unmount request flag - std::string part_device_; //!< @brief partition device - /** * @brief Load or save data members. * @param [inout] ar archive reference to load or save the serialized data members @@ -58,8 +63,6 @@ struct HDDDevice ar & power_on_hours_attribute_id_; ar & total_data_written_attribute_id_; ar & recovered_error_attribute_id_; - ar & unmount_request_flag_; - ar & part_device_; } }; @@ -106,6 +109,27 @@ struct HDDInfo } }; +/** + * @brief unmount device information + */ +struct UnmountDeviceInfo +{ + std::string part_device_; //!< @brief partition device + + /** + * @brief Load or save data members. + * @param [inout] ar archive reference to load or save the serialized data members + * @param [in] version version for the archive + * @note NOLINT syntax is needed since this is an interface to serialization and + * used inside boost serialization. + */ + template + void serialize(archive & ar, const unsigned /*version*/) // NOLINT(runtime/references) + { + ar & part_device_; + } +}; + /** * @brief HDD information list */ diff --git a/system/system_monitor/include/system_monitor/hdd_monitor/hdd_monitor.hpp b/system/system_monitor/include/system_monitor/hdd_monitor/hdd_monitor.hpp index 8eac2c632bf16..6e7d010645fe6 100644 --- a/system/system_monitor/include/system_monitor/hdd_monitor/hdd_monitor.hpp +++ b/system/system_monitor/include/system_monitor/hdd_monitor/hdd_monitor.hpp @@ -313,6 +313,13 @@ class HDDMonitor : public rclcpp::Node */ void updateHDDConnections(); + /** + * @brief unmount device + * @param [in] device device name + * @return result of success or failure + */ + int unmountDevice(std::string & device); + diagnostic_updater::Updater updater_; //!< @brief Updater class which advertises to /diagnostics rclcpp::TimerBase::SharedPtr timer_; //!< @brief timer to get HDD information from HDDReader @@ -322,8 +329,6 @@ class HDDMonitor : public rclcpp::Node std::map hdd_params_; //!< @brief list of error and warning levels std::map hdd_connected_flags_; //!< @brief list of flag whether HDD is connected - std::map - device_unmount_request_flags_; //!< @brief list of flag requesting device unmount std::map initial_recovered_errors_; //!< @brief list of initial recovered error count std::map hdd_stats_; //!< @brief list of HDD statistics diff --git a/system/system_monitor/reader/hdd_reader/hdd_reader.cpp b/system/system_monitor/reader/hdd_reader/hdd_reader.cpp index e07f66e25046e..dc5581b3430f1 100644 --- a/system/system_monitor/reader/hdd_reader/hdd_reader.cpp +++ b/system/system_monitor/reader/hdd_reader/hdd_reader.cpp @@ -407,27 +407,127 @@ int get_nvme_SMARTData(int fd, HDDInfo * info) return EXIT_SUCCESS; } +/** + * @brief get HDD information + * @param [in] boost::archive::text_iarchive object + * @param [out] boost::archive::text_oarchive object + * @return 0 on success, otherwise error + */ +int get_hdd_info(boost::archive::text_iarchive & ia, boost::archive::text_oarchive & oa) +{ + std::vector hdd_devices; + HDDInfoList list; + + try { + ia & hdd_devices; + } catch (const std::exception & e) { + syslog(LOG_ERR, "exception. %s\n", e.what()); + return -1; + } + + for (auto & hdd_device : hdd_devices) { + HDDInfo info{}; + + // Open a file + int fd = open(hdd_device.name_.c_str(), O_RDONLY); + if (fd < 0) { + info.error_code_ = errno; + syslog(LOG_ERR, "Failed to open a file. %s\n", strerror(info.error_code_)); + continue; + } + + // AHCI device + if (boost::starts_with(hdd_device.name_.c_str(), "/dev/sd")) { + // Get IDENTIFY DEVICE for ATA drive + info.error_code_ = get_ata_identify(fd, &info); + if (info.error_code_ != 0) { + syslog( + LOG_ERR, "Failed to get IDENTIFY DEVICE for ATA drive. %s\n", strerror(info.error_code_)); + close(fd); + continue; + } + // Get SMART DATA for ATA drive + info.error_code_ = get_ata_SMARTData(fd, &info, hdd_device); + if (info.error_code_ != 0) { + syslog(LOG_ERR, "Failed to get SMART LOG for ATA drive. %s\n", strerror(info.error_code_)); + close(fd); + continue; + } + } else if (boost::starts_with(hdd_device.name_.c_str(), "/dev/nvme")) { // NVMe device + // Get Identify for NVMe drive + info.error_code_ = get_nvme_identify(fd, &info); + if (info.error_code_ != 0) { + syslog(LOG_ERR, "Failed to get Identify for NVMe drive. %s\n", strerror(info.error_code_)); + close(fd); + continue; + } + // Get SMART / Health Information for NVMe drive + info.error_code_ = get_nvme_SMARTData(fd, &info); + if (info.error_code_ != 0) { + syslog( + LOG_ERR, "Failed to get SMART / Health Information for NVMe drive. %s\n", + strerror(info.error_code_)); + close(fd); + continue; + } + } + + // Close the file descriptor FD + info.error_code_ = close(fd); + if (info.error_code_ < 0) { + info.error_code_ = errno; + syslog(LOG_ERR, "Failed to close the file descriptor FD. %s\n", strerror(info.error_code_)); + } + + list[hdd_device.name_] = info; + } + + oa << list; + return 0; +} + /** * @brief unmount device with lazy option - * @param [in] device device name + * @param [in] boost::archive::text_iarchive object + * @param [out] boost::archive::text_oarchive object + * @return 0 on success, otherwise error */ -void unmount_device_with_lazy(std::string & device) +int unmount_device_with_lazy(boost::archive::text_iarchive & ia, boost::archive::text_oarchive & oa) { - boost::process::ipstream is_out; - boost::process::ipstream is_err; + std::vector unmount_devices; + std::vector responses; + + try { + ia & unmount_devices; + } catch (const std::exception & e) { + syslog(LOG_ERR, "exception. %s\n", e.what()); + return -1; + } + + for (auto & unmount_device : unmount_devices) { + int ret = 0; + boost::process::ipstream is_out; + boost::process::ipstream is_err; - boost::process::child c( - "/bin/sh", "-c", fmt::format("umount -l {}", device.c_str()), boost::process::std_out > is_out, - boost::process::std_err > is_err); - c.wait(); + boost::process::child c( + "/bin/sh", "-c", fmt::format("umount -l {}", unmount_device.part_device_.c_str()), + boost::process::std_out > is_out, boost::process::std_err > is_err); + c.wait(); - if (c.exit_code() != 0) { - syslog(LOG_ERR, "Failed to execute umount command. %s\n", device.c_str()); + if (c.exit_code() != 0) { + syslog( + LOG_ERR, "Failed to execute umount command. %s\n", unmount_device.part_device_.c_str()); + ret = -1; + } + responses.push_back(ret); } + + oa << responses; + return 0; } /** - * @brief check HDD temperature + * @brief hdd_reader main procedure * @param [in] port port to listen */ void run(int port) @@ -500,14 +600,14 @@ void run(int port) return; } - // Restore list of devices - std::vector hdd_devices; + uint8_t request_id; + + buf[sizeof(buf) - 1] = '\0'; + std::istringstream iss(buf); + boost::archive::text_iarchive ia(iss); try { - buf[sizeof(buf) - 1] = '\0'; - std::istringstream iss(buf); - boost::archive::text_iarchive oa(iss); - oa & hdd_devices; + ia & request_id; } catch (const std::exception & e) { syslog(LOG_ERR, "exception. %s\n", e.what()); close(new_sock); @@ -515,76 +615,26 @@ void run(int port) return; } - HDDInfoList list; std::ostringstream oss; boost::archive::text_oarchive oa(oss); - for (auto & hdd_device : hdd_devices) { - if (hdd_device.unmount_request_flag_) { - unmount_device_with_lazy(hdd_device.part_device_); - continue; - } - - HDDInfo info{}; - - // Open a file - int fd = open(hdd_device.name_.c_str(), O_RDONLY); - if (fd < 0) { - info.error_code_ = errno; - syslog(LOG_ERR, "Failed to open a file. %s\n", strerror(info.error_code_)); + switch (request_id) { + case HDDReaderRequestID::GetHDDInfo: + ret = get_hdd_info(ia, oa); + break; + case HDDReaderRequestID::UnmountDevice: + ret = unmount_device_with_lazy(ia, oa); + break; + default: + syslog(LOG_ERR, "Request ID is invalid. %d\n", request_id); continue; - } - - // AHCI device - if (boost::starts_with(hdd_device.name_.c_str(), "/dev/sd")) { - // Get IDENTIFY DEVICE for ATA drive - info.error_code_ = get_ata_identify(fd, &info); - if (info.error_code_ != 0) { - syslog( - LOG_ERR, "Failed to get IDENTIFY DEVICE for ATA drive. %s\n", - strerror(info.error_code_)); - close(fd); - continue; - } - // Get SMART DATA for ATA drive - info.error_code_ = get_ata_SMARTData(fd, &info, hdd_device); - if (info.error_code_ != 0) { - syslog( - LOG_ERR, "Failed to get SMART LOG for ATA drive. %s\n", strerror(info.error_code_)); - close(fd); - continue; - } - } else if (boost::starts_with(hdd_device.name_.c_str(), "/dev/nvme")) { // NVMe device - // Get Identify for NVMe drive - info.error_code_ = get_nvme_identify(fd, &info); - if (info.error_code_ != 0) { - syslog( - LOG_ERR, "Failed to get Identify for NVMe drive. %s\n", strerror(info.error_code_)); - close(fd); - continue; - } - // Get SMART / Health Information for NVMe drive - info.error_code_ = get_nvme_SMARTData(fd, &info); - if (info.error_code_ != 0) { - syslog( - LOG_ERR, "Failed to get SMART / Health Information for NVMe drive. %s\n", - strerror(info.error_code_)); - close(fd); - continue; - } - } - - // Close the file descriptor FD - info.error_code_ = close(fd); - if (info.error_code_ < 0) { - info.error_code_ = errno; - syslog(LOG_ERR, "Failed to close the file descriptor FD. %s\n", strerror(info.error_code_)); - } - - list[hdd_device.name_] = info; + } + if (ret != 0) { + close(new_sock); + close(sock); + return; } - oa << list; // Write N bytes of BUF to FD ret = write(new_sock, oss.str().c_str(), oss.str().length()); if (ret < 0) { diff --git a/system/system_monitor/src/hdd_monitor/hdd_monitor.cpp b/system/system_monitor/src/hdd_monitor/hdd_monitor.cpp index 624635028190a..85a3ba7742cab 100644 --- a/system/system_monitor/src/hdd_monitor/hdd_monitor.cpp +++ b/system/system_monitor/src/hdd_monitor/hdd_monitor.cpp @@ -624,29 +624,26 @@ void HDDMonitor::updateHDDInfoList() return; } + uint8_t request_id = HDDReaderRequestID::GetHDDInfo; std::vector hdd_devices; for (auto itr = hdd_params_.begin(); itr != hdd_params_.end(); ++itr) { - HDDDevice device; - - if (hdd_connected_flags_[itr->first]) { - device.name_ = itr->second.disk_device_; - device.temp_attribute_id_ = itr->second.temp_attribute_id_; - device.power_on_hours_attribute_id_ = itr->second.power_on_hours_attribute_id_; - device.total_data_written_attribute_id_ = itr->second.total_data_written_attribute_id_; - device.recovered_error_attribute_id_ = itr->second.recovered_error_attribute_id_; - device.unmount_request_flag_ = 0; - } else if (device_unmount_request_flags_[itr->first]) { - device.part_device_ = itr->second.part_device_; - device.unmount_request_flag_ = 1; - } else { + if (!hdd_connected_flags_[itr->first]) { continue; } + HDDDevice device; + device.name_ = itr->second.disk_device_; + device.temp_attribute_id_ = itr->second.temp_attribute_id_; + device.power_on_hours_attribute_id_ = itr->second.power_on_hours_attribute_id_; + device.total_data_written_attribute_id_ = itr->second.total_data_written_attribute_id_; + device.recovered_error_attribute_id_ = itr->second.recovered_error_attribute_id_; + hdd_devices.push_back(device); } std::ostringstream oss; boost::archive::text_oarchive oa(oss); + oa & request_id; oa & hdd_devices; // Write list of devices to FD @@ -797,7 +794,6 @@ void HDDMonitor::updateHDDConnections() { for (auto & hdd_param : hdd_params_) { hdd_connected_flags_[hdd_param.first] = false; - device_unmount_request_flags_[hdd_param.first] = false; // Get device name from mount point hdd_param.second.part_device_ = getDeviceFromMountPoint(hdd_param.first); @@ -824,11 +820,107 @@ void HDDMonitor::updateHDDConnections() } else { // Deal with the issue that file system remains mounted when a drive is actually // disconnected. - device_unmount_request_flags_[hdd_param.first] = true; + if (unmountDevice(hdd_param.second.part_device_)) { + RCLCPP_ERROR( + get_logger(), "Failed to unmount device : %s", hdd_param.second.part_device_.c_str()); + } } } } } +int HDDMonitor::unmountDevice(std::string & device) +{ + // Create a new socket + int sock = socket(AF_INET, SOCK_STREAM, 0); + if (sock < 0) { + RCLCPP_ERROR(get_logger(), "socket create error. %s", strerror(errno)); + return -1; + } + + // Specify the receiving timeouts until reporting an error + struct timeval tv; + tv.tv_sec = 10; + tv.tv_usec = 0; + int ret = setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)); + if (ret < 0) { + RCLCPP_ERROR(get_logger(), "setsockopt error. %s", strerror(errno)); + close(sock); + return -1; + } + + // Connect the socket referred to by the file descriptor + sockaddr_in addr; + memset(&addr, 0, sizeof(sockaddr_in)); + addr.sin_family = AF_INET; + addr.sin_port = htons(hdd_reader_port_); + addr.sin_addr.s_addr = htonl(INADDR_ANY); + ret = connect(sock, (struct sockaddr *)&addr, sizeof(addr)); + if (ret < 0) { + RCLCPP_ERROR(get_logger(), "socket connect error. %s", strerror(errno)); + close(sock); + return -1; + } + + uint8_t request_id = HDDReaderRequestID::UnmountDevice; + std::vector umount_dev_infos; + UnmountDeviceInfo dev_info; + + dev_info.part_device_ = device; + umount_dev_infos.push_back(dev_info); + + std::ostringstream oss; + boost::archive::text_oarchive oa(oss); + oa & request_id; + oa & umount_dev_infos; + + // Write list of devices to FD + ret = write(sock, oss.str().c_str(), oss.str().length()); + if (ret < 0) { + RCLCPP_ERROR(get_logger(), "socket write error. %s", strerror(errno)); + close(sock); + return -1; + } + + // Receive messages from a socket + char buf[1024] = ""; + ret = recv(sock, buf, sizeof(buf) - 1, 0); + if (ret < 0) { + RCLCPP_ERROR(get_logger(), "socket recv error. %s", strerror(errno)); + close(sock); + return -1; + } + // No data received + if (ret == 0) { + RCLCPP_ERROR(get_logger(), "no data received from hdd_reader."); + close(sock); + return -1; + } + + // Close the file descriptor FD + ret = close(sock); + if (ret < 0) { + RCLCPP_ERROR(get_logger(), "socket close error. %s", strerror(errno)); + return -1; + } + + std::vector responses; + + // Restore responses + try { + std::istringstream iss(buf); + boost::archive::text_iarchive ia(iss); + ia >> responses; + } catch (const std::exception & e) { + RCLCPP_ERROR(get_logger(), "restore responses exception. %s", e.what()); + return -1; + } + if (responses.empty()) { + RCLCPP_ERROR(get_logger(), "responses from hdd_reader is empty."); + return -1; + } + return responses[0]; +} + #include RCLCPP_COMPONENTS_REGISTER_NODE(HDDMonitor)