diff --git a/sonic-thermalctld/scripts/thermalctld b/sonic-thermalctld/scripts/thermalctld index 5c14499841f2..b98cd5c4a2cd 100644 --- a/sonic-thermalctld/scripts/thermalctld +++ b/sonic-thermalctld/scripts/thermalctld @@ -191,7 +191,7 @@ class FanUpdater(logger.Logger): FAN_INFO_TABLE_NAME = 'FAN_INFO' FAN_DRAWER_INFO_TABLE_NAME = 'FAN_DRAWER_INFO' - def __init__(self, chassis): + def __init__(self, chassis, task_stopping_event): """ Initializer for FanUpdater :param chassis: Object representing a platform chassis @@ -199,6 +199,7 @@ class FanUpdater(logger.Logger): super(FanUpdater, self).__init__(SYSLOG_IDENTIFIER) self.chassis = chassis + self.task_stopping_event = task_stopping_event self.fan_status_dict = {} state_db = daemon_base.db_connect("STATE_DB") self.table = swsscommon.Table(state_db, FanUpdater.FAN_INFO_TABLE_NAME) @@ -236,8 +237,12 @@ class FanUpdater(logger.Logger): FanStatus.reset_fan_counter() for drawer_index, drawer in enumerate(self.chassis.get_all_fan_drawers()): + if self.task_stopping_event.is_set(): + return self._refresh_fan_drawer_status(drawer, drawer_index) for fan_index, fan in enumerate(drawer.get_all_fans()): + if self.task_stopping_event.is_set(): + return try: self._refresh_fan_status(drawer, drawer_index, fan, fan_index) except Exception as e: @@ -245,6 +250,8 @@ class FanUpdater(logger.Logger): for psu_index, psu in enumerate(self.chassis.get_all_psus()): for fan_index, fan in enumerate(psu.get_all_fans()): + if self.task_stopping_event.is_set(): + return try: self._refresh_fan_status(psu, psu_index, fan, fan_index, True) except Exception as e: @@ -396,6 +403,8 @@ class FanUpdater(logger.Logger): def _update_led_color(self): for fan_name, fan_status in self.fan_status_dict.items(): + if self.task_stopping_event.is_set(): + return try: fvs = swsscommon.FieldValuePairs([ ('led_status', str(try_get(fan_status.fan.get_status_led))) @@ -408,6 +417,8 @@ class FanUpdater(logger.Logger): self.table.set(fan_name, fvs) for drawer in self.chassis.get_all_fan_drawers(): + if self.task_stopping_event.is_set(): + return drawer_name = try_get(drawer.get_name) if drawer_name == NOT_AVAILABLE: continue @@ -510,7 +521,7 @@ class TemperatureUpdater(logger.Logger): # Temperature information table name in database TEMPER_INFO_TABLE_NAME = 'TEMPERATURE_INFO' - def __init__(self, chassis): + def __init__(self, chassis, task_stopping_event): """ Initializer of TemperatureUpdater :param chassis: Object representing a platform chassis @@ -518,6 +529,7 @@ class TemperatureUpdater(logger.Logger): super(TemperatureUpdater, self).__init__(SYSLOG_IDENTIFIER) self.chassis = chassis + self.task_stopping_event = task_stopping_event self.temperature_status_dict = {} state_db = daemon_base.db_connect("STATE_DB") self.table = swsscommon.Table(state_db, TemperatureUpdater.TEMPER_INFO_TABLE_NAME) @@ -562,6 +574,8 @@ class TemperatureUpdater(logger.Logger): """ self.log_debug("Start temperature updating") for index, thermal in enumerate(self.chassis.get_all_thermals()): + if self.task_stopping_event.is_set(): + return try: self._refresh_temperature_status(CHASSIS_INFO_KEY, thermal, index) except Exception as e: @@ -570,6 +584,8 @@ class TemperatureUpdater(logger.Logger): for psu_index, psu in enumerate(self.chassis.get_all_psus()): parent_name = 'PSU {}'.format(psu_index + 1) for thermal_index, thermal in enumerate(psu.get_all_thermals()): + if self.task_stopping_event.is_set(): + return try: self._refresh_temperature_status(parent_name, thermal, thermal_index) except Exception as e: @@ -578,6 +594,8 @@ class TemperatureUpdater(logger.Logger): for sfp_index, sfp in enumerate(self.chassis.get_all_sfps()): parent_name = 'SFP {}'.format(sfp_index + 1) for thermal_index, thermal in enumerate(sfp.get_all_thermals()): + if self.task_stopping_event.is_set(): + return try: self._refresh_temperature_status(parent_name, thermal, thermal_index) except Exception as e: @@ -686,8 +704,8 @@ class ThermalMonitor(ProcessTaskBase): # Set minimum logging level to INFO self.logger.set_min_log_priority_info() - self.fan_updater = FanUpdater(chassis) - self.temperature_updater = TemperatureUpdater(chassis) + self.fan_updater = FanUpdater(chassis, self.task_stopping_event) + self.temperature_updater = TemperatureUpdater(chassis, self.task_stopping_event) def main(self): begin = time.time() @@ -789,6 +807,8 @@ class ThermalControlDaemon(daemon_base.DaemonBase): if sig in FATAL_SIGNALS: self.log_info("Caught signal '{}' - exiting...".format(SIGNALS_TO_NAMES_DICT[sig])) exit_code = 128 + sig # Make sure we exit with a non-zero code so that supervisor will try to restart us + self.thermal_monitor.task_stop() + self.thermal_manager.stop() self.stop_event.set() elif sig in NONFATAL_SIGNALS: self.log_info("Caught signal '{}' - ignoring...".format(SIGNALS_TO_NAMES_DICT[sig])) diff --git a/sonic-thermalctld/tests/test_thermalctld.py b/sonic-thermalctld/tests/test_thermalctld.py index d2f647384f13..7c272826b7d6 100644 --- a/sonic-thermalctld/tests/test_thermalctld.py +++ b/sonic-thermalctld/tests/test_thermalctld.py @@ -1,5 +1,6 @@ import os import sys +import multiprocessing from imp import load_source # TODO: Replace with importlib once we no longer need to support Python 2 # TODO: Clean this up once we no longer need to support Python 2 @@ -148,7 +149,7 @@ class TestFanUpdater(object): Test cases to cover functionality in FanUpdater class """ def test_deinit(self): - fan_updater = thermalctld.FanUpdater(MockChassis()) + fan_updater = thermalctld.FanUpdater(MockChassis(), multiprocessing.Event()) fan_updater.fan_status_dict = {'key1': 'value1', 'key2': 'value2'} fan_updater.table._del = mock.MagicMock() @@ -161,7 +162,7 @@ def test_deinit(self): @mock.patch('thermalctld.update_entity_info', mock.MagicMock()) def test_refresh_fan_drawer_status_fan_drawer_get_name_not_impl(self): # Test case where fan_drawer.get_name is not implemented - fan_updater = thermalctld.FanUpdater(MockChassis()) + fan_updater = thermalctld.FanUpdater(MockChassis(), multiprocessing.Event()) mock_fan_drawer = mock.MagicMock() fan_updater._refresh_fan_drawer_status(mock_fan_drawer, 1) assert thermalctld.update_entity_info.call_count == 0 @@ -175,7 +176,7 @@ def test_update_fan_with_exception(self): fan.make_over_speed() chassis.get_all_fans().append(fan) - fan_updater = thermalctld.FanUpdater(chassis) + fan_updater = thermalctld.FanUpdater(chassis, multiprocessing.Event()) fan_updater.update() assert fan.get_status_led() == MockFan.STATUS_LED_COLOR_RED assert fan_updater.log_warning.call_count == 1 @@ -192,7 +193,7 @@ def test_set_fan_led_exception(self): mock_fan = MockFan() mock_fan.set_status_led = mock.MagicMock(side_effect=NotImplementedError) - fan_updater = thermalctld.FanUpdater(MockChassis()) + fan_updater = thermalctld.FanUpdater(MockChassis(), multiprocessing.Event()) fan_updater._set_fan_led(mock_fan_drawer, mock_fan, 'Test Fan', fan_status) assert fan_updater.log_warning.call_count == 1 fan_updater.log_warning.assert_called_with('Failed to set status LED for fan Test Fan, set_status_led not implemented') @@ -200,7 +201,7 @@ def test_set_fan_led_exception(self): def test_fan_absent(self): chassis = MockChassis() chassis.make_absent_fan() - fan_updater = thermalctld.FanUpdater(chassis) + fan_updater = thermalctld.FanUpdater(chassis, multiprocessing.Event()) fan_updater.update() fan_list = chassis.get_all_fans() assert fan_list[0].get_status_led() == MockFan.STATUS_LED_COLOR_RED @@ -224,7 +225,7 @@ def test_fan_absent(self): def test_fan_faulty(self): chassis = MockChassis() chassis.make_faulty_fan() - fan_updater = thermalctld.FanUpdater(chassis) + fan_updater = thermalctld.FanUpdater(chassis, multiprocessing.Event()) fan_updater.update() fan_list = chassis.get_all_fans() assert fan_list[0].get_status_led() == MockFan.STATUS_LED_COLOR_RED @@ -248,7 +249,7 @@ def test_fan_faulty(self): def test_fan_under_speed(self): chassis = MockChassis() chassis.make_under_speed_fan() - fan_updater = thermalctld.FanUpdater(chassis) + fan_updater = thermalctld.FanUpdater(chassis, multiprocessing.Event()) fan_updater.update() fan_list = chassis.get_all_fans() assert fan_list[0].get_status_led() == MockFan.STATUS_LED_COLOR_RED @@ -264,7 +265,7 @@ def test_fan_under_speed(self): def test_fan_over_speed(self): chassis = MockChassis() chassis.make_over_speed_fan() - fan_updater = thermalctld.FanUpdater(chassis) + fan_updater = thermalctld.FanUpdater(chassis, multiprocessing.Event()) fan_updater.update() fan_list = chassis.get_all_fans() assert fan_list[0].get_status_led() == MockFan.STATUS_LED_COLOR_RED @@ -283,7 +284,7 @@ def test_update_psu_fans(self): mock_fan = MockFan() psu._fan_list.append(mock_fan) chassis._psu_list.append(psu) - fan_updater = thermalctld.FanUpdater(chassis) + fan_updater = thermalctld.FanUpdater(chassis, multiprocessing.Event()) fan_updater.update() assert fan_updater.log_warning.call_count == 0 @@ -331,7 +332,7 @@ def test_insufficient_fan_number(): chassis = MockChassis() chassis.make_absent_fan() chassis.make_faulty_fan() - fan_updater = thermalctld.FanUpdater(chassis) + fan_updater = thermalctld.FanUpdater(chassis, multiprocessing.Event()) fan_updater.update() assert fan_updater.log_warning.call_count == 3 expected_calls = [ @@ -415,7 +416,7 @@ class TestTemperatureUpdater(object): """ def test_deinit(self): chassis = MockChassis() - temp_updater = thermalctld.TemperatureUpdater(chassis) + temp_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) temp_updater.temperature_status_dict = {'key1': 'value1', 'key2': 'value2'} temp_updater.table._del = mock.MagicMock() @@ -423,11 +424,12 @@ def test_deinit(self): assert temp_updater.table._del.call_count == 2 expected_calls = [mock.call('key1'), mock.call('key2')] temp_updater.table._del.assert_has_calls(expected_calls, any_order=True) + def test_over_temper(self): chassis = MockChassis() chassis.make_over_temper_thermal() - temperature_updater = thermalctld.TemperatureUpdater(chassis) + temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) temperature_updater.update() thermal_list = chassis.get_all_thermals() assert temperature_updater.log_warning.call_count == 1 @@ -441,7 +443,7 @@ def test_over_temper(self): def test_under_temper(self): chassis = MockChassis() chassis.make_under_temper_thermal() - temperature_updater = thermalctld.TemperatureUpdater(chassis) + temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) temperature_updater.update() thermal_list = chassis.get_all_thermals() assert temperature_updater.log_warning.call_count == 1 @@ -458,7 +460,7 @@ def test_update_psu_thermals(self): mock_thermal = MockThermal() psu._thermal_list.append(mock_thermal) chassis._psu_list.append(psu) - temperature_updater = thermalctld.TemperatureUpdater(chassis) + temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) temperature_updater.update() assert temperature_updater.log_warning.call_count == 0 @@ -478,7 +480,7 @@ def test_update_sfp_thermals(self): mock_thermal = MockThermal() sfp._thermal_list.append(mock_thermal) chassis._sfp_list.append(sfp) - temperature_updater = thermalctld.TemperatureUpdater(chassis) + temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) temperature_updater.update() assert temperature_updater.log_warning.call_count == 0 @@ -499,7 +501,7 @@ def test_update_thermal_with_exception(self): thermal.make_over_temper() chassis.get_all_thermals().append(thermal) - temperature_updater = thermalctld.TemperatureUpdater(chassis) + temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) temperature_updater.update() assert temperature_updater.log_warning.call_count == 2 @@ -524,17 +526,17 @@ def test_updater_thermal_check_modular_chassis(): chassis = MockChassis() assert chassis.is_modular_chassis() == False - temperature_updater = thermalctld.TemperatureUpdater(chassis) + temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) assert temperature_updater.chassis_table == None chassis.set_modular_chassis(True) chassis.set_my_slot(-1) - temperature_updater = thermalctld.TemperatureUpdater(chassis) + temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) assert temperature_updater.chassis_table == None my_slot = 1 chassis.set_my_slot(my_slot) - temperature_updater = thermalctld.TemperatureUpdater(chassis) + temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) assert temperature_updater.chassis_table != None assert temperature_updater.chassis_table.table_name == '{}_{}'.format(TEMPER_INFO_TABLE_NAME, str(my_slot)) @@ -547,7 +549,7 @@ def test_updater_thermal_check_chassis_table(): chassis.set_modular_chassis(True) chassis.set_my_slot(1) - temperature_updater = thermalctld.TemperatureUpdater(chassis) + temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) temperature_updater.update() assert temperature_updater.chassis_table.get_size() == chassis.get_num_thermals() @@ -566,7 +568,7 @@ def test_updater_thermal_check_min_max(): chassis.set_modular_chassis(True) chassis.set_my_slot(1) - temperature_updater = thermalctld.TemperatureUpdater(chassis) + temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) temperature_updater.update() slot_dict = temperature_updater.chassis_table.get(thermal.get_name()) @@ -580,12 +582,14 @@ def test_signal_handler(): daemon_thermalctld.stop_event.set = mock.MagicMock() daemon_thermalctld.log_info = mock.MagicMock() daemon_thermalctld.log_warning = mock.MagicMock() + daemon_thermalctld.thermal_manager.stop = mock.MagicMock() daemon_thermalctld.signal_handler(thermalctld.signal.SIGHUP, None) daemon_thermalctld.deinit() # Deinit becuase the test will hang if we assert assert daemon_thermalctld.log_info.call_count == 1 daemon_thermalctld.log_info.assert_called_with("Caught signal 'SIGHUP' - ignoring...") assert daemon_thermalctld.log_warning.call_count == 0 assert daemon_thermalctld.stop_event.set.call_count == 0 + assert daemon_thermalctld.thermal_manager.stop.call_count == 0 assert thermalctld.exit_code == thermalctld.ERR_UNKNOWN # Test SIGINT @@ -593,6 +597,7 @@ def test_signal_handler(): daemon_thermalctld.stop_event.set = mock.MagicMock() daemon_thermalctld.log_info = mock.MagicMock() daemon_thermalctld.log_warning = mock.MagicMock() + daemon_thermalctld.thermal_manager.stop = mock.MagicMock() test_signal = thermalctld.signal.SIGINT daemon_thermalctld.signal_handler(test_signal, None) daemon_thermalctld.deinit() # Deinit becuase the test will hang if we assert @@ -600,6 +605,7 @@ def test_signal_handler(): daemon_thermalctld.log_info.assert_called_with("Caught signal 'SIGINT' - exiting...") assert daemon_thermalctld.log_warning.call_count == 0 assert daemon_thermalctld.stop_event.set.call_count == 1 + assert daemon_thermalctld.thermal_manager.stop.call_count == 1 assert thermalctld.exit_code == (128 + test_signal) # Test SIGTERM @@ -608,6 +614,7 @@ def test_signal_handler(): daemon_thermalctld.stop_event.set = mock.MagicMock() daemon_thermalctld.log_info = mock.MagicMock() daemon_thermalctld.log_warning = mock.MagicMock() + daemon_thermalctld.thermal_manager.stop = mock.MagicMock() test_signal = thermalctld.signal.SIGTERM daemon_thermalctld.signal_handler(test_signal, None) daemon_thermalctld.deinit() # Deinit becuase the test will hang if we assert @@ -615,6 +622,7 @@ def test_signal_handler(): daemon_thermalctld.log_info.assert_called_with("Caught signal 'SIGTERM' - exiting...") assert daemon_thermalctld.log_warning.call_count == 0 assert daemon_thermalctld.stop_event.set.call_count == 1 + assert daemon_thermalctld.thermal_manager.stop.call_count == 1 assert thermalctld.exit_code == (128 + test_signal) # Test an unhandled signal @@ -623,12 +631,14 @@ def test_signal_handler(): daemon_thermalctld.stop_event.set = mock.MagicMock() daemon_thermalctld.log_info = mock.MagicMock() daemon_thermalctld.log_warning = mock.MagicMock() + daemon_thermalctld.thermal_manager.stop = mock.MagicMock() daemon_thermalctld.signal_handler(thermalctld.signal.SIGUSR1, None) daemon_thermalctld.deinit() # Deinit becuase the test will hang if we assert assert daemon_thermalctld.log_warning.call_count == 1 daemon_thermalctld.log_warning.assert_called_with("Caught unhandled signal 'SIGUSR1' - ignoring...") assert daemon_thermalctld.log_info.call_count == 0 assert daemon_thermalctld.stop_event.set.call_count == 0 + assert daemon_thermalctld.thermal_manager.stop.call_count == 0 assert thermalctld.exit_code == thermalctld.ERR_UNKNOWN