From 4ef8b38edb682cb09547eefa14b727fc762b670e Mon Sep 17 00:00:00 2001 From: yozhao101 <56170650+yozhao101@users.noreply.github.com> Date: Thu, 2 Jun 2022 15:59:36 -0700 Subject: [PATCH] [hostcfgd] Initialize `Restart=` in feature's systemd config by the value of `auto_restart` in `CONFIG_DB` (#10915) Why I did it Recently the nightly testing pipeline found that the autorestart test case was failed when it was run against master image. The reason is Restart= field in each container's systemd configuration file was set to Restart=no even the value of auto_restart field in FEATURE table of CONFIG_DB is enabled. This issue introduced by #10168 can be reproduced by the following steps: Issues the config command to disable the auto-restart feature of a container Runs command config reload or config reload minigraph to enable auto-restart of the container Checks Restart= field in the container's systemd config file mentioned in step 1 by running the command sudo systemctl cat .service Initially this PR (#10168) wants to revert the changes proposed by this: #8861. However, it did not fully revert all the changes. How I did it When hostcfgd started or was restarted, the Restart= field in each container's systemd configuration file should be initialized according to the value of auto_restart field in FEATURE table of CONFIG_DB. How to verify it I verified this change by running auto-restart test case against newly built master image and also ran the unittest: --- src/sonic-host-services/scripts/hostcfgd | 78 ++++--- .../tests/hostcfgd/hostcfgd_test.py | 196 +++++++++++------- .../tests/hostcfgd/test_vectors.py | 28 ++- 3 files changed, 188 insertions(+), 114 deletions(-) diff --git a/src/sonic-host-services/scripts/hostcfgd b/src/sonic-host-services/scripts/hostcfgd index 7f8f4be0b2e7..41da37490be4 100755 --- a/src/sonic-host-services/scripts/hostcfgd +++ b/src/sonic-host-services/scripts/hostcfgd @@ -104,6 +104,7 @@ def obfuscate(data): else: return data + def get_pid(procname): for dirname in os.listdir('/proc'): if dirname == 'curproc': @@ -117,6 +118,7 @@ def get_pid(procname): return dirname return "" + class Feature(object): """ Represents a feature configuration from CONFIG_DB data. """ @@ -182,10 +184,10 @@ class FeatureHandler(object): self._cached_config = {} self.is_multi_npu = device_info.is_multi_npu() - def handle(self, feature_name, op, feature_cfg): + def handler(self, feature_name, op, feature_cfg): if not feature_cfg: syslog.syslog(syslog.LOG_INFO, "Deregistering feature {}".format(feature_name)) - self._cached_config.pop(feature_name) + self._cached_config.pop(feature_name, None) self._feature_state_table._del(feature_name) return @@ -197,7 +199,11 @@ class FeatureHandler(object): # the next called self.update_feature_state will start it again. If it will fail # again the auto restart will kick-in. Another order may leave it in failed state # and not auto restart. - self.update_feature_auto_restart(feature, feature_name) + if self._cached_config[feature_name].auto_restart != feature.auto_restart: + syslog.syslog(syslog.LOG_INFO, "Auto-restart status of feature '{}' is changed from '{}' to '{}' ..." + .format(feature_name, self._cached_config[feature_name].auto_restart, feature.auto_restart)) + self.update_systemd_config(feature) + self._cached_config[feature_name].auto_restart = feature.auto_restart # Enable/disable the container service if the feature state was changed from its previous state. if self._cached_config[feature_name].state != feature.state: @@ -220,7 +226,7 @@ class FeatureHandler(object): feature = Feature(feature_name, feature_table[feature_name], self._device_config) self._cached_config.setdefault(feature_name, feature) - self.update_feature_auto_restart(feature, feature_name) + self.update_systemd_config(feature) self.update_feature_state(feature) self.resync_feature_state(feature) @@ -265,41 +271,45 @@ class FeatureHandler(object): return True - def update_feature_auto_restart(self, feature, feature_name): - dir_name = self.SYSTEMD_SERVICE_CONF_DIR.format(feature_name) - auto_restart_conf = os.path.join(dir_name, 'auto_restart.conf') + def update_systemd_config(self, feature_config): + """Updates `Restart=` field in feature's systemd configuration file + according to the value of `auto_restart` field in `FEATURE` table of `CONFIG_DB`. - write_conf = False - if not os.path.exists(auto_restart_conf): # if the auto_restart_conf file is not found, set it - write_conf = True - - if self._cached_config[feature_name].auto_restart != feature.auto_restart: - write_conf = True + Args: + feature: An object represents a feature's configuration in `FEATURE` + table of `CONFIG_DB`. - if not write_conf: - return + Returns: + None. + """ + restart_field_str = "always" if "enabled" in feature_config.auto_restart else "no" + feature_systemd_config = "[Service]\nRestart={}\n".format(restart_field_str) + feature_names, feature_suffixes = self.get_multiasic_feature_instances(feature_config) - self._cached_config[feature_name].auto_restart = feature.auto_restart # Update Cache + # On multi-ASIC device, creates systemd configuration file for each feature instance + # residing in difference namespace. + for feature_name in feature_names: + syslog.syslog(syslog.LOG_INFO, "Updating feature '{}' systemd config file related to auto-restart ..." + .format(feature_name)) + feature_systemd_config_dir_path = self.SYSTEMD_SERVICE_CONF_DIR.format(feature_name) + feature_systemd_config_file_path = os.path.join(feature_systemd_config_dir_path, 'auto_restart.conf') - restart_config = "always" if feature.auto_restart == "enabled" else "no" - service_conf = "[Service]\nRestart={}\n".format(restart_config) - feature_names, feature_suffixes = self.get_feature_attribute(feature) + if not os.path.exists(feature_systemd_config_dir_path): + os.mkdir(feature_systemd_config_dir_path) + with open(feature_systemd_config_file_path, 'w') as feature_systemd_config_file_handler: + feature_systemd_config_file_handler.write(feature_systemd_config) - for name in feature_names: - dir_name = self.SYSTEMD_SERVICE_CONF_DIR.format(name) - auto_restart_conf = os.path.join(dir_name, 'auto_restart.conf') - if not os.path.exists(dir_name): - os.mkdir(dir_name) - with open(auto_restart_conf, 'w') as cfgfile: - cfgfile.write(service_conf) + syslog.syslog(syslog.LOG_INFO, "Feautre '{}' systemd config file related to auto-restart is updated!" + .format(feature_name)) try: + syslog.syslog(syslog.LOG_INFO, "Reloading systemd configuration files ...") run_cmd("sudo systemctl daemon-reload", raise_exception=True) + syslog.syslog(syslog.LOG_INFO, "Systemd configuration files are reloaded!") except Exception as err: - syslog.syslog(syslog.LOG_ERR, "Feature '{}' failed to configure auto_restart".format(feature.name)) - return + syslog.syslog(syslog.LOG_ERR, "Failed to reload systemd configuration files!") - def get_feature_attribute(self, feature): + def get_multiasic_feature_instances(self, feature): # Create feature name suffix depending feature is running in host or namespace or in both feature_names = ( ([feature.name] if feature.has_global_scope or not self.is_multi_npu else []) + @@ -330,7 +340,7 @@ class FeatureHandler(object): def enable_feature(self, feature): cmds = [] - feature_names, feature_suffixes = self.get_feature_attribute(feature) + feature_names, feature_suffixes = self.get_multiasic_feature_instances(feature) for feature_name in feature_names: # Check if it is already enabled, if yes skip the system call unit_file_state = self.get_systemd_unit_state("{}.{}".format(feature_name, feature_suffixes[-1])) @@ -352,7 +362,7 @@ class FeatureHandler(object): run_cmd(cmd, raise_exception=True) except Exception as err: syslog.syslog(syslog.LOG_ERR, "Feature '{}.{}' failed to be enabled and started" - .format(feature.name, feature_suffixes[-1])) + .format(feature.name, feature_suffixes[-1])) self.set_feature_state(feature, self.FEATURE_STATE_FAILED) return @@ -360,7 +370,7 @@ class FeatureHandler(object): def disable_feature(self, feature): cmds = [] - feature_names, feature_suffixes = self.get_feature_attribute(feature) + feature_names, feature_suffixes = self.get_multiasic_feature_instances(feature) for feature_name in feature_names: # Check if it is already disabled, if yes skip the system call unit_file_state = self.get_systemd_unit_state("{}.{}".format(feature_name, feature_suffixes[-1])) @@ -377,7 +387,7 @@ class FeatureHandler(object): run_cmd(cmd, raise_exception=True) except Exception as err: syslog.syslog(syslog.LOG_ERR, "Feature '{}.{}' failed to be stopped and disabled" - .format(feature.name, feature_suffixes[-1])) + .format(feature.name, feature_suffixes[-1])) self.set_feature_state(feature, self.FEATURE_STATE_FAILED) return @@ -1212,7 +1222,7 @@ class HostConfigDaemon: self.config_db.subscribe('KDUMP', make_callback(self.kdump_handler)) # Handle FEATURE updates before other tables - self.config_db.subscribe('FEATURE', make_callback(self.feature_handler.handle)) + self.config_db.subscribe('FEATURE', make_callback(self.feature_handler.handler)) # Handle AAA, TACACS and RADIUS related tables self.config_db.subscribe('AAA', make_callback(self.aaa_handler)) self.config_db.subscribe('TACPLUS', make_callback(self.tacacs_global_handler)) diff --git a/src/sonic-host-services/tests/hostcfgd/hostcfgd_test.py b/src/sonic-host-services/tests/hostcfgd/hostcfgd_test.py index 28d4f6f8a724..786bd1c8f2a9 100644 --- a/src/sonic-host-services/tests/hostcfgd/hostcfgd_test.py +++ b/src/sonic-host-services/tests/hostcfgd/hostcfgd_test.py @@ -27,112 +27,162 @@ hostcfgd.Table = mock.Mock() -class TestHostcfgd(TestCase): +class TestFeatureHandler(TestCase): + """Test methods of `FeatureHandler` class. """ - Test hostcfd daemon - feature - """ - def __verify_table(self, table, feature_state_table, expected_table): + def checks_config_table(self, feature_table, expected_table): + """Compares `FEATURE` table in `CONFIG_DB` with expected output table. + + Args: + feature_table: A dictionary indicates current `FEATURE` table in `CONFIG_DB`. + expected_table A dictionary indicates the expected `FEATURE` table in `CONFIG_DB`. + + Returns: + Returns True if `FEATURE` table in `CONFIG_DB` was not modified unexpectedly; + otherwise, returns False. """ - verify config db tables + ddiff = DeepDiff(feature_table, expected_table, ignore_order=True) + + return True if not ddiff else False - Compares Config DB table (FEATURE) with expected output table. - Verifies that State DB table (FEATURE) is updated. + def checks_systemd_config_file(self, feature_table): + """Checks whether the systemd configuration file of each feature was created or not + and whether the `Restart=` field in the file is set correctly or not. - Args: - table(dict): Current Config Db table - feature_state_table(Mock): Mocked State DB FEATURE table - expected_table(dict): Expected Config Db table + Args: + feature_table: A dictionary indicates `Feature` table in `CONFIG_DB`. - Returns: - None + Returns: Boolean value indicates whether test passed or not. """ - ddiff = DeepDiff(table, expected_table, ignore_order=True) - print('DIFF:', ddiff) - - def get_state(cfg_state): - """ Translates CONFIG DB state field into STATE DB state field """ - if cfg_state == 'always_disabled': - return 'disabled' - elif cfg_state == 'always_enabled': - return 'enabled' - else: - return cfg_state - feature_state_table.set.assert_has_calls([ - mock.call(feature, [('state', get_state(table[feature]['state']))]) for feature in table - ]) - return True if not ddiff else False + truth_table = {'enabled': 'always', + 'disabled': 'no'} + + systemd_config_file_path = os.path.join(hostcfgd.FeatureHandler.SYSTEMD_SERVICE_CONF_DIR, + 'auto_restart.conf') + + for feature_name in feature_table: + auto_restart_status = feature_table[feature_name].get('auto_restart', 'disabled') + if "enabled" in auto_restart_status: + auto_restart_status = "enabled" + elif "disabled" in auto_restart_status: + auto_restart_status = "disabled" - def __verify_fs(self, table): + feature_systemd_config_file_path = systemd_config_file_path.format(feature_name) + is_config_file_existing = os.path.exists(feature_systemd_config_file_path) + assert is_config_file_existing, "Systemd configuration file of feature '{}' does not exist!".format(feature_name) + + with open(feature_systemd_config_file_path) as systemd_config_file: + status = systemd_config_file.read().strip() + assert status == '[Service]\nRestart={}'.format(truth_table[auto_restart_status]) + + def get_state_db_set_calls(self, feature_table): + """Returns a Mock call objects which recorded the `set` calls to `FEATURE` table in `STATE_DB`. + + Args: + feature_table: A dictionary indicates `FEATURE` table in `CONFIG_DB`. + + Returns: + set_call_list: A list indicates Mock call objects. """ - verify filesystem changes made by hostcfgd. + set_call_list = [] + + for feature_name in feature_table.keys(): + feature_state = "" + if "enabled" in feature_table[feature_name]["state"]: + feature_state = "enabled" + elif "disabled" in feature_table[feature_name]["state"]: + feature_state = "disabled" + else: + feature_state = feature_table[feature_name]["state"] + + set_call_list.append(mock.call(feature_name, [("state", feature_state)])) - Checks whether systemd override configuration files - were generated and Restart= for systemd unit is set - correctly + return set_call_list + + @parameterized.expand(HOSTCFGD_TEST_VECTOR) + @patchfs + def test_sync_state_field(self, test_scenario_name, config_data, fs): + """Tests the method `sync_state_field(...)` of `FeatureHandler` class. - Args: - table(dict): Current Config Db table + Args: + test_secnario_name: A string indicates different testing scenario. + config_data: A dictionary contains initial `CONFIG_DB` tables and expected results. - Returns: Boolean wether test passed. + Returns: + Boolean value indicates whether test will pass or not. """ + # add real path of sesscommon for database_config.json + fs.add_real_paths(swsscommon_package.__path__) + fs.create_dir(hostcfgd.FeatureHandler.SYSTEMD_SYSTEM_DIR) - exp_dict = { - 'enabled': 'always', - 'disabled': 'no', - } - auto_restart_conf = os.path.join(hostcfgd.FeatureHandler.SYSTEMD_SERVICE_CONF_DIR, 'auto_restart.conf') + MockConfigDb.set_config_db(config_data['config_db']) + feature_state_table_mock = mock.Mock() + with mock.patch('hostcfgd.subprocess') as mocked_subprocess: + popen_mock = mock.Mock() + attrs = config_data['popen_attributes'] + popen_mock.configure_mock(**attrs) + mocked_subprocess.Popen.return_value = popen_mock - for feature in table: - auto_restart = table[feature].get('auto_restart', 'disabled') - with open(auto_restart_conf.format(feature)) as conf: - conf = conf.read().strip() - assert conf == '[Service]\nRestart={}'.format(exp_dict[auto_restart]) + device_config = {} + device_config['DEVICE_METADATA'] = MockConfigDb.CONFIG_DB['DEVICE_METADATA'] + feature_handler = hostcfgd.FeatureHandler(MockConfigDb(), feature_state_table_mock, device_config) + + feature_table = MockConfigDb.CONFIG_DB['FEATURE'] + feature_handler.sync_state_field(feature_table) + + is_any_difference = self.checks_config_table(MockConfigDb.get_config_db()['FEATURE'], + config_data['expected_config_db']['FEATURE']) + assert is_any_difference, "'FEATURE' table in 'CONFIG_DB' is modified unexpectedly!" + + feature_table_state_db_calls = self.get_state_db_set_calls(feature_table) + self.checks_systemd_config_file(config_data['config_db']['FEATURE']) + mocked_subprocess.check_call.assert_has_calls(config_data['enable_feature_subprocess_calls'], + any_order=True) + mocked_subprocess.check_call.assert_has_calls(config_data['daemon_reload_subprocess_call'], + any_order=True) + feature_state_table_mock.set.assert_has_calls(feature_table_state_db_calls) + self.checks_systemd_config_file(config_data['config_db']['FEATURE']) @parameterized.expand(HOSTCFGD_TEST_VECTOR) @patchfs - def test_hostcfgd_feature_handler(self, test_name, test_data, fs): - """ - Test feature config capability in the hostcfd + def test_handler(self, test_scenario_name, config_data, fs): + """Tests the method `handle(...)` of `FeatureHandler` class. - Args: - test_name(str): test name - test_data(dict): test data which contains initial Config Db tables, and expected results + Args: + test_secnario_name: A string indicates different testing scenario. + config_data: A dictionary contains initial `CONFIG_DB` tables and expected results. - Returns: - None + Returns: + Boolean value indicates whether test will pass or not. """ - fs.add_real_paths(swsscommon_package.__path__) # add real path of swsscommon for database_config.json + # add real path of sesscommon for database_config.json + fs.add_real_paths(swsscommon_package.__path__) fs.create_dir(hostcfgd.FeatureHandler.SYSTEMD_SYSTEM_DIR) - MockConfigDb.set_config_db(test_data['config_db']) + + MockConfigDb.set_config_db(config_data['config_db']) feature_state_table_mock = mock.Mock() with mock.patch('hostcfgd.subprocess') as mocked_subprocess: popen_mock = mock.Mock() - attrs = test_data['popen_attributes'] + attrs = config_data['popen_attributes'] popen_mock.configure_mock(**attrs) mocked_subprocess.Popen.return_value = popen_mock - # Initialize Feature Handler device_config = {} device_config['DEVICE_METADATA'] = MockConfigDb.CONFIG_DB['DEVICE_METADATA'] feature_handler = hostcfgd.FeatureHandler(MockConfigDb(), feature_state_table_mock, device_config) - # sync the state field and Handle Feature Updates - features = MockConfigDb.CONFIG_DB['FEATURE'] - feature_handler.sync_state_field(features) - for key, fvs in features.items(): - feature_handler.handle(key, 'SET', fvs) - - # Verify if the updates are properly updated - assert self.__verify_table( - MockConfigDb.get_config_db()['FEATURE'], - feature_state_table_mock, - test_data['expected_config_db']['FEATURE'] - ), 'Test failed for test data: {0}'.format(test_data) - mocked_subprocess.check_call.assert_has_calls(test_data['expected_subprocess_calls'], any_order=True) - - self.__verify_fs(test_data['config_db']['FEATURE']) + feature_table = MockConfigDb.CONFIG_DB['FEATURE'] + + for feature_name, feature_config in feature_table.items(): + feature_handler.handler(feature_name, 'SET', feature_config) + + self.checks_systemd_config_file(config_data['config_db']['FEATURE']) + mocked_subprocess.check_call.assert_has_calls(config_data['enable_feature_subprocess_calls'], + any_order=True) + mocked_subprocess.check_call.assert_has_calls(config_data['daemon_reload_subprocess_call'], + any_order=True) def test_feature_config_parsing(self): swss_feature = hostcfgd.Feature('swss', { diff --git a/src/sonic-host-services/tests/hostcfgd/test_vectors.py b/src/sonic-host-services/tests/hostcfgd/test_vectors.py index 28ffe89d841c..43754252c0e3 100644 --- a/src/sonic-host-services/tests/hostcfgd/test_vectors.py +++ b/src/sonic-host-services/tests/hostcfgd/test_vectors.py @@ -41,7 +41,7 @@ "state": "{% if 'subtype' in DEVICE_METADATA['localhost'] and DEVICE_METADATA['localhost']['subtype'] == 'DualToR' %}enabled{% else %}always_disabled{% endif %}" }, "telemetry": { - "auto_restart": "disabled", + "auto_restart": "enabled", "has_global_scope": "True", "has_per_asic_scope": "False", "has_timer": "True", @@ -73,7 +73,7 @@ "state": "enabled" }, "telemetry": { - "auto_restart": "disabled", + "auto_restart": "enabled", "has_global_scope": "True", "has_per_asic_scope": "False", "has_timer": "True", @@ -84,7 +84,7 @@ }, }, }, - "expected_subprocess_calls": [ + "enable_feature_subprocess_calls": [ call("sudo systemctl unmask dhcp_relay.service", shell=True), call("sudo systemctl enable dhcp_relay.service", shell=True), call("sudo systemctl start dhcp_relay.service", shell=True), @@ -96,6 +96,9 @@ call("sudo systemctl enable telemetry.timer", shell=True), call("sudo systemctl start telemetry.timer", shell=True), ], + "daemon_reload_subprocess_call": [ + call("sudo systemctl daemon-reload", shell=True), + ], "popen_attributes": { 'communicate.return_value': ('output', 'error') }, @@ -198,7 +201,7 @@ }, }, }, - "expected_subprocess_calls": [ + "enable_feature_subprocess_calls": [ call("sudo systemctl stop mux.service", shell=True), call("sudo systemctl disable mux.service", shell=True), call("sudo systemctl mask mux.service", shell=True), @@ -210,6 +213,9 @@ call("sudo systemctl enable sflow.service", shell=True), call("sudo systemctl start sflow.service", shell=True), ], + "daemon_reload_subprocess_call": [ + call("sudo systemctl daemon-reload", shell=True), + ], "popen_attributes": { 'communicate.return_value': ('output', 'error') }, @@ -294,7 +300,7 @@ }, }, }, - "expected_subprocess_calls": [ + "enable_feature_subprocess_calls": [ call("sudo systemctl stop mux.service", shell=True), call("sudo systemctl disable mux.service", shell=True), call("sudo systemctl mask mux.service", shell=True), @@ -303,6 +309,9 @@ call("sudo systemctl enable telemetry.timer", shell=True), call("sudo systemctl start telemetry.timer", shell=True), ], + "daemon_reload_subprocess_call": [ + call("sudo systemctl daemon-reload", shell=True), + ], "popen_attributes": { 'communicate.return_value': ('output', 'error') }, @@ -387,7 +396,7 @@ }, }, }, - "expected_subprocess_calls": [ + "enable_feature_subprocess_calls": [ call("sudo systemctl unmask dhcp_relay.service", shell=True), call("sudo systemctl enable dhcp_relay.service", shell=True), call("sudo systemctl start dhcp_relay.service", shell=True), @@ -399,6 +408,9 @@ call("sudo systemctl enable telemetry.timer", shell=True), call("sudo systemctl start telemetry.timer", shell=True), ], + "daemon_reload_subprocess_call": [ + call("sudo systemctl daemon-reload", shell=True), + ], "popen_attributes": { 'communicate.return_value': ('output', 'error') }, @@ -484,7 +496,9 @@ }, }, }, - "expected_subprocess_calls": [ + "enable_feature_subprocess_calls": [], + "daemon_reload_subprocess_call": [ + call("sudo systemctl daemon-reload", shell=True), ], "popen_attributes": { 'communicate.return_value': ('enabled', 'error')