From aa81b978a6c65425edb0eeff7b57bc6c3dfefd8a Mon Sep 17 00:00:00 2001 From: Stepan Blyshchak <38952541+stepanblyschak@users.noreply.github.com> Date: Sat, 14 May 2022 15:31:33 +0300 Subject: [PATCH] [auto-ts] add memory check (#2116) - What I did Implemented memory threashold check in auto techsupport feature according to sonic-net/SONiC#939. - How I did it Added two scripts. The check script and the handler script. Few modifications made in auto tech implementation. UT added. - How to verify it Run the action script and the handler script on the switch. Run UT. Signed-off-by: Stepan Blyschak --- config/plugins/auto_techsupport.py | 60 ++++- scripts/coredump_gen_handler.py | 117 +-------- scripts/memory_threshold_check.py | 243 +++++++++++++++++++ scripts/memory_threshold_check_handler.py | 43 ++++ setup.py | 2 + show/plugins/auto_techsupport.py | 20 +- tests/coredump_gen_handler_test.py | 27 ++- tests/memory_check_handler_test.py | 28 +++ tests/memory_threshold_check/config_db.json | 12 + tests/memory_threshold_check/state_db.json | 10 + tests/memory_threshold_check/state_db_2.json | 14 ++ tests/memory_threshold_check/state_db_3.json | 14 ++ tests/memory_threshold_check_test.py | 71 ++++++ utilities_common/auto_techsupport_helper.py | 147 ++++++++++- 14 files changed, 673 insertions(+), 135 deletions(-) create mode 100644 scripts/memory_threshold_check.py create mode 100644 scripts/memory_threshold_check_handler.py create mode 100644 tests/memory_check_handler_test.py create mode 100644 tests/memory_threshold_check/config_db.json create mode 100644 tests/memory_threshold_check/state_db.json create mode 100644 tests/memory_threshold_check/state_db_2.json create mode 100644 tests/memory_threshold_check/state_db_3.json create mode 100644 tests/memory_threshold_check_test.py diff --git a/config/plugins/auto_techsupport.py b/config/plugins/auto_techsupport.py index c2960646d4..10bebbaaff 100644 --- a/config/plugins/auto_techsupport.py +++ b/config/plugins/auto_techsupport.py @@ -228,6 +228,50 @@ def AUTO_TECHSUPPORT_GLOBAL_max_core_limit(db, max_core_limit): exit_with_error(f"Error: {err}", fg="red") +@AUTO_TECHSUPPORT_GLOBAL.command(name="available-mem-threshold") +@click.argument( + "available-mem-threshold", + nargs=1, + required=True, +) +@clicommon.pass_db +def AUTO_TECHSUPPORT_GLOBAL_available_mem_threshold(db, available_mem_threshold): + """ Memory threshold; 0 to disable techsupport invocation on memory usage threshold crossing. + """ + + table = "AUTO_TECHSUPPORT" + key = "GLOBAL" + data = { + "available_mem_threshold": available_mem_threshold, + } + try: + update_entry_validated(db.cfgdb, table, key, data, create_if_not_exists=True) + except Exception as err: + exit_with_error(f"Error: {err}", fg="red") + + +@AUTO_TECHSUPPORT_GLOBAL.command(name="min-available-mem") +@click.argument( + "min-available-mem", + nargs=1, + required=True, +) +@clicommon.pass_db +def AUTO_TECHSUPPORT_GLOBAL_min_available_mem(db, min_available_mem): + """ Minimum free memory amount in Kb when techsupport will be executed. + """ + + table = "AUTO_TECHSUPPORT" + key = "GLOBAL" + data = { + "min_available_mem": min_available_mem, + } + try: + update_entry_validated(db.cfgdb, table, key, data, create_if_not_exists=True) + except Exception as err: + exit_with_error(f"Error: {err}", fg="red") + + @AUTO_TECHSUPPORT_GLOBAL.command(name="since") @click.argument( "since", @@ -271,8 +315,12 @@ def AUTO_TECHSUPPORT_FEATURE(): "--rate-limit-interval", help="Rate limit interval for the corresponding feature. Configure 0 to explicitly disable", ) +@click.option( + "--available-mem-threshold", + help="Memory threshold; 0 to disable techsupport invocation on memory usage threshold crossing.", +) @clicommon.pass_db -def AUTO_TECHSUPPORT_FEATURE_add(db, feature_name, state, rate_limit_interval): +def AUTO_TECHSUPPORT_FEATURE_add(db, feature_name, state, rate_limit_interval, available_mem_threshold): """ Add object in AUTO_TECHSUPPORT_FEATURE. """ table = "AUTO_TECHSUPPORT_FEATURE" @@ -282,6 +330,8 @@ def AUTO_TECHSUPPORT_FEATURE_add(db, feature_name, state, rate_limit_interval): data["state"] = state if rate_limit_interval is not None: data["rate_limit_interval"] = rate_limit_interval + if available_mem_threshold is not None: + data["available_mem_threshold"] = available_mem_threshold try: add_entry_validated(db.cfgdb, table, key, data) @@ -303,8 +353,12 @@ def AUTO_TECHSUPPORT_FEATURE_add(db, feature_name, state, rate_limit_interval): "--rate-limit-interval", help="Rate limit interval for the corresponding feature. Configure 0 to explicitly disable", ) +@click.option( + "--available-mem-threshold", + help="Memory threshold; 0 to disable techsupport invocation on memory usage threshold crossing.", +) @clicommon.pass_db -def AUTO_TECHSUPPORT_FEATURE_update(db, feature_name, state, rate_limit_interval): +def AUTO_TECHSUPPORT_FEATURE_update(db, feature_name, state, rate_limit_interval, available_mem_threshold): """ Add object in AUTO_TECHSUPPORT_FEATURE. """ table = "AUTO_TECHSUPPORT_FEATURE" @@ -314,6 +368,8 @@ def AUTO_TECHSUPPORT_FEATURE_update(db, feature_name, state, rate_limit_interval data["state"] = state if rate_limit_interval is not None: data["rate_limit_interval"] = rate_limit_interval + if available_mem_threshold is not None: + data["available_mem_threshold"] = available_mem_threshold try: update_entry_validated(db.cfgdb, table, key, data) diff --git a/scripts/coredump_gen_handler.py b/scripts/coredump_gen_handler.py index 03ba2de89e..55dd1ee644 100644 --- a/scripts/coredump_gen_handler.py +++ b/scripts/coredump_gen_handler.py @@ -5,18 +5,11 @@ For more info, refer to the Event Driven TechSupport & CoreDump Mgmt HLD """ import os -import time import argparse import syslog -import re from swsscommon.swsscommon import SonicV2Connector from utilities_common.auto_techsupport_helper import * -# Explicity Pass this to the subprocess invoking techsupport -ENV_VAR = os.environ -PATH_PREV = ENV_VAR["PATH"] if "PATH" in ENV_VAR else "" -ENV_VAR["PATH"] = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:" + PATH_PREV - def handle_coredump_cleanup(dump_name, db): _, num_bytes = get_stats(os.path.join(CORE_DUMP_DIR, CORE_DUMP_PTRN)) @@ -49,8 +42,6 @@ def __init__(self, core_name, container_name, db): self.core_name = core_name self.container = container_name self.db = db - self.proc_mp = {} - self.core_ts_map = {} def handle_core_dump_creation_event(self): if self.db.get(CFG_DB, AUTO_TS, CFG_STATE) != "enabled": @@ -66,112 +57,8 @@ def handle_core_dump_creation_event(self): syslog.syslog(syslog.LOG_NOTICE, msg.format(self.container, self.core_name)) return - global_cooloff = self.db.get(CFG_DB, AUTO_TS, COOLOFF) - container_cooloff = self.db.get(CFG_DB, FEATURE_KEY, COOLOFF) - - try: - global_cooloff = float(global_cooloff) - except ValueError: - global_cooloff = 0.0 - - try: - container_cooloff = float(container_cooloff) - except ValueError: - container_cooloff = 0.0 - - cooloff_passed = self.verify_rate_limit_intervals(global_cooloff, container_cooloff) - if cooloff_passed: - since_cfg = self.get_since_arg() - new_file = self.invoke_ts_cmd(since_cfg) - if new_file: - self.write_to_state_db(int(time.time()), new_file) - - def write_to_state_db(self, timestamp, ts_dump): - name = strip_ts_ext(ts_dump) - key = TS_MAP + "|" + name - self.db.set(STATE_DB, key, CORE_DUMP, self.core_name) - self.db.set(STATE_DB, key, TIMESTAMP, str(timestamp)) - self.db.set(STATE_DB, key, CONTAINER, self.container) - - def get_since_arg(self): - since_cfg = self.db.get(CFG_DB, AUTO_TS, CFG_SINCE) - if not since_cfg: - return SINCE_DEFAULT - rc, _, stderr = subprocess_exec(["date", "--date={}".format(since_cfg)], env=ENV_VAR) - if rc == 0: - return since_cfg - return SINCE_DEFAULT - - def parse_ts_dump_name(self, ts_stdout): - """ Figure out the ts_dump name from the techsupport stdout """ - matches = re.findall(TS_PTRN, ts_stdout) - if matches: - return matches[-1] - syslog.syslog(syslog.LOG_ERR, "stdout of the 'show techsupport' cmd doesn't have the dump name") - return "" - - def invoke_ts_cmd(self, since_cfg, num_retry=0): - cmd_opts = ["show", "techsupport", "--silent", "--since", since_cfg] - cmd = " ".join(cmd_opts) - rc, stdout, stderr = subprocess_exec(cmd_opts, env=ENV_VAR) - new_dump = "" - if rc == EXT_LOCKFAIL: - syslog.syslog(syslog.LOG_NOTICE, "Another instance of techsupport running, aborting this. stderr: {}".format(stderr)) - elif rc == EXT_RETRY: - if num_retry <= MAX_RETRY_LIMIT: - return self.invoke_ts_cmd(since_cfg, num_retry+1) - else: - syslog.syslog(syslog.LOG_ERR, "MAX_RETRY_LIMIT for show techsupport invocation exceeded, stderr: {}".format(stderr)) - elif rc != EXT_SUCCESS: - syslog.syslog(syslog.LOG_ERR, "show techsupport failed with exit code {}, stderr: {}".format(rc, stderr)) - else: # EXT_SUCCESS - new_dump = self.parse_ts_dump_name(stdout) # Parse the dump name - if not new_dump: - syslog.syslog(syslog.LOG_ERR, "{} was run, but no techsupport dump is found".format(cmd)) - else: - syslog.syslog(syslog.LOG_INFO, "{} is successful, {} is created".format(cmd, new_dump)) - return new_dump - - def verify_rate_limit_intervals(self, global_cooloff, container_cooloff): - """Verify both the global and per-proc rate_limit_intervals have passed""" - curr_ts_list = get_ts_dumps(True) - if global_cooloff and curr_ts_list: - last_ts_dump_creation = os.path.getmtime(curr_ts_list[-1]) - if time.time() - last_ts_dump_creation < global_cooloff: - msg = "Global rate_limit_interval period has not passed. Techsupport Invocation is skipped. Core: {}" - syslog.syslog(syslog.LOG_INFO, msg.format(self.core_name)) - return False - - self.parse_ts_map() - if container_cooloff and self.container in self.core_ts_map: - last_creation_time = self.core_ts_map[self.container][0][0] - if time.time() - last_creation_time < container_cooloff: - msg = "Per Container rate_limit_interval for {} has not passed. Techsupport Invocation is skipped. Core: {}" - syslog.syslog(syslog.LOG_INFO, msg.format(self.container, self.core_name)) - return False - return True - - def parse_ts_map(self): - """Create proc_name, ts_dump & creation_time map""" - ts_keys = self.db.keys(STATE_DB, TS_MAP+"*") - if not ts_keys: - return - for ts_key in ts_keys: - data = self.db.get_all(STATE_DB, ts_key) - if not data: - continue - container_name = data.get(CONTAINER, "") - creation_time = data.get(TIMESTAMP, "") - try: - creation_time = int(creation_time) - except Exception: - continue # if the creation time is invalid, skip the entry - ts_dump = ts_key.split("|")[-1] - if container_name and container_name not in self.core_ts_map: - self.core_ts_map[container_name] = [] - self.core_ts_map[container_name].append((int(creation_time), ts_dump)) - for container_name in self.core_ts_map: - self.core_ts_map[container_name].sort() + invoke_ts_command_rate_limited(self.db, EVENT_TYPE_CORE, {CORE_DUMP: self.core_name}, self.container) + def main(): parser = argparse.ArgumentParser(description='Auto Techsupport Invocation and CoreDump Mgmt Script') diff --git a/scripts/memory_threshold_check.py b/scripts/memory_threshold_check.py new file mode 100644 index 0000000000..ba54c7296a --- /dev/null +++ b/scripts/memory_threshold_check.py @@ -0,0 +1,243 @@ +#!/usr/bin/env python3 + +import sys + +import sonic_py_common.logger +from swsscommon.swsscommon import ConfigDBConnector, SonicV2Connector +from utilities_common.auto_techsupport_helper import STATE_DB + +# Exit codes +EXIT_SUCCESS = 0 # Success +EXIT_FAILURE = 1 # General failure occurred, no techsupport is invoked +EXIT_THRESHOLD_CROSSED = 2 # Memory threshold crossed, techsupport is invoked + +SYSLOG_IDENTIFIER = "memory_threshold_check" + +# Config DB tables +AUTO_TECHSUPPORT = "AUTO_TECHSUPPORT" +AUTO_TECHSUPPORT_FEATURE = "AUTO_TECHSUPPORT_FEATURE" + +# State DB docker stats table +DOCKER_STATS = "DOCKER_STATS" + +# (%) Default value for available memory left in the system +DEFAULT_MEMORY_AVAILABLE_THRESHOLD = 10 +# (MB) Default value for minimum available memory in the system to run techsupport +DEFAULT_MEMORY_AVAILABLE_MIN_THRESHOLD = 200 +# (%) Default value for available memory inside container +DEFAULT_MEMORY_AVAILABLE_FEATURE_THRESHOLD = 0 + +MB_TO_KB_MULTIPLIER = 1024 + +# Global logger instance +logger = sonic_py_common.logger.Logger(SYSLOG_IDENTIFIER) + + +class MemoryCheckerException(Exception): + """General memory checker exception""" + + pass + + +class MemoryStats: + """MemoryStats provides an interface to query memory statistics of the system and per feature.""" + + def __init__(self, state_db): + """Initialize MemoryStats + + Args: + state_db (swsscommon.DBConnector): state DB connector instance + """ + self.state_db = state_db + + def get_sys_memory_stats(self): + """Returns system memory statistic dictionary, reflects the /proc/meminfo + + Returns: + Dictionary of strings to integers where integer values + represent memory amount in Kb, e.g: + { + "MemTotal": 8104856, + "MemAvailable": 6035192, + ... + } + """ + with open("/proc/meminfo") as fd: + lines = fd.read().split("\n") + rows = [line.split() for line in lines] + + # e.g row is ('MemTotal:', '8104860', 'kB') + # key is the first element with removed remove last ':'. + # value is the second element converted to int. + def row_to_key(row): + return row[0][:-1] + + def row_to_value(row): + return int(row[1]) + + return {row_to_key(row): row_to_value(row) for row in rows if len(row) >= 2} + + def get_containers_memory_usage(self): + """Returns per container memory usage, reflects the DOCKER_STATS state DB table + + Returns: + Dictionary of strings to floats where floating point values + represent memory usage of the feature container which is carefully + calculated for us by the dockerd and published by procdockerstatsd, e.g: + + { + "swss": 1.5, + "teamd": 10.92, + ... + } + """ + result = {} + dockers = self.state_db.keys(STATE_DB, DOCKER_STATS + "|*") + stats = [ + stat for stat in [self.state_db.get_all(STATE_DB, key) for key in dockers] if stat is not None + ] + + for stat in stats: + try: + name = stat["NAME"] + mem_usage = float(stat["MEM%"]) + except KeyError as err: + continue + except ValueError as err: + logger.log_error(f'Failed to parse memory usage for "{stat}": {err}') + raise MemoryCheckerException(err) + + result[name] = mem_usage + + return result + + +class Config: + def __init__(self, cfg_db): + self.table = cfg_db.get_table(AUTO_TECHSUPPORT) + self.feature_table = cfg_db.get_table(AUTO_TECHSUPPORT_FEATURE) + + config = self.table.get("GLOBAL") + + self.memory_available_threshold = self.parse_value_from_db( + config, + "available_mem_threshold", + float, + DEFAULT_MEMORY_AVAILABLE_THRESHOLD, + ) + self.memory_available_min_threshold = self.parse_value_from_db( + config, + "min_available_mem", + float, + DEFAULT_MEMORY_AVAILABLE_MIN_THRESHOLD, + ) * MB_TO_KB_MULTIPLIER + + keys = self.feature_table.keys() + self.feature_config = {} + for key in keys: + config = self.feature_table.get(key) + + self.feature_config[key] = self.parse_value_from_db( + config, + "available_mem_threshold", + float, + DEFAULT_MEMORY_AVAILABLE_FEATURE_THRESHOLD, + ) + + @staticmethod + def parse_value_from_db(config, key, converter, default): + value = config.get(key) + if not value: + return default + try: + return converter(value) + except ValueError as err: + logger.log_error(f'Failed to parse {key} value "{value}": {err}') + raise MemoryCheckerException(err) + + +class MemoryChecker: + """Business logic of the memory checker""" + + def __init__(self, stats, config): + """Initialize MemoryChecker""" + self.stats = stats + self.config = config + + def run_check(self): + """Runs the checks and returns a tuple of check result boolean + and a container name or empty string if the check failed for the host. + + Returns: + (bool, str) + """ + # don't bother getting stats if available memory threshold is set to 0 + if self.config.memory_available_threshold: + memory_stats = self.stats.get_sys_memory_stats() + memory_free = memory_stats["MemAvailable"] + memory_total = memory_stats["MemTotal"] + memory_free_threshold = ( + memory_total * self.config.memory_available_threshold / 100 + ) + memory_min_free_threshold = self.config.memory_available_min_threshold + + # free memory amount is less then configured minimum required memory for + # running "show techsupport" + if memory_free <= memory_min_free_threshold: + logger.log_error( + f"Free memory {memory_free} is less then " + f"min free memory threshold {memory_min_free_threshold}" + ) + return (False, "") + + # free memory amount is less then configured threshold + if memory_free <= memory_free_threshold: + logger.log_error( + f"Free memory {memory_free} is less then " + f"free memory threshold {memory_free_threshold}" + ) + return (False, "") + + container_memory_usage = self.stats.get_containers_memory_usage() + for feature, memory_available_threshold in self.config.feature_config.items(): + for container, memory_usage in container_memory_usage.items(): + # startswith to handle multi asic instances + if not container.startswith(feature): + continue + + # free memory amount is less then configured threshold + if (100 - memory_usage) <= memory_available_threshold: + logger.log_error( + f"Available {100 - memory_usage} for {feature} is less " + f"then free memory threshold {memory_available_threshold}" + ) + return (False, feature) + + return (True, "") + + +def main(): + cfg_db = ConfigDBConnector(use_unix_socket_path=True) + cfg_db.connect() + state_db = SonicV2Connector(use_unix_socket_path=True) + state_db.connect(STATE_DB) + + config = Config(cfg_db) + mem_stats = MemoryStats(state_db) + mem_checker = MemoryChecker(mem_stats, config) + + try: + passed, name = mem_checker.run_check() + if not passed: + return EXIT_THRESHOLD_CROSSED, name + except MemoryCheckerException as err: + logger.log_error(f"Failure occurred {err}") + return EXIT_FAILURE, "" + + return EXIT_SUCCESS, "" + + +if __name__ == "__main__": + rc, component = main() + print(component) + sys.exit(rc) diff --git a/scripts/memory_threshold_check_handler.py b/scripts/memory_threshold_check_handler.py new file mode 100644 index 0000000000..1d8a82fb2f --- /dev/null +++ b/scripts/memory_threshold_check_handler.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python3 + +import sys +import os +import syslog +from swsscommon.swsscommon import SonicV2Connector +from utilities_common.auto_techsupport_helper import * + +# Exit codes +EXIT_SUCCESS = 0 # Success +EXIT_FAILURE = 1 # General failure occurred, no techsupport is invoked + +def main(): + output = os.environ.get("MONIT_DESCRIPTION") + syslog.openlog(logoption=syslog.LOG_PID) + db = SonicV2Connector(use_unix_socket_path=True) + db.connect(CFG_DB) + db.connect(STATE_DB) + if not output: + syslog.syslog( + syslog.LOG_ERR, + "Expected to get output from environment variable MONIT_DESCRIPTION" + ) + return EXIT_FAILURE + if "--" not in output: + syslog.syslog(syslog.LOG_ERR, "Unexpected value in environment variable MONIT_DESCRIPTION") + return EXIT_FAILURE + + monit_output = output.split("--")[1].strip() + # If the output of memory_threshold_check is empty + # that means that memory threshold check failed for the host. + # In this case monit inserts "no output" string in MONIT_DESCRIPTION + if monit_output == "no output": + container = None + else: + container = monit_output + invoke_ts_command_rate_limited(db, EVENT_TYPE_MEMORY, container) + + return EXIT_SUCCESS + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/setup.py b/setup.py index 7ee1eb8574..b3c34a01e8 100644 --- a/setup.py +++ b/setup.py @@ -146,6 +146,8 @@ 'scripts/centralize_database', 'scripts/null_route_helper', 'scripts/coredump_gen_handler.py', + 'scripts/memory_threshold_check.py', + 'scripts/memory_threshold_check_handler.py', 'scripts/techsupport_cleanup.py', 'scripts/check_db_integrity.py' ], diff --git a/show/plugins/auto_techsupport.py b/show/plugins/auto_techsupport.py index 9bcda1b04c..2afb0bf9c5 100644 --- a/show/plugins/auto_techsupport.py +++ b/show/plugins/auto_techsupport.py @@ -61,6 +61,8 @@ def AUTO_TECHSUPPORT_GLOBAL(db): "RATE LIMIT INTERVAL (sec)", "MAX TECHSUPPORT LIMIT (%)", "MAX CORE LIMIT (%)", + "AVAILABLE MEM THRESHOLD (%)", + "MIN AVAILABLE MEM (Kb)", "SINCE", ] @@ -84,6 +86,14 @@ def AUTO_TECHSUPPORT_GLOBAL(db): entry, {'name': 'max_core_limit', 'description': 'Max Limit in percentage for the cummulative size of core dumps. No cleanup is performed if the value isn\'t congiured or is 0.0', 'is-leaf-list': False, 'is-mandatory': False, 'group': ''} ), + format_attr_value( + entry, + {'name': 'available_mem_threshold', 'description': 'Memory threshold; 0 to disable techsupport invocation on memory usage threshold crossing.', 'is-leaf-list': False, 'is-mandatory': False, 'group': ''} + ), + format_attr_value( + entry, + {'name': 'min_available_mem', 'description': 'Minimum Free memory (in MB) that should be available for the techsupport execution to start', 'is-leaf-list': False, 'is-mandatory': False, 'group': ''} + ), format_attr_value( entry, {'name': 'since', 'description': "Only collect the logs & core-dumps generated since the time provided. A default value of '2 days ago' is used if this value is not set explicitly or a non-valid string is provided", 'is-leaf-list': False, 'is-mandatory': False, 'group': ''} @@ -98,14 +108,15 @@ def AUTO_TECHSUPPORT_GLOBAL(db): @clicommon.pass_db def AUTO_TECHSUPPORT_history(db): keys = db.db.keys("STATE_DB", "AUTO_TECHSUPPORT_DUMP_INFO|*") - header = ["TECHSUPPORT DUMP", "TRIGGERED BY", "CORE DUMP"] + header = ["TECHSUPPORT DUMP", "TRIGGERED BY", "EVENT TYPE", "CORE DUMP"] body = [] for key in keys: dump = key.split("|")[-1] fv_pairs = db.db.get_all("STATE_DB", key) core_dump = fv_pairs.get("core_dump", "") container = fv_pairs.get("container_name", "") - body.append([dump, container, core_dump]) + event_type = fv_pairs.get("event_type", "") + body.append([dump, container, event_type, core_dump]) click.echo(tabulate.tabulate(body, header, numalign="left")) @@ -120,6 +131,7 @@ def AUTO_TECHSUPPORT_FEATURE(db): "FEATURE NAME", "STATE", "RATE LIMIT INTERVAL (sec)", + "AVAILABLE MEM THRESHOLD (%)", ] body = [] @@ -139,6 +151,10 @@ def AUTO_TECHSUPPORT_FEATURE(db): entry, {'name': 'rate_limit_interval', 'description': 'Rate limit interval for the corresponding feature. Configure 0 to explicitly disable', 'is-leaf-list': False, 'is-mandatory': False, 'group': ''} ), + format_attr_value( + entry, + {'name': 'available_mem_threshold', 'description': 'Memory threshold; 0 to disable techsupport invocation on memory usage threshold crossing.', 'is-leaf-list': False, 'is-mandatory': False, 'group': ''} + ), ] body.append(row) click.echo(tabulate.tabulate(body, header, numalign="left")) diff --git a/tests/coredump_gen_handler_test.py b/tests/coredump_gen_handler_test.py index bf4ae8dc78..74d965fe9e 100644 --- a/tests/coredump_gen_handler_test.py +++ b/tests/coredump_gen_handler_test.py @@ -6,6 +6,7 @@ import signal from pyfakefs.fake_filesystem_unittest import Patcher from swsscommon import swsscommon +import utilities_common.auto_techsupport_helper as ts_helper from utilities_common.general import load_module_from_source from utilities_common.db import Db from utilities_common.auto_techsupport_helper import EXT_RETRY @@ -91,7 +92,7 @@ def mock_cmd(cmd, env): else: return 1, "", "Command Not Found" return 0, AUTO_TS_STDOUT + ts_dump, "" - cdump_mod.subprocess_exec = mock_cmd + ts_helper.subprocess_exec = mock_cmd patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz") patcher.fs.create_file("/var/dump/sonic_dump_random2.tar.gz") patcher.fs.create_file("/var/core/orchagent.12345.123.core.gz") @@ -123,7 +124,7 @@ def mock_cmd(cmd, env): else: return 1, "", "Command Not Found" return 0, AUTO_TS_STDOUT + ts_dump, "" - cdump_mod.subprocess_exec = mock_cmd + ts_helper.subprocess_exec = mock_cmd patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz") patcher.fs.create_file("/var/dump/sonic_dump_random2.tar.gz") patcher.fs.create_file("/var/core/orchagent.12345.123.core.gz") @@ -157,7 +158,7 @@ def mock_cmd(cmd, env): else: return 1, "", "Command Not Found" return 0, AUTO_TS_STDOUT + ts_dump, "" - cdump_mod.subprocess_exec = mock_cmd + ts_helper.subprocess_exec = mock_cmd patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz") patcher.fs.create_file("/var/core/orchagent.12345.123.core.gz") cls = cdump_mod.CriticalProcCoreDumpHandle("orchagent.12345.123.core.gz", "swss", redis_mock) @@ -187,7 +188,7 @@ def mock_cmd(cmd, env): else: return 1, "", "Command Not Found" return 0, AUTO_TS_STDOUT + ts_dump, "" - cdump_mod.subprocess_exec = mock_cmd + ts_helper.subprocess_exec = mock_cmd patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz") patcher.fs.create_file("/var/dump/sonic_dump_random2.tar.gz") patcher.fs.create_file("/var/core/orchagent.12345.123.core.gz") @@ -218,7 +219,7 @@ def mock_cmd(cmd, env): else: return 1, "", "Command Not Found" return 0, AUTO_TS_STDOUT + ts_dump, "" - cdump_mod.subprocess_exec = mock_cmd + ts_helper.subprocess_exec = mock_cmd patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz") patcher.fs.create_file("/var/core/snmpd.12345.123.core.gz") cls = cdump_mod.CriticalProcCoreDumpHandle("snmpd.12345.123.core.gz", "whatevver", redis_mock) @@ -247,7 +248,7 @@ def mock_cmd(cmd, env): else: return 1, "", "Command Not Found" return 0, AUTO_TS_STDOUT + ts_dump, "" - cdump_mod.subprocess_exec = mock_cmd + ts_helper.subprocess_exec = mock_cmd patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz") patcher.fs.create_file("/var/core/python3.12345.123.core.gz") cls = cdump_mod.CriticalProcCoreDumpHandle("python3.12345.123.core.gz", "snmp", redis_mock) @@ -269,14 +270,14 @@ def test_since_argument(self): def mock_cmd(cmd, env): ts_dump = "/var/dump/sonic_dump_random3.tar.gz" cmd_str = " ".join(cmd) - if "--since 4 days ago" in cmd_str: + if "--since '4 days ago'" in cmd_str: patcher.fs.create_file(ts_dump) return 0, AUTO_TS_STDOUT + ts_dump, "" elif "date --date=4 days ago" in cmd_str: return 0, "", "" else: return 1, "", "Invalid Command" - cdump_mod.subprocess_exec = mock_cmd + ts_helper.subprocess_exec = mock_cmd patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz") patcher.fs.create_file("/var/dump/sonic_dump_random2.tar.gz") patcher.fs.create_file("/var/core/orchagent.12345.123.core.gz") @@ -308,7 +309,7 @@ def mock_cmd(cmd, env): else: return 1, "", "Command Not Found" return 0, AUTO_TS_STDOUT + ts_dump, "" - cdump_mod.subprocess_exec = mock_cmd + ts_helper.subprocess_exec = mock_cmd patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz") patcher.fs.create_file("/var/dump/sonic_dump_random2.tar.gz") patcher.fs.create_file("/var/core/orchagent.12345.123.core.gz") @@ -335,7 +336,7 @@ def test_invalid_since_argument(self): def mock_cmd(cmd, env): ts_dump = "/var/dump/sonic_dump_random3.tar.gz" cmd_str = " ".join(cmd) - if "--since 2 days ago" in cmd_str: + if "--since '2 days ago'" in cmd_str: patcher.fs.create_file(ts_dump) print(AUTO_TS_STDOUT + ts_dump) return 0, AUTO_TS_STDOUT + ts_dump, "" @@ -343,7 +344,7 @@ def mock_cmd(cmd, env): return 1, "", "Invalid Date Format" else: return 1, "", "" - cdump_mod.subprocess_exec = mock_cmd + ts_helper.subprocess_exec = mock_cmd patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz") patcher.fs.create_file("/var/dump/sonic_dump_random2.tar.gz") patcher.fs.create_file("/var/core/orchagent.12345.123.core.gz") @@ -415,10 +416,10 @@ def test_max_retry_ts_failure(self): def mock_cmd(cmd, env): return EXT_RETRY, "", "" - cdump_mod.subprocess_exec = mock_cmd + ts_helper.subprocess_exec = mock_cmd patcher.fs.create_file("/var/core/orchagent.12345.123.core.gz") cls = cdump_mod.CriticalProcCoreDumpHandle("orchagent.12345.123.core.gz", "swss", redis_mock) - + signal.signal(signal.SIGALRM, signal_handler) signal.alarm(5) # 5 seconds try: diff --git a/tests/memory_check_handler_test.py b/tests/memory_check_handler_test.py new file mode 100644 index 0000000000..12cbbe8f28 --- /dev/null +++ b/tests/memory_check_handler_test.py @@ -0,0 +1,28 @@ +""" These tests check the memory_threshold_check_handler script monit description string handling while the rest of auto thechsupport is unit tested by coredump_gen_handler_test.py """ + +import sys +from unittest.mock import patch, ANY +from utilities_common.auto_techsupport_helper import EVENT_TYPE_MEMORY +from utilities_common.db import Db + +sys.path.append("scripts") +import memory_threshold_check_handler + + +@patch("os.environ.get", lambda var: "status code 2 -- swss") +def test_memory_threshold_check_handler_host(): + with patch('memory_threshold_check_handler.invoke_ts_command_rate_limited') as invoke_ts: + memory_threshold_check_handler.main() + invoke_ts.assert_called_once_with(ANY, EVENT_TYPE_MEMORY, 'swss') + +@patch("os.environ.get", lambda var: "status code 2 -- no output") +def test_memory_threshold_check_handler_host(): + with patch('memory_threshold_check_handler.invoke_ts_command_rate_limited') as invoke_ts: + memory_threshold_check_handler.main() + invoke_ts.assert_called_once_with(ANY, EVENT_TYPE_MEMORY, None) + +@patch("os.environ.get", lambda var: "foo bar") +def test_memory_threshold_check_handler_host(): + with patch('memory_threshold_check_handler.invoke_ts_command_rate_limited') as invoke_ts: + memory_threshold_check_handler.main() + invoke_ts.assert_not_called() diff --git a/tests/memory_threshold_check/config_db.json b/tests/memory_threshold_check/config_db.json new file mode 100644 index 0000000000..a270db5bc1 --- /dev/null +++ b/tests/memory_threshold_check/config_db.json @@ -0,0 +1,12 @@ +{ + "AUTO_TECHSUPPORT|GLOBAL": { + "available_mem_threshold": "10.0", + "min_available_mem": "200" + }, + "AUTO_TECHSUPPORT_FEATURE|swss": { + "available_mem_threshold": "50.0" + }, + "AUTO_TECHSUPPORT_FEATURE|telemetry": { + "available_mem_threshold": "50.0" + } +} diff --git a/tests/memory_threshold_check/state_db.json b/tests/memory_threshold_check/state_db.json new file mode 100644 index 0000000000..0cd8c34e13 --- /dev/null +++ b/tests/memory_threshold_check/state_db.json @@ -0,0 +1,10 @@ +{ + "DOCKER_STATS|b77647f643a2": { + "MEM%": "10", + "NAME": "swss1" + }, + "DOCKER_STATS|b77647f643a3": { + "MEM%": "10", + "NAME": "telemetry" + } +} diff --git a/tests/memory_threshold_check/state_db_2.json b/tests/memory_threshold_check/state_db_2.json new file mode 100644 index 0000000000..67f18ac89f --- /dev/null +++ b/tests/memory_threshold_check/state_db_2.json @@ -0,0 +1,14 @@ +{ + "DOCKER_STATS|b77647f643a1": { + "MEM%": "10.5", + "NAME": "swss1" + }, + "DOCKER_STATS|b77647f643a2": { + "MEM%": "10.1", + "NAME": "swss2" + }, + "DOCKER_STATS|b77647f643a3": { + "MEM%": "80.6", + "NAME": "telemetry" + } +} diff --git a/tests/memory_threshold_check/state_db_3.json b/tests/memory_threshold_check/state_db_3.json new file mode 100644 index 0000000000..0ded64771a --- /dev/null +++ b/tests/memory_threshold_check/state_db_3.json @@ -0,0 +1,14 @@ +{ + "DOCKER_STATS|b77647f643a1": { + "MEM%": "80.1", + "NAME": "swss1" + }, + "DOCKER_STATS|b77647f643a2": { + "MEM%": "10.1", + "NAME": "swss2" + }, + "DOCKER_STATS|b77647f643a3": { + "MEM%": "10.2", + "NAME": "telemetry" + } +} diff --git a/tests/memory_threshold_check_test.py b/tests/memory_threshold_check_test.py new file mode 100644 index 0000000000..b6b4b2fee8 --- /dev/null +++ b/tests/memory_threshold_check_test.py @@ -0,0 +1,71 @@ +import os +import sys +import pytest +from unittest import mock +from .mock_tables import dbconnector +from utilities_common.general import load_module_from_source + +test_path = os.path.dirname(os.path.abspath(__file__)) +modules_path = os.path.dirname(test_path) +scripts_path = os.path.join(modules_path, 'scripts') +sys.path.insert(0, scripts_path) + +memory_threshold_check_path = os.path.join(scripts_path, 'memory_threshold_check.py') +memory_threshold_check = load_module_from_source('memory_threshold_check.py', memory_threshold_check_path) + +@pytest.fixture() +def setup_dbs_regular_mem_usage(): + cfg_db = dbconnector.dedicated_dbs.get('CONFIG_DB') + state_db = dbconnector.dedicated_dbs.get('STATE_DB') + dbconnector.dedicated_dbs['CONFIG_DB'] = os.path.join(test_path, 'memory_threshold_check', 'config_db') + dbconnector.dedicated_dbs['STATE_DB'] = os.path.join(test_path, 'memory_threshold_check', 'state_db') + yield + dbconnector.dedicated_dbs['CONFIG_DB'] = cfg_db + dbconnector.dedicated_dbs['STATE_DB'] = state_db + + +@pytest.fixture() +def setup_dbs_telemetry_high_mem_usage(): + memory_threshold_check.MemoryStats.get_sys_memory_stats = mock.Mock(return_value={'MemAvailable': 10000000, 'MemTotal': 20000000}) + cfg_db = dbconnector.dedicated_dbs.get('CONFIG_DB') + state_db = dbconnector.dedicated_dbs.get('STATE_DB') + dbconnector.dedicated_dbs['CONFIG_DB'] = os.path.join(test_path, 'memory_threshold_check', 'config_db') + dbconnector.dedicated_dbs['STATE_DB'] = os.path.join(test_path, 'memory_threshold_check', 'state_db_2') + yield + dbconnector.dedicated_dbs['CONFIG_DB'] = cfg_db + dbconnector.dedicated_dbs['STATE_DB'] = state_db + + +@pytest.fixture() +def setup_dbs_swss_high_mem_usage(): + memory_threshold_check.MemoryStats.get_sys_memory_stats = mock.Mock(return_value={'MemAvailable': 10000000, 'MemTotal': 20000000}) + cfg_db = dbconnector.dedicated_dbs.get('CONFIG_DB') + state_db = dbconnector.dedicated_dbs.get('STATE_DB') + dbconnector.dedicated_dbs['CONFIG_DB'] = os.path.join(test_path, 'memory_threshold_check', 'config_db') + dbconnector.dedicated_dbs['STATE_DB'] = os.path.join(test_path, 'memory_threshold_check', 'state_db_3') + yield + dbconnector.dedicated_dbs['CONFIG_DB'] = cfg_db + dbconnector.dedicated_dbs['STATE_DB'] = state_db + + +def test_memory_check_host_not_crossed(setup_dbs_regular_mem_usage): + memory_threshold_check.MemoryStats.get_sys_memory_stats = mock.Mock(return_value={'MemAvailable': 1000000, 'MemTotal': 2000000}) + assert memory_threshold_check.main() == (memory_threshold_check.EXIT_SUCCESS, '') + + +def test_memory_check_host_less_then_min_required(setup_dbs_regular_mem_usage): + memory_threshold_check.MemoryStats.get_sys_memory_stats = mock.Mock(return_value={'MemAvailable': 1000, 'MemTotal': 2000000}) + assert memory_threshold_check.main() == (memory_threshold_check.EXIT_THRESHOLD_CROSSED, '') + + +def test_memory_check_host_threshold_crossed(setup_dbs_regular_mem_usage): + memory_threshold_check.MemoryStats.get_sys_memory_stats = mock.Mock(return_value={'MemAvailable': 2000000, 'MemTotal': 20000000}) + assert memory_threshold_check.main() == (memory_threshold_check.EXIT_THRESHOLD_CROSSED, '') + + +def test_memory_check_telemetry_threshold_crossed(setup_dbs_telemetry_high_mem_usage): + assert memory_threshold_check.main() == (memory_threshold_check.EXIT_THRESHOLD_CROSSED, 'telemetry') + + +def test_memory_check_swss_threshold_crossed(setup_dbs_swss_high_mem_usage): + assert memory_threshold_check.main() == (memory_threshold_check.EXIT_THRESHOLD_CROSSED, 'swss') diff --git a/utilities_common/auto_techsupport_helper.py b/utilities_common/auto_techsupport_helper.py index 4eaae933b0..b56693ac39 100644 --- a/utilities_common/auto_techsupport_helper.py +++ b/utilities_common/auto_techsupport_helper.py @@ -1,4 +1,5 @@ import os +import re import glob import time import subprocess @@ -13,7 +14,8 @@ "CFG_CORE_USAGE", "CFG_SINCE", "FEATURE", "STATE_DB", "TS_MAP", "CORE_DUMP", "TIMESTAMP", "CONTAINER", "TIME_BUF", "SINCE_DEFAULT", "TS_PTRN_GLOB", "EXT_LOCKFAIL", "EXT_RETRY", - "EXT_SUCCESS", "MAX_RETRY_LIMIT" + "EXT_SUCCESS", "MAX_RETRY_LIMIT", "EVENT_TYPE", "EVENT_TYPE_CORE", + "EVENT_TYPE_MEMORY" ] + [ # Methods "verify_recent_file_creation", "get_ts_dumps", @@ -22,7 +24,8 @@ "pretty_size", "cleanup_process", "subprocess_exec", - "trim_masic_suffix" + "trim_masic_suffix", + "invoke_ts_command_rate_limited", ] @@ -35,8 +38,9 @@ TS_PTRN = "sonic_dump_.*tar.*" # Regex Exp TS_PTRN_GLOB = "sonic_dump_*tar*" # Glob Exp -# CONFIG DB Attributes +# DBs identifiers CFG_DB = "CONFIG_DB" +STATE_DB = "STATE_DB" # AUTO_TECHSUPPORT|GLOBAL table attributes AUTO_TS = "AUTO_TECHSUPPORT|GLOBAL" @@ -57,10 +61,19 @@ CORE_DUMP = "core_dump" TIMESTAMP = "timestamp" CONTAINER = "container_name" +EVENT_TYPE = "event_type" + +EVENT_TYPE_CORE = "core" +EVENT_TYPE_MEMORY = "memory" TIME_BUF = 20 SINCE_DEFAULT = "2 days ago" +# Explicity Pass this to the subprocess invoking techsupport +ENV_VAR = os.environ +PATH_PREV = ENV_VAR["PATH"] if "PATH" in ENV_VAR else "" +ENV_VAR["PATH"] = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:" + PATH_PREV + # Techsupport Exit Codes EXT_LOCKFAIL = 2 EXT_RETRY = 4 @@ -192,3 +205,131 @@ def trim_masic_suffix(container_name): break index = index - 1 return "".join(arr) + +def get_since_arg(db): + """Get since configuration from AUTO_TECHSUPPORT table or default value""" + since_cfg = db.get(CFG_DB, AUTO_TS, CFG_SINCE) + if not since_cfg: + return SINCE_DEFAULT + rc, _, stderr = subprocess_exec(["date", "--date={}".format(since_cfg)], env=ENV_VAR) + if rc == 0: + return since_cfg + return SINCE_DEFAULT + + +def parse_ts_dump_name(ts_stdout): + """ Figure out the ts_dump name from the techsupport stdout """ + matches = re.findall(TS_PTRN, ts_stdout) + if matches: + return matches[-1] + syslog.syslog(syslog.LOG_ERR, "stdout of the 'show techsupport' cmd doesn't have the dump name") + return "" + + +def invoke_ts_cmd(db, num_retry=0): + """Invoke techsupport generation command""" + since_cfg = get_since_arg(db) + since_cfg = "'" + since_cfg + "'" + cmd_opts = ["show", "techsupport", "--silent", "--since", since_cfg] + cmd = " ".join(cmd_opts) + rc, stdout, stderr = subprocess_exec(cmd_opts, env=ENV_VAR) + new_dump = "" + if rc == EXT_LOCKFAIL: + syslog.syslog(syslog.LOG_NOTICE, "Another instance of techsupport running, aborting this. stderr: {}".format(stderr)) + elif rc == EXT_RETRY: + if num_retry <= MAX_RETRY_LIMIT: + return invoke_ts_cmd(db, num_retry+1) + else: + syslog.syslog(syslog.LOG_ERR, "MAX_RETRY_LIMIT for show techsupport invocation exceeded, stderr: {}".format(stderr)) + elif rc != EXT_SUCCESS: + syslog.syslog(syslog.LOG_ERR, "show techsupport failed with exit code {}, stderr: {}".format(rc, stderr)) + else: # EXT_SUCCESS + new_dump = parse_ts_dump_name(stdout) # Parse the dump name + if not new_dump: + syslog.syslog(syslog.LOG_ERR, "{} was run, but no techsupport dump is found".format(cmd)) + else: + syslog.syslog(syslog.LOG_INFO, "{} is successful, {} is created".format(cmd, new_dump)) + return new_dump + + +def get_ts_map(db): + """Create ts_dump & creation_time map""" + ts_map = {} + ts_keys = db.keys(STATE_DB, TS_MAP+"*") + if not ts_keys: + return + for ts_key in ts_keys: + data = db.get_all(STATE_DB, ts_key) + if not data: + continue + container_name = data.get(CONTAINER, "") + creation_time = data.get(TIMESTAMP, "") + try: + creation_time = int(creation_time) + except Exception: + continue # if the creation time is invalid, skip the entry + ts_dump = ts_key.split("|")[-1] + if container_name not in ts_map: + ts_map[container_name] = [] + ts_map[container_name].append((int(creation_time), ts_dump)) + for container_name in ts_map: + ts_map[container_name].sort() + return ts_map + + +def verify_rate_limit_intervals(db, global_cooloff, container_cooloff, container): + """Verify both the global and per-proc rate_limit_intervals have passed""" + curr_ts_list = get_ts_dumps(True) + if global_cooloff and curr_ts_list: + last_ts_dump_creation = os.path.getmtime(curr_ts_list[-1]) + if time.time() - last_ts_dump_creation < global_cooloff: + msg = "Global rate_limit_interval period has not passed. Techsupport Invocation is skipped" + syslog.syslog(msg) + return False + + ts_map = get_ts_map(db) + if container_cooloff and container in ts_map: + last_creation_time = ts_map[container][0][0] + if time.time() - last_creation_time < container_cooloff: + msg = "Per Container rate_limit_interval for {} has not passed. Techsupport Invocation is skipped" + syslog.syslog(msg.format(container)) + return False + return True + + +def write_to_state_db(db, timestamp, ts_dump, event_type, event_data, container=None): + name = strip_ts_ext(ts_dump) + key = TS_MAP + "|" + name + db.set(STATE_DB, key, TIMESTAMP, str(timestamp)) + db.set(STATE_DB, key, EVENT_TYPE, event_type) + for event_data_key, event_data_value in event_data.items(): + db.set(STATE_DB, key, event_data_key, event_data_value) + if container: + db.set(STATE_DB, key, CONTAINER, container) + + +def invoke_ts_command_rate_limited(db, event_type, event_data, container=None): + """Invoke techsupport generation command accounting the rate limit""" + global_cooloff = db.get(CFG_DB, AUTO_TS, COOLOFF) + if container: + container_cooloff = db.get( + CFG_DB, FEATURE.format(container), COOLOFF + ) + else: + container_cooloff = 0.0 + + try: + global_cooloff = float(global_cooloff) + except ValueError: + global_cooloff = 0.0 + + try: + container_cooloff = float(container_cooloff) + except ValueError: + container_cooloff = 0.0 + + cooloff_passed = verify_rate_limit_intervals(db, global_cooloff, container_cooloff, container) + if cooloff_passed: + new_file = invoke_ts_cmd(db) + if new_file: + write_to_state_db(db, int(time.time()), new_file, event_type, event_data, container)