From 6de91af5b1944db33aa8e34b30349b11af0e44bf Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Mon, 6 Dec 2021 08:31:19 -0800 Subject: [PATCH] [Auto-Techsupport] Issues related to Multiple Cores crashing handled (#1948) #### What I did **Issues seen when multiple cores are crashed in very quick succession:** 1) The **rate_limit_interval** is not honored. Because, i previously was finding out the last created tech-support using the glob pattern `sonic_dump_*tar*`, which will not include the dumps which are being currently run. These existing dump will not have .tar.gz extension. Thus, modified the `get_ts_dumps` to search based on the TS_ROOT i.e `sonic_dump_*` 2) **show auto-tech support history** is not showing all the created dumps. I've previously used to take the diff of tech support dumps before and after running the invocation and used to assign the diff as the corresponding techsupport for this core. This approach is prone to race condition as we can have multiple dumps in the diff found in the interval. Avoided this by parsing the stdout returned by `show techsupport` invocation #### How to verify it 1) Unit Tests 2) Generate core-dumps in very quick succession. Use the default rate limit interval. Should only see one entry in tech-support history 3) Set global rate limit interval to 0. Generate cores in quick succession. Should see a few entries in the history. --- scripts/coredump_gen_handler.py | 33 +++++++----- scripts/generate_dump | 3 +- scripts/techsupport_cleanup.py | 4 +- tests/coredump_gen_handler_test.py | 57 +++++++++++++-------- utilities_common/auto_techsupport_helper.py | 12 +++-- 5 files changed, 67 insertions(+), 42 deletions(-) diff --git a/scripts/coredump_gen_handler.py b/scripts/coredump_gen_handler.py index 895c22146a6b..eaa972bdca1e 100644 --- a/scripts/coredump_gen_handler.py +++ b/scripts/coredump_gen_handler.py @@ -8,6 +8,7 @@ import time import argparse import syslog +import re from swsscommon.swsscommon import SonicV2Connector from utilities_common.auto_techsupport_helper import * @@ -54,7 +55,6 @@ def __init__(self, core_name, container_name, db): self.db = db self.proc_mp = {} self.core_ts_map = {} - self.curr_ts_list = [] def handle_core_dump_creation_event(self): file_path = os.path.join(CORE_DUMP_DIR, self.core_name) @@ -93,7 +93,7 @@ def handle_core_dump_creation_event(self): since_cfg = self.get_since_arg() new_file = self.invoke_ts_cmd(since_cfg) if new_file: - self.write_to_state_db(int(time.time()), new_file[0]) + self.write_to_state_db(int(time.time()), new_file) def write_to_state_db(self, timestamp, ts_dump): name = strip_ts_ext(ts_dump) @@ -111,26 +111,33 @@ def get_since_arg(self): return since_cfg return SINCE_DEFAULT + def parse_ts_dump_name(self, ts_stdout): + """ Figure out the ts_dump name from the techsupport stdout """ + matches = re.findall(TS_PTRN, ts_stdout) + if matches: + return matches[-1] + syslog.syslog(syslog.LOG_ERR, "stdout of the 'show techsupport' cmd doesn't have the dump name") + return "" + def invoke_ts_cmd(self, since_cfg): since_cfg = "'" + since_cfg + "'" - cmd = " ".join(["show", "techsupport", "--since", since_cfg]) - rc, _, stderr = subprocess_exec(["show", "techsupport", "--since", since_cfg], env=ENV_VAR) + cmd_opts = ["show", "techsupport", "--silent", "--since", since_cfg] + cmd = " ".join(cmd_opts) + rc, stdout, stderr = subprocess_exec(cmd_opts, env=ENV_VAR) if not rc: syslog.syslog(syslog.LOG_ERR, "show techsupport failed with exit code {}, stderr:{}".format(rc, stderr)) - new_list = get_ts_dumps(True) - diff = list(set(new_list).difference(set(self.curr_ts_list))) - self.curr_ts_list = new_list - if not diff: + new_dump = self.parse_ts_dump_name(stdout) + if not new_dump: syslog.syslog(syslog.LOG_ERR, "{} was run, but no techsupport dump is found".format(cmd)) else: - syslog.syslog(syslog.LOG_INFO, "{} is successful, {} is created".format(cmd, diff)) - return diff + syslog.syslog(syslog.LOG_INFO, "{} is successful, {} is created".format(cmd, new_dump)) + return new_dump def verify_rate_limit_intervals(self, global_cooloff, container_cooloff): """Verify both the global and per-proc rate_limit_intervals have passed""" - self.curr_ts_list = get_ts_dumps(True) - if global_cooloff and self.curr_ts_list: - last_ts_dump_creation = os.path.getmtime(self.curr_ts_list[-1]) + curr_ts_list = get_ts_dumps(True) + if global_cooloff and curr_ts_list: + last_ts_dump_creation = os.path.getmtime(curr_ts_list[-1]) if time.time() - last_ts_dump_creation < global_cooloff: msg = "Global rate_limit_interval period has not passed. Techsupport Invocation is skipped. Core: {}" syslog.syslog(syslog.LOG_INFO, msg.format(self.core_name)) diff --git a/scripts/generate_dump b/scripts/generate_dump index d5f3bb67311e..85bc930a5a2a 100755 --- a/scripts/generate_dump +++ b/scripts/generate_dump @@ -1339,8 +1339,7 @@ main() { fi # Invoke the TechSupport Cleanup Hook - setsid $(echo > /tmp/techsupport_cleanup.log; - python3 /usr/local/bin/techsupport_cleanup.py ${TARFILE} &>> /tmp/techsupport_cleanup.log) & + setsid python3 /usr/local/bin/techsupport_cleanup.py ${TARFILE} &> /tmp/techsupport_cleanup.log & echo ${TARFILE} diff --git a/scripts/techsupport_cleanup.py b/scripts/techsupport_cleanup.py index 53a10562e7a9..ba5a3c1798b4 100644 --- a/scripts/techsupport_cleanup.py +++ b/scripts/techsupport_cleanup.py @@ -22,7 +22,7 @@ def handle_techsupport_creation_event(dump_name, db): file_path = os.path.join(TS_DIR, dump_name) if not verify_recent_file_creation(file_path): return - _ , num_bytes = get_stats(os.path.join(TS_DIR, TS_PTRN)) + _ , num_bytes = get_stats(os.path.join(TS_DIR, TS_PTRN_GLOB)) if db.get(CFG_DB, AUTO_TS, CFG_STATE) != "enabled": msg = "techsupport_cleanup is disabled. No cleanup is performed. current size occupied : {}" @@ -40,7 +40,7 @@ def handle_techsupport_creation_event(dump_name, db): syslog.syslog(syslog.LOG_NOTICE, msg.format(pretty_size(num_bytes))) return - removed_files = cleanup_process(max_ts, TS_PTRN, TS_DIR) + removed_files = cleanup_process(max_ts, TS_PTRN_GLOB, TS_DIR) clean_state_db_entries(removed_files, db) diff --git a/tests/coredump_gen_handler_test.py b/tests/coredump_gen_handler_test.py index c742f09d06ac..10a470b6b020 100644 --- a/tests/coredump_gen_handler_test.py +++ b/tests/coredump_gen_handler_test.py @@ -12,6 +12,11 @@ sys.path.append("scripts") import coredump_gen_handler as cdump_mod +AUTO_TS_STDOUT=""" +Techsupport is running with silent option. This command might take a long time. +The SAI dump is generated to /tmp/saisdkdump/sai_sdk_dump_11_22_2021_11_07_PM +/tmp/saisdkdump +""" def set_auto_ts_cfg(redis_mock, state="disabled", rate_limit_interval="0", @@ -74,12 +79,13 @@ def test_invoc_ts_state_db_update(self): populate_state_db(redis_mock) with Patcher() as patcher: def mock_cmd(cmd, env): + ts_dump = "/var/dump/sonic_dump_random3.tar.gz" cmd_str = " ".join(cmd) if "show techsupport" in cmd_str: - patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz") + patcher.fs.create_file(ts_dump) else: return 1, "", "Command Not Found" - return 0, "", "" + return 0, AUTO_TS_STDOUT + ts_dump, "" cdump_mod.subprocess_exec = mock_cmd patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz") patcher.fs.create_file("/var/dump/sonic_dump_random2.tar.gz") @@ -105,12 +111,13 @@ def test_global_rate_limit_interval(self): populate_state_db(redis_mock) with Patcher() as patcher: def mock_cmd(cmd, env): + ts_dump = "/var/dump/sonic_dump_random3.tar.gz" cmd_str = " ".join(cmd) if "show techsupport" in cmd_str: - patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz") + patcher.fs.create_file(ts_dump) else: return 1, "", "Command Not Found" - return 0, "", "" + return 0, AUTO_TS_STDOUT + ts_dump, "" cdump_mod.subprocess_exec = mock_cmd patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz") patcher.fs.create_file("/var/dump/sonic_dump_random2.tar.gz") @@ -138,12 +145,13 @@ def test_per_container_rate_limit_interval(self): "orchagent;{};swss".format(int(time.time()))}) with Patcher() as patcher: def mock_cmd(cmd, env): + ts_dump = "/var/dump/sonic_dump_random3.tar.gz" cmd_str = " ".join(cmd) if "show techsupport" in cmd_str: - patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz") + patcher.fs.create_file(ts_dump) else: return 1, "", "Command Not Found" - return 0, "", "" + return 0, AUTO_TS_STDOUT + ts_dump, "" cdump_mod.subprocess_exec = mock_cmd patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz") patcher.fs.create_file("/var/core/orchagent.12345.123.core.gz") @@ -167,12 +175,13 @@ def test_invoc_ts_after_rate_limit_interval(self): "orchagent;{};swss".format(int(time.time()))}) with Patcher() as patcher: def mock_cmd(cmd, env): + ts_dump = "/var/dump/sonic_dump_random3.tar.gz" cmd_str = " ".join(cmd) if "show techsupport" in cmd_str: - patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz") + patcher.fs.create_file(ts_dump) else: return 1, "", "Command Not Found" - return 0, "", "" + return 0, AUTO_TS_STDOUT + ts_dump, "" cdump_mod.subprocess_exec = mock_cmd patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz") patcher.fs.create_file("/var/dump/sonic_dump_random2.tar.gz") @@ -197,12 +206,13 @@ def test_core_dump_with_invalid_container_name(self): populate_state_db(redis_mock, {}) with Patcher() as patcher: def mock_cmd(cmd, env): + ts_dump = "/var/dump/sonic_dump_random3.tar.gz" cmd_str = " ".join(cmd) if "show techsupport" in cmd_str: - patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz") + patcher.fs.create_file(ts_dump) else: return 1, "", "Command Not Found" - return 0, "", "" + return 0, AUTO_TS_STDOUT + ts_dump, "" cdump_mod.subprocess_exec = mock_cmd patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz") patcher.fs.create_file("/var/core/snmpd.12345.123.core.gz") @@ -225,12 +235,13 @@ def test_feature_table_not_set(self): populate_state_db(redis_mock, {}) with Patcher() as patcher: def mock_cmd(cmd, env): + ts_dump = "/var/dump/sonic_dump_random3.tar.gz" cmd_str = " ".join(cmd) if "show techsupport" in cmd_str: - patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz") + patcher.fs.create_file(ts_dump) else: return 1, "", "Command Not Found" - return 0, "", "" + return 0, AUTO_TS_STDOUT + ts_dump, "" cdump_mod.subprocess_exec = mock_cmd patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz") patcher.fs.create_file("/var/core/python3.12345.123.core.gz") @@ -251,10 +262,11 @@ def test_since_argument(self): populate_state_db(redis_mock) with Patcher() as patcher: def mock_cmd(cmd, env): + ts_dump = "/var/dump/sonic_dump_random3.tar.gz" cmd_str = " ".join(cmd) - if "show techsupport --since '4 days ago'" in cmd_str: - patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz") - return 0, "", "" + if "--since '4 days ago'" in cmd_str: + patcher.fs.create_file(ts_dump) + return 0, AUTO_TS_STDOUT + ts_dump, "" elif "date --date='4 days ago'" in cmd_str: return 0, "", "" else: @@ -284,12 +296,13 @@ def test_masic_core_dump(self): populate_state_db(redis_mock) with Patcher() as patcher: def mock_cmd(cmd, env): + ts_dump = "/var/dump/sonic_dump_random3.tar.gz" cmd_str = " ".join(cmd) if "show techsupport" in cmd_str: - patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz") + patcher.fs.create_file(ts_dump) else: return 1, "", "Command Not Found" - return 0, "", "" + return 0, AUTO_TS_STDOUT + ts_dump, "" cdump_mod.subprocess_exec = mock_cmd patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz") patcher.fs.create_file("/var/dump/sonic_dump_random2.tar.gz") @@ -315,10 +328,12 @@ def test_invalid_since_argument(self): populate_state_db(redis_mock) with Patcher() as patcher: def mock_cmd(cmd, env): + ts_dump = "/var/dump/sonic_dump_random3.tar.gz" cmd_str = " ".join(cmd) - if "show techsupport --since '2 days ago'" in cmd_str: - patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz") - return 0, "", "" + if "--since '2 days ago'" in cmd_str: + patcher.fs.create_file(ts_dump) + print(AUTO_TS_STDOUT + ts_dump) + return 0, AUTO_TS_STDOUT + ts_dump, "" elif "date --date='whatever'" in cmd_str: return 1, "", "Invalid Date Format" else: @@ -370,7 +385,7 @@ def mock_cmd(cmd, env): cmd_str = " ".join(cmd) if "show techsupport" in cmd_str: patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz") - return 0, "", "" + return 0, AUTO_TS_STDOUT + ts_dump, "" patcher.fs.set_disk_usage(2000, path="/var/core/") patcher.fs.create_file("/var/core/orchagent.12345.123.core.gz", st_size=25) patcher.fs.create_file("/var/core/lldpmgrd.12345.22.core.gz", st_size=25) diff --git a/utilities_common/auto_techsupport_helper.py b/utilities_common/auto_techsupport_helper.py index b94b7828974b..0fba65652112 100644 --- a/utilities_common/auto_techsupport_helper.py +++ b/utilities_common/auto_techsupport_helper.py @@ -12,7 +12,7 @@ "CFG_DB", "AUTO_TS", "CFG_STATE", "CFG_MAX_TS", "COOLOFF", "CFG_CORE_USAGE", "CFG_SINCE", "FEATURE", "STATE_DB", "TS_MAP", "CORE_DUMP", "TIMESTAMP", "CONTAINER", - "TIME_BUF", "SINCE_DEFAULT" + "TIME_BUF", "SINCE_DEFAULT", "TS_PTRN_GLOB" ] + [ # Methods "verify_recent_file_creation", "get_ts_dumps", @@ -30,7 +30,9 @@ CORE_DUMP_PTRN = "*.core.gz" TS_DIR = "/var/dump" -TS_PTRN = "sonic_dump_*.tar*" +TS_ROOT = "sonic_dump_*" +TS_PTRN = "sonic_dump_.*tar.*" # Regex Exp +TS_PTRN_GLOB = "sonic_dump_*tar*" # Glob Exp # CONFIG DB Attributes CFG_DB = "CONFIG_DB" @@ -78,8 +80,10 @@ def strip_ts_ext(ts_path): def get_ts_dumps(full_path=False): - """ Get the list of TS dumps in the TS_DIR, sorted by the creation time """ - curr_list = glob.glob(os.path.join(TS_DIR, TS_PTRN)) + """ + Get the list of TS dumps in the TS_DIR, sorted by the creation time + """ + curr_list = glob.glob(os.path.join(TS_DIR, TS_ROOT)) curr_list.sort(key=os.path.getmtime) if full_path: return curr_list