From d73bcebb4764112cfa1a495c1eac92a2f20b213c Mon Sep 17 00:00:00 2001 From: Vivek Reddy Karri Date: Thu, 24 Mar 2022 19:51:03 +0000 Subject: [PATCH 1/6] Minor Setbacks Signed-off-by: Vivek Reddy Karri --- scripts/generate_dump | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/scripts/generate_dump b/scripts/generate_dump index ef06e929e1..c489a4e571 100755 --- a/scripts/generate_dump +++ b/scripts/generate_dump @@ -63,6 +63,10 @@ handle_exit() ECODE=$? echo "Removing lock. Exit: $ECODE" >&2 $RM $V -rf ${LOCKDIR} + # Echo the filename as the last statement if the generation suceeds + if [[ -f $TARFILE && ($ECODE == $EXT_SUCCESS || $ECODE == $RETURN_CODE) ]]; then + echo $TARFILE + fi } handle_signal() @@ -1494,7 +1498,6 @@ while getopts ":xnvhzas:t:r:d" opt; do ;; v) # echo commands about to be run to stderr - set -v V="-v" ;; n) @@ -1547,7 +1550,7 @@ fi ## Attempt Locking ## -if mkdir "${LOCKDIR}" &>/dev/null; then +if $MKDIR "${LOCKDIR}" &>/dev/null; then trap 'handle_exit' EXIT echo "$$" > "${PIDFILE}" # This handler will exit the script upon receiving these interrupts @@ -1555,6 +1558,9 @@ if mkdir "${LOCKDIR}" &>/dev/null; then trap 'handle_signal' SIGINT SIGHUP SIGQUIT SIGTERM echo "Lock succesfully accquired and installed signal handlers" # Proceed with the actual code + if [[ ! -z "${V}" ]]; then + set -v + fi main else # lock failed, check if the other PID is alive From 2f1d5453aa749b7e66b23b49a753b05695e2695d Mon Sep 17 00:00:00 2001 From: Vivek R Date: Thu, 24 Mar 2022 13:12:49 -0700 Subject: [PATCH 2/6] Update generate_dump --- scripts/generate_dump | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/generate_dump b/scripts/generate_dump index c489a4e571..0b69e9b051 100755 --- a/scripts/generate_dump +++ b/scripts/generate_dump @@ -1364,8 +1364,6 @@ main() { # Invoke the TechSupport Cleanup Hook setsid python3 /usr/local/bin/techsupport_cleanup.py ${TARFILE} &> /tmp/techsupport_cleanup.log & - - echo ${TARFILE} if ! $SAVE_STDERR then From 8ba42b18133233128816e14b8827f1818d57a08c Mon Sep 17 00:00:00 2001 From: Vivek Reddy Karri Date: Fri, 25 Mar 2022 23:14:51 +0000 Subject: [PATCH 3/6] auto-ts updated Signed-off-by: Vivek Reddy Karri --- scripts/coredump_gen_handler.py | 15 +++++++-- tests/coredump_gen_handler_test.py | 34 +++++++++++++++++++-- utilities_common/auto_techsupport_helper.py | 10 ++++-- 3 files changed, 52 insertions(+), 7 deletions(-) diff --git a/scripts/coredump_gen_handler.py b/scripts/coredump_gen_handler.py index 33d661e3da..d23f927ec0 100644 --- a/scripts/coredump_gen_handler.py +++ b/scripts/coredump_gen_handler.py @@ -110,13 +110,22 @@ def parse_ts_dump_name(self, ts_stdout): syslog.syslog(syslog.LOG_ERR, "stdout of the 'show techsupport' cmd doesn't have the dump name") return "" - def invoke_ts_cmd(self, since_cfg): - since_cfg = "'" + since_cfg + "'" + def invoke_ts_cmd(self, since_cfg, num_retry=0): cmd_opts = ["show", "techsupport", "--silent", "--since", since_cfg] cmd = " ".join(cmd_opts) rc, stdout, stderr = subprocess_exec(cmd_opts, env=ENV_VAR) - if rc: + if rc == EXT_LOCKFAIL: + syslog.syslog(syslog.LOG_NOTICE, "Another instance of techsupport running, aborting this. stderr: {}".format(stderr)) + return "" + elif rc == EXT_RETRY: + if num_retry <= MAX_RETRY_LIMIT: + print(num_retry) + return self.invoke_ts_cmd(since_cfg, num_retry+1) + else: + syslog.syslog(syslog.LOG_ERR, "MAX_RETRY_LIMIT for show techsupport invocation exceeded, stderr: {}".format(stderr)) + elif rc != EXT_SUCCESS: syslog.syslog(syslog.LOG_ERR, "show techsupport failed with exit code {}, stderr: {}".format(rc, stderr)) + # Parse the dump name new_dump = self.parse_ts_dump_name(stdout) if not new_dump: syslog.syslog(syslog.LOG_ERR, "{} was run, but no techsupport dump is found".format(cmd)) diff --git a/tests/coredump_gen_handler_test.py b/tests/coredump_gen_handler_test.py index e5270460eb..b228c4ca55 100644 --- a/tests/coredump_gen_handler_test.py +++ b/tests/coredump_gen_handler_test.py @@ -3,10 +3,12 @@ import sys import pyfakefs import unittest +import signal from pyfakefs.fake_filesystem_unittest import Patcher from swsscommon import swsscommon from utilities_common.general import load_module_from_source from utilities_common.db import Db +from utilities_common.auto_techsupport_helper import EXT_RETRY from .mock_tables import dbconnector sys.path.append("scripts") @@ -18,6 +20,9 @@ /tmp/saisdkdump """ +def signal_handler(signum, frame): + raise Exception("Timed out!") + def set_auto_ts_cfg(redis_mock, state="disabled", rate_limit_interval="0", max_core_size="0.0", @@ -264,7 +269,7 @@ def test_since_argument(self): def mock_cmd(cmd, env): ts_dump = "/var/dump/sonic_dump_random3.tar.gz" cmd_str = " ".join(cmd) - if "--since '4 days ago'" in cmd_str: + if "--since 4 days ago" in cmd_str: patcher.fs.create_file(ts_dump) return 0, AUTO_TS_STDOUT + ts_dump, "" elif "date --date=4 days ago" in cmd_str: @@ -330,7 +335,7 @@ def test_invalid_since_argument(self): def mock_cmd(cmd, env): ts_dump = "/var/dump/sonic_dump_random3.tar.gz" cmd_str = " ".join(cmd) - if "--since '2 days ago'" in cmd_str: + if "--since 2 days ago" in cmd_str: patcher.fs.create_file(ts_dump) print(AUTO_TS_STDOUT + ts_dump) return 0, AUTO_TS_STDOUT + ts_dump, "" @@ -396,3 +401,28 @@ def mock_cmd(cmd, env): assert "orchagent.12345.123.core.gz" in current_fs assert "lldpmgrd.12345.22.core.gz" in current_fs assert "python3.12345.21.core.gz" in current_fs + + def test_max_retry_ts_failure(self): + """ + Scenario: TS subprocess is continously returning EXT_RETRY + Make sure auto-ts is not exceeding the limit + """ + db_wrap = Db() + redis_mock = db_wrap.db + set_auto_ts_cfg(redis_mock, state="enabled") + set_feature_table_cfg(redis_mock, state="enabled") + with Patcher() as patcher: + def mock_cmd(cmd, env): + return EXT_RETRY, "", "" + + cdump_mod.subprocess_exec = mock_cmd + patcher.fs.create_file("/var/core/orchagent.12345.123.core.gz") + cls = cdump_mod.CriticalProcCoreDumpHandle("orchagent.12345.123.core.gz", "swss", redis_mock) + + signal.signal(signal.SIGALRM, signal_handler) + signal.alarm(5) # 5 seconds + try: + cls.handle_core_dump_creation_event() + except Exception: + assert False, "Method should not time out" + diff --git a/utilities_common/auto_techsupport_helper.py b/utilities_common/auto_techsupport_helper.py index 0fba656521..4eaae933b0 100644 --- a/utilities_common/auto_techsupport_helper.py +++ b/utilities_common/auto_techsupport_helper.py @@ -11,8 +11,9 @@ "CORE_DUMP_DIR", "CORE_DUMP_PTRN", "TS_DIR", "TS_PTRN", "CFG_DB", "AUTO_TS", "CFG_STATE", "CFG_MAX_TS", "COOLOFF", "CFG_CORE_USAGE", "CFG_SINCE", "FEATURE", "STATE_DB", - "TS_MAP", "CORE_DUMP", "TIMESTAMP", "CONTAINER", - "TIME_BUF", "SINCE_DEFAULT", "TS_PTRN_GLOB" + "TS_MAP", "CORE_DUMP", "TIMESTAMP", "CONTAINER", "TIME_BUF", + "SINCE_DEFAULT", "TS_PTRN_GLOB", "EXT_LOCKFAIL", "EXT_RETRY", + "EXT_SUCCESS", "MAX_RETRY_LIMIT" ] + [ # Methods "verify_recent_file_creation", "get_ts_dumps", @@ -60,6 +61,11 @@ TIME_BUF = 20 SINCE_DEFAULT = "2 days ago" +# Techsupport Exit Codes +EXT_LOCKFAIL = 2 +EXT_RETRY = 4 +EXT_SUCCESS = 0 +MAX_RETRY_LIMIT = 2 # Helper methods def subprocess_exec(cmd, env=None): From 19012758570ac9b3c747f5090c7958aac23db0f6 Mon Sep 17 00:00:00 2001 From: Vivek Reddy Karri Date: Sat, 26 Mar 2022 02:58:46 +0000 Subject: [PATCH 4/6] Disable signal after the test Signed-off-by: Vivek Reddy Karri --- tests/coredump_gen_handler_test.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/coredump_gen_handler_test.py b/tests/coredump_gen_handler_test.py index b228c4ca55..bf4ae8dc78 100644 --- a/tests/coredump_gen_handler_test.py +++ b/tests/coredump_gen_handler_test.py @@ -425,4 +425,6 @@ def mock_cmd(cmd, env): cls.handle_core_dump_creation_event() except Exception: assert False, "Method should not time out" + finally: + signal.alarm(0) From fdd57fb0e1442a41b08168e7e5aac62d4130cf5d Mon Sep 17 00:00:00 2001 From: Vivek R Date: Tue, 29 Mar 2022 13:59:13 -0700 Subject: [PATCH 5/6] Update coredump_gen_handler.py --- scripts/coredump_gen_handler.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/coredump_gen_handler.py b/scripts/coredump_gen_handler.py index d23f927ec0..b846ee87b4 100644 --- a/scripts/coredump_gen_handler.py +++ b/scripts/coredump_gen_handler.py @@ -119,7 +119,6 @@ def invoke_ts_cmd(self, since_cfg, num_retry=0): return "" elif rc == EXT_RETRY: if num_retry <= MAX_RETRY_LIMIT: - print(num_retry) return self.invoke_ts_cmd(since_cfg, num_retry+1) else: syslog.syslog(syslog.LOG_ERR, "MAX_RETRY_LIMIT for show techsupport invocation exceeded, stderr: {}".format(stderr)) From 230c8df77a43a32c5cb4052b6a08aec8e0481b81 Mon Sep 17 00:00:00 2001 From: Vivek Reddy Karri Date: Fri, 1 Apr 2022 02:18:39 +0000 Subject: [PATCH 6/6] Comments handled Signed-off-by: Vivek Reddy Karri --- scripts/coredump_gen_handler.py | 14 +++++++------- scripts/generate_dump | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/scripts/coredump_gen_handler.py b/scripts/coredump_gen_handler.py index b846ee87b4..03ba2de89e 100644 --- a/scripts/coredump_gen_handler.py +++ b/scripts/coredump_gen_handler.py @@ -114,9 +114,9 @@ def invoke_ts_cmd(self, since_cfg, num_retry=0): cmd_opts = ["show", "techsupport", "--silent", "--since", since_cfg] cmd = " ".join(cmd_opts) rc, stdout, stderr = subprocess_exec(cmd_opts, env=ENV_VAR) + new_dump = "" if rc == EXT_LOCKFAIL: syslog.syslog(syslog.LOG_NOTICE, "Another instance of techsupport running, aborting this. stderr: {}".format(stderr)) - return "" elif rc == EXT_RETRY: if num_retry <= MAX_RETRY_LIMIT: return self.invoke_ts_cmd(since_cfg, num_retry+1) @@ -124,12 +124,12 @@ def invoke_ts_cmd(self, since_cfg, num_retry=0): syslog.syslog(syslog.LOG_ERR, "MAX_RETRY_LIMIT for show techsupport invocation exceeded, stderr: {}".format(stderr)) elif rc != EXT_SUCCESS: syslog.syslog(syslog.LOG_ERR, "show techsupport failed with exit code {}, stderr: {}".format(rc, stderr)) - # Parse the dump name - new_dump = self.parse_ts_dump_name(stdout) - if not new_dump: - syslog.syslog(syslog.LOG_ERR, "{} was run, but no techsupport dump is found".format(cmd)) - else: - syslog.syslog(syslog.LOG_INFO, "{} is successful, {} is created".format(cmd, new_dump)) + else: # EXT_SUCCESS + new_dump = self.parse_ts_dump_name(stdout) # Parse the dump name + if not new_dump: + syslog.syslog(syslog.LOG_ERR, "{} was run, but no techsupport dump is found".format(cmd)) + else: + syslog.syslog(syslog.LOG_INFO, "{} is successful, {} is created".format(cmd, new_dump)) return new_dump def verify_rate_limit_intervals(self, global_cooloff, container_cooloff): diff --git a/scripts/generate_dump b/scripts/generate_dump index 0b69e9b051..85a7f6a057 100755 --- a/scripts/generate_dump +++ b/scripts/generate_dump @@ -63,7 +63,7 @@ handle_exit() ECODE=$? echo "Removing lock. Exit: $ECODE" >&2 $RM $V -rf ${LOCKDIR} - # Echo the filename as the last statement if the generation suceeds + # Echo the filename as the last statement if the generation succeeds if [[ -f $TARFILE && ($ECODE == $EXT_SUCCESS || $ECODE == $RETURN_CODE) ]]; then echo $TARFILE fi