From c007d657713719aa97db2fc4b1b2ab946e0a09d4 Mon Sep 17 00:00:00 2001
From: Vaibhav Hemant Dixit
Date: Mon, 13 Sep 2021 12:44:55 -0700
Subject: [PATCH] [warm-reboot] Add new preboot health check: verify database integrity (#1785)

What I did
Verify database integrity before proceeding with warm reboot or fast reboot.
This integrity check uses a JSON schema to validate DBs. To start with, only counters_db's table COUNTERS_PORT_NAME_MAP presence is verified. But, this list can advance in future.
The test logic is designed to be generic; any more databases or tables within them can be just added to schema list, and the verification logic needs no change.

How I did it
Added a JSON schema, and generic schema validation logic.
---
 scripts/check_db_integrity.py | 114 +++++++++++++++++++++++++++++++++++
 scripts/fast-reboot           |  31 +++++++--
 2 files changed, 142 insertions(+), 3 deletions(-)
 create mode 100755 scripts/check_db_integrity.py

diff --git a/scripts/check_db_integrity.py b/scripts/check_db_integrity.py
new file mode 100755
index 000000000000..3a994897b464
--- /dev/null
+++ b/scripts/check_db_integrity.py
@@ -0,0 +1,114 @@
+#!/usr/bin/env python3
+
+"""
+This is to verify if Database has critical tables present before warmboot can proceed.
+If warmboot is allowed with missing critical tables, it can lead to issues in going
+down path or during the recovery path. This test detects such issues before proceeding.
+The verification procedure here uses JSON schemas to verify the DB entities.
+
+In future, to verify new tables or their content, just the schema modification is needed.
+No modification may be needed to the integrity check logic.
+"""
+
+import json
+import os
+import subprocess
+import sys
+import syslog
+import traceback
+
+import jsonschema
+
+# Maps each database name (as passed to 'sonic-db-dump -n') to the JSON schema
+# that the dump of that database must satisfy.
+DB_SCHEMA = {
+    "COUNTERS_DB":
+    {
+        "$schema": "http://json-schema.org/draft-06/schema",
+        "type": "object",
+        "title": "Schema for COUNTERS DB's entities",
+        "required": ["COUNTERS_PORT_NAME_MAP"],
+        "properties": {
+            "COUNTERS_PORT_NAME_MAP": {"$id": "#/properties/COUNTERS_PORT_NAME_MAP", "type": "object"}
+        }
+    }
+}
+
+
+def dump_db(db_name, db_dump_file):
+    """
+    Dump database 'db_name' as JSON into 'db_dump_file' using sonic-db-dump.
+
+    Returns a tuple (return code, stderr output) of the sonic-db-dump process.
+    Raises OSError if the dump file cannot be opened or the tool cannot be run.
+    """
+    # Use an argument list and redirect stdout ourselves: no shell is needed,
+    # and the database that is dumped is the one named in DB_SCHEMA.
+    with open(db_dump_file, "w") as fp:
+        proc = subprocess.run(["sonic-db-dump", "-n", db_name, "-y"], text=True,
+                              stdout=fp, stderr=subprocess.PIPE)
+    return proc.returncode, proc.stderr
+
+
+def main():
+    """
+    Validate the dump of every database listed in DB_SCHEMA against its schema.
+
+    Returns 0 when all databases pass (or DB_SCHEMA is empty), and 1 as soon as
+    a database cannot be dumped, its dump is not valid JSON, or validation fails.
+    """
+    if not DB_SCHEMA:
+        return 0
+
+    for db_name, schema in DB_SCHEMA.items():
+        db_dump_file = "/tmp/{}.json".format(db_name)
+        try:
+            rc, dump_err = dump_db(db_name, db_dump_file)
+        except OSError as os_err:
+            syslog.syslog(syslog.LOG_ERR, "Failed to dump db {}: {}".format(db_name, str(os_err)))
+            return 1
+        if rc != 0:
+            # A failed dump cannot be trusted, so fail the check right away.
+            msg = "Failed to dump db {}. Return code: {} with err: {}".format(db_name, rc, dump_err)
+            # Flush explicitly: the script exits through os._exit(), which skips stdio flushing.
+            print(msg, flush=True)
+            syslog.syslog(syslog.LOG_ERR, msg)
+            return 1
+
+        try:
+            with open(db_dump_file) as fp:
+                db_dump_data = json.load(fp)
+        except ValueError as err:
+            syslog.syslog(syslog.LOG_ERR, "DB json file is not a valid json file. "
+                          "Error: {}".format(str(err)))
+            return 1
+
+        # What: Validate if critical tables and entries are present in DB.
+        # Why: This is needed to avoid warmbooting with a bad DB; which can
+        #      potentially trigger failures in the reboot recovery path.
+        # How: Validate DB against a schema which defines required tables.
+        try:
+            jsonschema.validate(instance=db_dump_data, schema=schema)
+        except jsonschema.exceptions.ValidationError as err:
+            syslog.syslog(syslog.LOG_ERR, "Database is missing tables/entries needed for reboot procedure. "
+                          "DB integrity check failed with:\n{}".format(str(err.message)))
+            return 1
+    syslog.syslog(syslog.LOG_DEBUG, "Database integrity checks passed.")
+    return 0
+
+
+if __name__ == '__main__':
+    res = 0
+    try:
+        res = main()
+    except KeyboardInterrupt:
+        syslog.syslog(syslog.LOG_NOTICE, "SIGINT received. Quitting")
+        res = 1
+    except Exception as e:
+        syslog.syslog(syslog.LOG_ERR, "Got an exception %s: Traceback: %s" % (str(e), traceback.format_exc()))
+        res = 2
+    finally:
+        syslog.closelog()
+        try:
+            sys.exit(res)
+        except SystemExit:
+            os._exit(res)
diff --git a/scripts/fast-reboot b/scripts/fast-reboot
index aa28954b4698..557a6d0f682e 100755
--- a/scripts/fast-reboot
+++ b/scripts/fast-reboot
@@ -12,6 +12,7 @@ SHUTDOWN_ORDER_FILE="/etc/sonic/${REBOOT_TYPE}_order"
 VERBOSE=no
 FORCE=no
 IGNORE_ASIC=no
+IGNORE_DB_CHECK=no
 STRICT=no
 REBOOT_METHOD="/sbin/kexec -e"
 ASSISTANT_IP_LIST=""
@@ -38,6 +39,7 @@ EXIT_SYNCD_SHUTDOWN=11
 EXIT_FAST_REBOOT_DUMP_FAILURE=12
 EXIT_FILTER_FDB_ENTRIES_FAILURE=13
 EXIT_COUNTERPOLL_DELAY_FAILURE=14
+EXIT_DB_INTEGRITY_FAILURE=15
 EXIT_NO_CONTROL_PLANE_ASSISTANT=20
 EXIT_SONIC_INSTALLER_VERIFY_REBOOT=21
 
@@ -59,8 +61,9 @@ function showHelpAndExit()
     echo "Usage: ${REBOOT_SCRIPT_NAME} [options]"
     echo "    -h,-? : get this help"
     echo "    -v    : turn on verbose"
-    echo "    -f    : force execution"
-    echo "    -i    : ignore MD5-checksum-verification of ASIC configuration files"
+    echo "    -f    : force execution - ignore Orchagent RESTARTCHECK failure"
+    echo "    -i    : force execution - ignore ASIC MD5-checksum-verification"
+    echo "    -d    : force execution - ignore database integrity check"
     echo "    -r    : reboot with /sbin/reboot"
     echo "    -k    : reboot with /sbin/kexec -e [default]"
     echo "    -x    : execute script with -x flag"
@@ -74,7 +77,7 @@ function showHelpAndExit()
 
 function parseOptions()
 {
-    while getopts "vfih?rkxc:s" opt; do
+    while getopts "vfidh?rkxc:s" opt; do
         case ${opt} in
             h|\? )
                 showHelpAndExit
@@ -88,6 +91,9 @@ function parseOptions()
             i )
                 IGNORE_ASIC=yes
                 ;;
+            d )
+                IGNORE_DB_CHECK=yes
+                ;;
             r )
                 REBOOT_METHOD="/sbin/reboot"
                 ;;
@@ -327,6 +333,23 @@ function check_docker_exec()
     done
 }
 
+function check_db_integrity()
+{
+    if [[ "$REBOOT_TYPE" = "warm-reboot" || "$REBOOT_TYPE" = "fastfast-reboot" ]]; then
+        CHECK_DB_INTEGRITY=0
+        /usr/local/bin/check_db_integrity.py || CHECK_DB_INTEGRITY=$?
+        if [[ CHECK_DB_INTEGRITY -ne 0 ]]; then
+            if [[ x"${IGNORE_DB_CHECK}" == x"yes" ]]; then
+                debug "Ignoring Database integrity checks..."
+            else
+                error "Failed to validate DB's integrity. Exit code: ${CHECK_DB_INTEGRITY}. \
+                Use '-d' option to force ignore this check."
+                exit ${EXIT_DB_INTEGRITY_FAILURE}
+            fi
+        fi
+    fi
+}
+
 function reboot_pre_check()
 {
     check_docker_exec
@@ -337,6 +360,8 @@ function reboot_pre_check()
     fi
     rm ${filename}
 
+    check_db_integrity
+
     # Make sure /host has enough space for warm reboot temp files
     avail=$(df -k /host | tail -1 | awk '{ print $4 }')
     if [[ ${avail} -lt ${MIN_HD_SPACE_NEEDED} ]]; then