From 9f9d82dbd465978a389948f1f37d73f82211f07c Mon Sep 17 00:00:00 2001 From: Sudharsan Dhamal Gopalarathnam Date: Tue, 8 Oct 2024 08:02:08 -0700 Subject: [PATCH] [Mellanox] Fix retry logic on discovery of MST device (#20389) - Why I did it Fixing retry logic when MST device is discovered. The current implementation only fetches the name of the device but doesn't verify that the device is accessible, which can be confirmed by querying the device and ensuring the command passes. - How I did it Added a query command with the device as a parameter and ensured it passes. - How to verify it Running upgrade tests. --- files/scripts/syncd.sh | 19 +---------------- platform/mellanox/mlnx-fw-upgrade.j2 | 31 ++++++++++++++++++++++------ 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/files/scripts/syncd.sh b/files/scripts/syncd.sh index 0930de72b87c..0c794ef5d9dc 100755 --- a/files/scripts/syncd.sh +++ b/files/scripts/syncd.sh @@ -2,18 +2,6 @@ . /usr/local/bin/syncd_common.sh -declare -r UNKN_MST="unknown" - -function GetMstDevice() { - local _MST_DEVICE="$(ls /dev/mst/*_pci_cr0 2>&1)" - - if [[ ! -c "${_MST_DEVICE}" ]]; then - echo "${UNKN_MST}" - else - echo "${_MST_DEVICE}" - fi -} - function startplatform() { # platform specific tasks @@ -36,12 +24,7 @@ function startplatform() { debug "Starting Firmware update procedure" /usr/bin/mst start --with_i2cdev - local -r _MST_DEVICE="$(GetMstDevice)" - if [[ "${_MST_DEVICE}" != "${UNKN_MST}" ]]; then - /usr/bin/flint -d $_MST_DEVICE --clear_semaphore - fi - - /usr/bin/mlnx-fw-upgrade.sh -v + /usr/bin/mlnx-fw-upgrade.sh -c -v if [[ "$?" -ne "${EXIT_SUCCESS}" ]]; then debug "Failed to upgrade fw. " "$?" 
"Restart syncd" exit 1 diff --git a/platform/mellanox/mlnx-fw-upgrade.j2 b/platform/mellanox/mlnx-fw-upgrade.j2 index 6bfea7762070..00796d54b096 100755 --- a/platform/mellanox/mlnx-fw-upgrade.j2 +++ b/platform/mellanox/mlnx-fw-upgrade.j2 @@ -77,6 +77,7 @@ function PrintHelp() { echo " -s, --syslog Use syslog logger (enabled when -u|--upgrade)" echo " -v, --verbose Verbose mode (enabled when -u|--upgrade)" echo " -d, --dry-run Compare the FW versions without installation. Return code "0" means the FW is up-to-date, return code "10" means an upgrade is required, otherwise an error is detected." + echo " -c, --clear-semaphore Clear hw resources before updating firmware" echo " -h, --help Print help" echo echo "Examples:" @@ -103,6 +104,9 @@ function ParseArguments() { -d|--dry-run) DRY_RUN="${YES_PARAM}" ;; + -c|--clear-semaphore) + CLEAR_SEMAPHORE="${YES_PARAM}" + ;; -h|--help) PrintHelp exit "${EXIT_SUCCESS}" @@ -210,16 +214,20 @@ function WaitForDevice() { local -i QUERY_RETRY_COUNT_MAX="10" local -i QUERY_RETRY_COUNT="0" local -r DEVICE_TYPE=$(GetMstDeviceType) + local SPC_MST_DEV + local QUERY_RC="" - local SPC_MST_DEV=$(GetSPCMstDevice) - - while [[ ("${QUERY_RETRY_COUNT}" -lt "${QUERY_RETRY_COUNT_MAX}") && ("${SPC_MST_DEV}" == "${UNKN_MST}") ]]; do + while : ; do + SPC_MST_DEV=$(GetSPCMstDevice) + ${QUERY_XML} -d ${SPC_MST_DEV} -o ${QUERY_FILE} + QUERY_RC="$?" + [[ ("${QUERY_RETRY_COUNT}" -lt "${QUERY_RETRY_COUNT_MAX}") && ("${QUERY_RC}" != "${EXIT_SUCCESS}") ]] || break sleep 1s ((QUERY_RETRY_COUNT++)) - SPC_MST_DEV=$(GetSPCMstDevice) + LogInfo "Retrying MST device query ${QUERY_RETRY_COUNT}" done - if [[ "${SPC_MST_DEV}" == "${UNKN_MST}" ]]; then + if [[ "${QUERY_RC}" != "${EXIT_SUCCESS}" ]]; then # Couldn't Detect the Spectrum ASIC. Exit failure and print the detailed information output=$(${QUERY_CMD}) failure_msg="${output#*Fail : }" @@ -265,7 +273,7 @@ function GetSPCMstDevice() { if [[ ! 
-c "${_MST_DEVICE}" ]]; then echo "${UNKN_MST}" - else + else echo "${_MST_DEVICE}" fi @@ -482,6 +490,15 @@ function Cleanup() { fi } +function ClearSemaphore() { + if [[ "${CLEAR_SEMAPHORE}" == "${YES_PARAM}" ]]; then + local -r _MST_DEVICE="$(GetSPCMstDevice)" + if [[ "${_MST_DEVICE}" != "${UNKN_MST}" ]]; then + /usr/bin/flint -d $_MST_DEVICE --clear_semaphore + fi + fi +} + trap Cleanup EXIT ParseArguments "$@" @@ -492,6 +509,8 @@ LockStateChange WaitForDevice +ClearSemaphore + if [ "${IMAGE_UPGRADE}" != "${YES_PARAM}" ]; then UpgradeFW else