Skip to content

Commit

Permalink
[Mellanox] Fix retry logic on discovery of MST device (#20389)
Browse files Browse the repository at this point in the history
- Why I did it
Fix the retry logic used when the MST device is discovered. The current implementation only fetches the name of the device; it does not verify that the device is accessible, which can be confirmed by querying the device and ensuring that the command succeeds.

- How I did it
Added a query command that takes the device as a parameter, and ensured that it succeeds.

- How to verify it
Running upgrade tests.
  • Loading branch information
dgsudharsan authored Oct 8, 2024
1 parent e8e358d commit 9f9d82d
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 24 deletions.
19 changes: 1 addition & 18 deletions files/scripts/syncd.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,6 @@

. /usr/local/bin/syncd_common.sh

declare -r UNKN_MST="unknown"

# Print the MST device node matching /dev/mst/*_pci_cr0.
#
# Globals: UNKN_MST (read)
# Outputs: the device path on stdout when the listing resolves to a single
#          character device; otherwise the ${UNKN_MST} sentinel.
function GetMstDevice() {
    # stderr is captured together with stdout, so a failed listing produces
    # text that cannot pass the character-device test below.
    local mst_candidate="$(ls /dev/mst/*_pci_cr0 2>&1)"

    if [[ -c "${mst_candidate}" ]]; then
        echo "${mst_candidate}"
        return
    fi

    echo "${UNKN_MST}"
}

function startplatform() {

# platform specific tasks
Expand All @@ -36,12 +24,7 @@ function startplatform() {
debug "Starting Firmware update procedure"
/usr/bin/mst start --with_i2cdev

local -r _MST_DEVICE="$(GetMstDevice)"
if [[ "${_MST_DEVICE}" != "${UNKN_MST}" ]]; then
/usr/bin/flint -d $_MST_DEVICE --clear_semaphore
fi

/usr/bin/mlnx-fw-upgrade.sh -v
/usr/bin/mlnx-fw-upgrade.sh -c -v
if [[ "$?" -ne "${EXIT_SUCCESS}" ]]; then
debug "Failed to upgrade fw. " "$?" "Restart syncd"
exit 1
Expand Down
31 changes: 25 additions & 6 deletions platform/mellanox/mlnx-fw-upgrade.j2
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ function PrintHelp() {
echo " -s, --syslog Use syslog logger (enabled when -u|--upgrade)"
echo " -v, --verbose Verbose mode (enabled when -u|--upgrade)"
echo " -d, --dry-run Compare the FW versions without installation. Return code "0" means the FW is up-to-date, return code "10" means an upgrade is required, otherwise an error is detected."
echo " -c, --clear-semaphore Clear hw resources before updating firmware"
echo " -h, --help Print help"
echo
echo "Examples:"
Expand All @@ -103,6 +104,9 @@ function ParseArguments() {
-d|--dry-run)
DRY_RUN="${YES_PARAM}"
;;
-c|--clear-semaphore)
CLEAR_SEMAPHORE="${YES_PARAM}"
;;
-h|--help)
PrintHelp
exit "${EXIT_SUCCESS}"
Expand Down Expand Up @@ -210,16 +214,20 @@ function WaitForDevice() {
local -i QUERY_RETRY_COUNT_MAX="10"
local -i QUERY_RETRY_COUNT="0"
local -r DEVICE_TYPE=$(GetMstDeviceType)
local SPC_MST_DEV
local QUERY_RC=""

local SPC_MST_DEV=$(GetSPCMstDevice)

while [[ ("${QUERY_RETRY_COUNT}" -lt "${QUERY_RETRY_COUNT_MAX}") && ("${SPC_MST_DEV}" == "${UNKN_MST}") ]]; do
while : ; do
SPC_MST_DEV=$(GetSPCMstDevice)
${QUERY_XML} -d ${SPC_MST_DEV} -o ${QUERY_FILE}
QUERY_RC="$?"
[[ ("${QUERY_RETRY_COUNT}" -lt "${QUERY_RETRY_COUNT_MAX}") && ("${QUERY_RC}" != "${EXIT_SUCCESS}") ]] || break
sleep 1s
((QUERY_RETRY_COUNT++))
SPC_MST_DEV=$(GetSPCMstDevice)
LogInfo "Retrying MST device query ${QUERY_RETRY_COUNT}"
done

if [[ "${SPC_MST_DEV}" == "${UNKN_MST}" ]]; then
if [[ "${QUERY_RC}" != "${EXIT_SUCCESS}" ]]; then
# Couldn't Detect the Spectrum ASIC. Exit failure and print the detailed information
output=$(${QUERY_CMD})
failure_msg="${output#*Fail : }"
Expand Down Expand Up @@ -265,7 +273,7 @@ function GetSPCMstDevice() {

if [[ ! -c "${_MST_DEVICE}" ]]; then
echo "${UNKN_MST}"
else
else
echo "${_MST_DEVICE}"
fi

Expand Down Expand Up @@ -482,6 +490,15 @@ function Cleanup() {
fi
}

# Run "flint --clear_semaphore" on the device reported by GetSPCMstDevice,
# but only when the -c|--clear-semaphore option set CLEAR_SEMAPHORE.
#
# Globals: CLEAR_SEMAPHORE, YES_PARAM, UNKN_MST (read)
# Returns: 0 when clearing was not requested or no device was detected;
#          otherwise the exit status of flint.
function ClearSemaphore() {
    # Nothing to do unless the caller asked for the semaphore to be cleared.
    [[ "${CLEAR_SEMAPHORE}" == "${YES_PARAM}" ]] || return 0

    local -r _MST_DEVICE="$(GetSPCMstDevice)"

    # GetSPCMstDevice reports the UNKN_MST sentinel when no device was found.
    [[ "${_MST_DEVICE}" != "${UNKN_MST}" ]] || return 0

    # Quoted so the path always reaches flint as exactly one argument
    # (no word-splitting or glob expansion).
    /usr/bin/flint -d "${_MST_DEVICE}" --clear_semaphore
}

trap Cleanup EXIT

ParseArguments "$@"
Expand All @@ -492,6 +509,8 @@ LockStateChange

WaitForDevice

ClearSemaphore

if [ "${IMAGE_UPGRADE}" != "${YES_PARAM}" ]; then
UpgradeFW
else
Expand Down

0 comments on commit 9f9d82d

Please sign in to comment.