From 4d4b1d4c4912d22df169f06966ab0accef9c8414 Mon Sep 17 00:00:00 2001 From: "Marty Y. Lok" <76118573+mlok-nokia@users.noreply.github.com> Date: Sat, 11 May 2024 22:01:27 -0400 Subject: [PATCH] [Nokia][chassis] modify Nokia-IXR7250E-36x400G platform specified reboot to allow SUP to log expected/unexpected midplane/module connectivity msg (#18805) Why I did it When a linecard reboots, the Supervisor (SUP) needs to log whether the resulting loss of connectivity was expected or unexpected. After the new mechanism for this was introduced by earlier PRs, the Nokia-IXR7250E-36x400G linecard additionally needs a missing-heartbeat reboot to be treated as an unexpected reboot by the SUP. Issue #18540 Work item tracking Microsoft ADO (number only): How I did it On the Nokia-IXR7250E-36x400G platform, a missing-heartbeat reboot also calls "sudo reboot", which creates a CHASSIS_MODULE_REBOOT_INFO_TABLE entry on the SUP marking the reboot as expected. Since a missing-heartbeat reboot is an unexpected reboot, platform_reboot is modified to check whether the reboot was caused by a missing heartbeat and, if so, to remove the CHASSIS_MODULE_REBOOT_INFO_TABLE entry on the SUP, so that the SUP logs the reboot as unexpected. How to verify it Simulate a missing-heartbeat reboot on the linecard, then verify that the following log messages appear on the SUP: Apr 25 19:50:19.286081 ixre-cpm-chassis7 WARNING pmon#chassisd: Module LINE-CARD0 went off-line! Apr 25 19:50:22.549416 ixre-cpm-chassis7 WARNING pmon#chassisd: Unexpected: Module LINE-CARD0 lost midplane connectivity. 
Signed-off-by: mlok --- .../platform_reboot | 23 +++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/device/nokia/x86_64-nokia_ixr7250e_36x400g-r0/platform_reboot b/device/nokia/x86_64-nokia_ixr7250e_36x400g-r0/platform_reboot index eb0bebef0e54..454a14c5ab74 100755 --- a/device/nokia/x86_64-nokia_ixr7250e_36x400g-r0/platform_reboot +++ b/device/nokia/x86_64-nokia_ixr7250e_36x400g-r0/platform_reboot @@ -1,12 +1,27 @@ #!/bin/bash +DEVICE_MGR_REBOOT_FILE=/tmp/device_mgr_reboot +REBOOT_CAUSE_FILE=/host/reboot-cause/reboot-cause.txt +DEVICE_REBOOT_CAUSE_FILE=/etc/opt/srlinux/reboot-cause.txt +kHeartbeatLostRebootCause="Heartbeat with the Supervisor card lost" +DEVICE_DETAILS_FILE="/etc/opt/srlinux/devices/hw_details.json" + +ungraceful_reboot_handle() +{ + str=$(grep "$kHeartbeatLostRebootCause" $DEVICE_REBOOT_CAUSE_FILE 2> /dev/null) + status=$? + if [ $status -eq 0 ]; then + slot_num=$(jq -r '.slot_num' $DEVICE_DETAILS_FILE 2>/dev/null) + slot_num=$((slot_num - 1)) + sonic-db-cli CHASSIS_STATE_DB del "CHASSIS_MODULE_REBOOT_INFO_TABLE|LINE-CARD${slot_num}" + fi +} update_reboot_cause() { - DEVICE_MGR_REBOOT_FILE=/tmp/device_mgr_reboot - REBOOT_CAUSE_FILE=/host/reboot-cause/reboot-cause.txt - DEVICE_REBOOT_CAUSE_FILE=/etc/opt/srlinux/reboot-cause.txt if [ -e $DEVICE_MGR_REBOOT_FILE ]; then if [ -e $DEVICE_REBOOT_CAUSE_FILE ]; then + # remove the REBOOT_INFO_TABLE entry for unexpected reboot + ungraceful_reboot_handle cp -f $DEVICE_REBOOT_CAUSE_FILE $REBOOT_CAUSE_FILE fi rm -f $DEVICE_MGR_REBOOT_FILE @@ -18,7 +33,7 @@ update_reboot_cause() } echo "Disable all SFPs" -python3 -c 'import sonic_platform.platform; platform_chassis = sonic_platform.platform.Platform().get_chassis(); platform_chassis.tx_disable_all_sfps()' +python3 -c 'import sonic_platform.platform; platform_chassis = sonic_platform.platform.Platform().get_chassis(); platform_chassis.tx_disable_all_sfps()' & sleep 3 # update the reboot_cuase file when reboot is trigger by 
device-mgr