Skip to content

Commit

Permalink
[platform/cel-haliburton]: add watchdog service (sonic-net#6259)
Browse files Browse the repository at this point in the history
Haliburton needs a watchdog daemon to monitor the basic health of the machine. If something goes wrong, such as a crashing program overloading the CPU or the system running out of free memory, the watchdog can safely reboot the machine.
  • Loading branch information
Wirut Getbamrung authored Dec 26, 2020
1 parent d609b40 commit a416f49
Show file tree
Hide file tree
Showing 4 changed files with 336 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
haliburton/cfg/haliburton-modules.conf etc/modules-load.d
haliburton/systemd/platform-modules-haliburton.service lib/systemd/system
haliburton/systemd/cpu_wdt.service lib/systemd/system
haliburton/scripts/cpu_wdt /usr/local/bin/
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
depmod -a
systemctl enable platform-modules-haliburton.service
systemctl start platform-modules-haliburton.service

systemctl enable cpu_wdt.service
systemctl start cpu_wdt.service
Original file line number Diff line number Diff line change
@@ -0,0 +1,321 @@
#!/usr/bin/env python

import os
import sys
import time
import signal
import syslog
import argparse
import subprocess

from sonic_platform_base.watchdog_base import WatchdogBase

# Identifier passed to syslog.openlog() by the log_* wrappers below.
SYSLOG_IDENTIFIER = 'cpu_wdt'
# The keep-alive loop in main() runs while this is True; signal_handler()
# clears it on SIGINT/SIGTERM.
CPUWDT_MAIN_TASK_RUNNING_FLAG = True

# Directory holding the CPLD register access files. Writing "<reg>" to getreg
# and reading it back returns the register value; writing "<reg> <value>" to
# setreg stores a value.
PLATFORM_CPLD_PATH = '/sys/devices/platform/e1031.smc/'
SETREG_FILE = 'setreg'
GETREG_FILE = 'getreg'
# Value returned by Watchdog.arm() on failure.
WDT_COMMON_ERROR = -1
# Register read by Watchdog._get_mmc_version().
MMC_VERSION_REG = "0x100"

# watchdog information for cpld v06
# Versions <= V06_MMC_VERSION use the register layout in this section.
# NOTE(review): the name says "v06" but the threshold value is 0x05 - confirm
# against the CPLD release notes.
V06_MMC_VERSION = 0x05
# Register that receives the timeout selector value.
V06_WDT_WIDTH = '0x110'
# Maps a timeout in seconds to the selector value written to V06_WDT_WIDTH.
V06_WDT_WIDTH_SELECTOR = {
30: '0x1',
60: '0x2',
180: '0x3'
}

# Enable register and commands for the v06 layout. Note that here '0x0'
# enables and '0x1' disables, the opposite of CPLD_WDT_INFO below.
V06_CPLD_WDT_INFO = {
'wdt_en_reg': '0x111',
'wdt_en_cmd': '0x0',
'wdt_dis_cmd': '0x1'
}

# watchdog information
# Registers that receive the low / middle / high byte of the timeout, which
# Watchdog._seconds_to_lmh_hex() computes in milliseconds.
WDT_TIMER_L_BIT_REG = '0x117'
WDT_TIMER_M_BIT_REG = '0x118'
WDT_TIMER_H_BIT_REG = '0x119'
# Register written by Watchdog._keepalive() on the newer layout.
# (The "ALVIVE" spelling is part of the identifier used elsewhere in the file.)
WDT_KEEP_ALVIVE_REG = '0x11a'

# Enable register and commands for CPLD versions above V06_MMC_VERSION.
CPLD_WDT_INFO = {
'wdt_en_reg': '0x116',
'wdt_en_cmd': '0x1',
'wdt_dis_cmd': '0x0'
}


# default input
# Used by main() when --timeout / --keep_alive are not given (seconds).
DEFAULT_TIMEOUT = 180
DEFAULT_KEEPALIVE = 60


# ========================== Syslog wrappers ==========================


def log_info(msg, also_print_to_console=False):
    """Send <msg> to syslog at LOG_INFO level under the daemon's identifier.

    When <also_print_to_console> is true the message is echoed to stdout too.
    """
    syslog.openlog(SYSLOG_IDENTIFIER)
    syslog.syslog(syslog.LOG_INFO, msg)
    syslog.closelog()

    if not also_print_to_console:
        return
    print(msg)


def log_warning(msg, also_print_to_console=False):
    """Send <msg> to syslog at LOG_WARNING level under the daemon's identifier.

    When <also_print_to_console> is true the message is echoed to stdout too.
    """
    syslog.openlog(SYSLOG_IDENTIFIER)
    syslog.syslog(syslog.LOG_WARNING, msg)
    syslog.closelog()

    if not also_print_to_console:
        return
    print(msg)


def log_error(msg, also_print_to_console=False):
    """Send <msg> to syslog at LOG_ERR level under the daemon's identifier.

    When <also_print_to_console> is true the message is echoed to stdout too.
    """
    syslog.openlog(SYSLOG_IDENTIFIER)
    syslog.syslog(syslog.LOG_ERR, msg)
    syslog.closelog()

    if not also_print_to_console:
        return
    print(msg)


class Watchdog(WatchdogBase):
    """CPU watchdog driven through the CPLD ``setreg``/``getreg`` files.

    A register is read by writing its address to ``getreg`` and reading the
    file back, and written by writing "<reg> <value>" to ``setreg``.

    Two register layouts are handled, selected by the MMC version read when
    the object is created:

    * version <= V06_MMC_VERSION: the timeout is one of the fixed values in
      V06_WDT_WIDTH_SELECTOR and a keep-alive is a disable/enable cycle;
    * newer versions: the timeout is programmed in milliseconds across three
      byte-wide registers and a dedicated keep-alive register is written.
    """

    # Largest timeout, in milliseconds, that fits in the three byte-wide
    # timer registers (low / middle / high).
    _MAX_TIMER_MS = 0xFFFFFF

    def __init__(self):
        # Init cpld reg path
        self.setreg_path = os.path.join(PLATFORM_CPLD_PATH, SETREG_FILE)
        self.getreg_path = os.path.join(PLATFORM_CPLD_PATH, GETREG_FILE)

        self.mmc_v = self._get_mmc_version()
        self.cpld_info = V06_CPLD_WDT_INFO if self._is_v06() else CPLD_WDT_INFO

        # Set default value: start from a disabled timer so that the cached
        # state below matches the hardware.
        self._disable()
        self.armed = False
        self.timeout = 0
        self.arm_timestamp = 0

    def _is_v06(self):
        """Return True when the CPLD uses the V06 (fixed-width) layout."""
        return self.mmc_v <= V06_MMC_VERSION

    def _get_mmc_version(self):
        """Read MMC_VERSION_REG and return its value as an integer."""
        hex_str_v = self._get_register_value(MMC_VERSION_REG)
        return int(hex_str_v, 16)

    def _get_register_value(self, register):
        """Return the string the CPLD reports for <register>."""
        # Select the register first, then read the same file back.
        self._write_reg(self.getreg_path, register)
        return self._read_reg(self.getreg_path)

    def _write_reg(self, file_path, value):
        """Write str(<value>) to <file_path>."""
        with open(file_path, 'w') as fd:
            fd.write(str(value))

    def _read_reg(self, path):
        """Return the first line of <path> with newlines stripped."""
        with open(path, 'r') as fd:
            output = fd.readline()
        return output.strip('\n')

    def _get_level_hex(self, sub_hex):
        """Normalise a slice of a hex() string (e.g. 'x3') to '0x3' form.

        No longer used by _seconds_to_lmh_hex; kept so that any existing
        caller keeps working.
        """
        sub_hex_str = sub_hex.replace("x", "0")
        return hex(int(sub_hex_str, 16))

    def _seconds_to_lmh_hex(self, seconds):
        """Split <seconds>, converted to milliseconds, into three hex bytes.

        Returns:
            A (low, middle, high) tuple of hex strings such as '0x20'.

        Raises:
            ValueError: if the value is negative or needs more than 24 bits.
        """
        ms = int(seconds * 1000)  # calculate timeout in ms format
        if ms < 0 or ms > self._MAX_TIMER_MS:
            raise ValueError(
                "timeout of {} seconds does not fit the timer registers".format(seconds))
        # Bit arithmetic rather than slicing hex(ms): slicing fails for 0 and
        # silently drops the upper digits of values wider than 24 bits.
        low = hex(ms & 0xFF)
        mid = hex((ms >> 8) & 0xFF)
        high = hex((ms >> 16) & 0xFF)
        return (low, mid, high)

    def _enable(self):
        """
        Turn on the watchdog timer
        """
        enable_val = '{} {}'.format(
            self.cpld_info['wdt_en_reg'], self.cpld_info['wdt_en_cmd'])
        return self._write_reg(self.setreg_path, enable_val)

    def _disable(self):
        """
        Turn off the watchdog timer
        """
        disable_val = '{} {}'.format(
            self.cpld_info['wdt_en_reg'], self.cpld_info['wdt_dis_cmd'])
        return self._write_reg(self.setreg_path, disable_val)

    def _keepalive(self):
        """
        Keep alive watchdog timer
        """
        if self._is_v06():
            # The V06 layout has no keep-alive register; restart the timer by
            # cycling the enable bit.
            self._disable()
            self._enable()
        else:
            keepalive_val = '{} {}'.format(
                WDT_KEEP_ALVIVE_REG, self.cpld_info['wdt_en_cmd'])
            self._write_reg(self.setreg_path, keepalive_val)

    def _settimeout(self, seconds):
        """
        Set watchdog timer timeout
        @param seconds - timeout in seconds
        @return is the actual set timeout
        @raise ValueError if the hardware cannot represent the timeout
        """
        if self._is_v06():
            # Only a few fixed widths exist: pick the smallest one that is not
            # shorter than the request instead of writing "None" to the CPLD.
            candidates = [t for t in sorted(V06_WDT_WIDTH_SELECTOR) if t >= seconds]
            if not candidates:
                raise ValueError(
                    "timeout of {} seconds exceeds the maximum of {} seconds".format(
                        seconds, max(V06_WDT_WIDTH_SELECTOR)))
            actual = candidates[0]
            set_timeout_val = '{} {}'.format(
                V06_WDT_WIDTH, V06_WDT_WIDTH_SELECTOR[actual])
            self._write_reg(self.setreg_path, set_timeout_val)
            return actual

        (low, mid, high) = self._seconds_to_lmh_hex(seconds)
        set_h_val = '{} {}'.format(WDT_TIMER_H_BIT_REG, high)
        set_m_val = '{} {}'.format(WDT_TIMER_M_BIT_REG, mid)
        set_l_val = '{} {}'.format(WDT_TIMER_L_BIT_REG, low)
        self._write_reg(self.setreg_path, set_h_val)  # set high bit
        self._write_reg(self.setreg_path, set_m_val)  # set med bit
        self._write_reg(self.setreg_path, set_l_val)  # set low bit
        return seconds

    #################################################################

    def arm(self, seconds):
        """
        Arm the hardware watchdog with a timeout of <seconds> seconds.
        If the watchdog is currently armed, calling this function will
        simply reset the timer to the provided value. If the underlying
        hardware does not support the value provided in <seconds>, this
        method should arm the watchdog with the *next greater* available
        value.
        Returns:
            An integer specifying the *actual* number of seconds the watchdog
            was armed with. On failure returns -1.
        """
        ret = WDT_COMMON_ERROR

        if seconds <= 0:
            log_error("Error: invalid wdt timeout : {}".format(seconds))
            return ret

        try:
            if self.timeout != seconds:
                self.timeout = self._settimeout(seconds)

            if self.armed:
                self._keepalive()
            else:
                self._enable()
                self.armed = True

            ret = self.timeout
            self.arm_timestamp = time.time()
        except (IOError, ValueError) as e:
            log_error("Error: unable to enable wdt due to : {}".format(e))

        return ret

    def disarm(self):
        """
        Disarm the hardware watchdog
        Returns:
            A boolean, True if watchdog is disarmed successfully, False if not
        """
        disarmed = False
        try:
            self._disable()
            self.armed = False
            disarmed = True
        except IOError as e:
            log_error("Error: unable to disable wdt due to : {}".format(e))
        return disarmed

    def is_armed(self):
        """
        Retrieves the armed state of the hardware watchdog.
        Returns:
            A boolean, True if watchdog is armed, False if not
        """
        return self.armed

# ========================== Signal Handling ==========================


def signal_handler(sig, frame):
    """Handle the signals registered in main().

    SIGHUP is logged and ignored. SIGINT and SIGTERM clear
    CPUWDT_MAIN_TASK_RUNNING_FLAG so that the keep-alive loop ends. Any other
    signal is logged as a warning.

    Args:
        sig: number of the received signal.
        frame: current stack frame (unused).
    """
    global CPUWDT_MAIN_TASK_RUNNING_FLAG
    if sig == signal.SIGHUP:
        log_info("Caught SIGHUP - ignoring...")
    elif sig == signal.SIGINT:
        log_info("Caught SIGINT - exiting...")
        CPUWDT_MAIN_TASK_RUNNING_FLAG = False
    elif sig == signal.SIGTERM:
        log_info("Caught SIGTERM - exiting...")
        CPUWDT_MAIN_TASK_RUNNING_FLAG = False
    else:
        # sig is an integer, so it must be formatted rather than concatenated
        # to a string (which raises TypeError).
        log_warning("Caught unhandled signal '{}'".format(sig))

#
# Main =========================================================================
#


def check_cpld_driver():
    """Wait for the CPLD driver's setreg file to appear.

    The file is checked once per second, 60 times. If it never shows up an
    error is printed and the process exits with status 1.
    """
    setreg_file = os.path.join(PLATFORM_CPLD_PATH, SETREG_FILE)

    for _attempt in range(60):
        if os.path.isfile(setreg_file):
            return
        time.sleep(1)

    print("Error: The cpld driver has not been loaded.")
    sys.exit(1)


def _sleep_unless_stopped(seconds):
    """Sleep up to <seconds> seconds, returning early once a stop is requested."""
    for _ in range(seconds):
        if not CPUWDT_MAIN_TASK_RUNNING_FLAG:
            return
        time.sleep(1)


def main():
    """Entry point of the cpu_wdt daemon.

    ``start`` arms the watchdog and keeps feeding it until SIGINT/SIGTERM is
    received, then disarms it. ``stop`` only disarms it. The process exits
    with status 1 if the watchdog cannot be disarmed.
    """
    # Register our signal handlers
    signal.signal(signal.SIGHUP, signal_handler)
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    # Init argument parser
    parser = argparse.ArgumentParser()
    parser.add_argument("action", help="Start/Stop CPU WDT",
                        choices=['start', 'stop'])
    parser.add_argument(
        "--timeout", "-t", help="WDT timeout period", choices=[30, 60, 180], type=int)
    parser.add_argument("--keep_alive", "-k",
                        help="WDT keep alive period", type=int)
    args = parser.parse_args()

    # Check the cpld driver
    check_cpld_driver()

    # Init WDT Class
    watchdog = Watchdog()

    if args.action == 'start':
        log_info('Enable CPU WDT..')

        timeout = args.timeout or DEFAULT_TIMEOUT
        keep_alive = args.keep_alive or DEFAULT_KEEPALIVE
        # A keep-alive period that is not shorter than the timeout (e.g. the
        # default 60 with --timeout 30) would let the watchdog reset the
        # machine before the first keep-alive is sent.
        if keep_alive < 1 or keep_alive >= timeout:
            adjusted = max(timeout // 2, 1)
            log_warning(
                'Keep alive period of {} seconds is not usable with a {} seconds '
                'timeout, using {} seconds instead'.format(keep_alive, timeout, adjusted))
            keep_alive = adjusted

        # Enable
        if watchdog.arm(timeout) == WDT_COMMON_ERROR:
            # Keep going: the keep-alive loop below retries the arm.
            log_error('Failed to enable CPU WDT with {} seconds timeout'.format(timeout))
        else:
            log_info('CPU WDT has been enabled with {} seconds timeout'.format(timeout))

        # Keep Alive
        log_info('Enable keep alive messaging every {} seconds'.format(keep_alive))
        # Feed one second early, as before, but never with a zero-length wait.
        interval = max(keep_alive - 1, 1)
        while CPUWDT_MAIN_TASK_RUNNING_FLAG:
            # Sleep in one-second slices so a stop request is honoured
            # promptly instead of after a full keep-alive period.
            _sleep_unless_stopped(interval)
            if not CPUWDT_MAIN_TASK_RUNNING_FLAG:
                break
            if watchdog.arm(timeout) == WDT_COMMON_ERROR:
                log_error('Failed to send keep alive to CPU WDT')
        log_info('Keep alive messaging has been disabled')

    # Disable
    log_info('Disable CPU WDT..')
    if not watchdog.disarm():
        log_error('Failed to disable CPU WDT')
        sys.exit(1)
    log_info('CPU WDT has been disabled!')


if __name__ == '__main__':
main()
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
[Unit]
Description=CPU WDT
After=platform-modules-haliburton.service
Requires=platform-modules-haliburton.service

[Service]
ExecStart=-/usr/local/bin/cpu_wdt start

[Install]
WantedBy=multi-user.target

0 comments on commit a416f49

Please sign in to comment.