disk_check: Check & mount RO as RW using tmpfs (sonic-net#1569)

What I did: There is a bug that occasionally turns the root overlay read-only (RO). This makes /etc & /home RO, which blocks any new remote user login, as that needs to write into /etc & /home. This tool checks whether /etc & /home (or the given dirs) are in RW or RO state. If RO, it can create a writable overlay using tmpfs. The overlay is transient and stays only until the next reboot; any write made after the overlay is mounted will be lost upon reboot. But it allows new remote users to log in. How I did it: Create upper & work dirs in /run/mount (tmpfs). Mount /etc & /home as lowerdirs and use the same path as the merged mount point. This allows anyone opening a file in /etc or /home to operate on the merged overlay, transparently. How to verify it: Mount any dir on tmpfs (mount -t tmpfs tmpfs test_dir), remount it as RO (mount -o remount,ro test_dir), and pass that dir to this script (disk_check.py -d ./test_dir). Now it should be RW.
renukamanavalan · Apr 26, 2021 · 9dba93f · 9dba93f
1 parent c3963c5
commit 9dba93f
Show file tree

Hide file tree

Showing 3 changed files with 313 additions and 0 deletions.
diff --git a/scripts/disk_check.py b/scripts/disk_check.py
@@ -0,0 +1,151 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+What:
+    There have been cases where the disk turns Read-only due to a kernel bug.
+    In Read-only state, the system blocks new remote user logins via TACACS.
+    This utility checks for that state & makes a transient recovery as needed.
+
+How:
+    Check for Read-Write permission. If Read-only, create a writable overlay using tmpfs.
+
+    By default "/etc" & "/home" are checked and, if in Read-only state, made Read-Write
+    using an overlay on top of tmpfs.
+
+    Making /etc & /home writable lets new remote user logins succeed.
+
+    If in Read-only state, or in Read-Write state with the help of a tmpfs overlay,
+    syslog ERR messages are written, to help raise alerts.
+
+    Monit may be used to invoke it periodically, to help scan & fix and
+    report via syslog.
+
+"""
+
+import argparse
+import os
+import sys
+import syslog
+import subprocess
+
+# Directory passed as "upperdir" to the overlay mount. Its existence is also
+# used by do_mnt()/is_mounted() as the marker that the overlay was set up.
+UPPER_DIR = "/run/mount/upper"
+# Directory passed as "workdir" to the overlay mount.
+WORK_DIR = "/run/mount/work"
+# File scanned by is_mounted() for "overlay_<name>" mount entries.
+MOUNTS_FILE = "/proc/mounts"
+
+def log_err(m):
+    """Report `m` as an error: echo it on stderr and send it to syslog at LOG_ERR."""
+    console_line = "Err: {}".format(m)
+    print(console_line, file=sys.stderr)
+    syslog.syslog(syslog.LOG_ERR, m)
+
+
+def log_info(m):
+    """Report `m` as information: echo it on stdout and send it to syslog at LOG_INFO."""
+    console_line = "Info: {}".format(m)
+    print(console_line)
+    syslog.syslog(syslog.LOG_INFO, m)
+
+
+def log_debug(m):
+    """Report `m` as a debug note: echo it on stdout and send it to syslog at LOG_DEBUG."""
+    console_line = "debug: {}".format(m)
+    print(console_line)
+    syslog.syslog(syslog.LOG_DEBUG, m)
+
+
+def test_writable(dirs):
+    """
+    Tell whether every path in `dirs` is writable by this process.
+
+    Paths are checked in order with os.access(..., os.W_OK). The first path
+    that is not writable is logged as an error and the scan stops there;
+    each writable path is logged at debug level.
+
+    Returns True if all paths are writable (or `dirs` is empty), else False.
+    """
+    for path in dirs:
+        if os.access(path, os.W_OK):
+            log_debug("{} is Read-Write".format(path))
+            continue
+
+        log_err("{} is not read-write".format(path))
+        return False
+    return True
+
+
+def run_cmd(cmd):
+    """
+    Run `cmd` through the shell and return its exit code.
+
+    A zero exit code is logged at info level, a non-zero one at error level.
+    Any captured stdout/stderr text is logged at info level afterwards.
+    """
+    completed = subprocess.run(cmd, shell=True, text=True, capture_output=True)
+    exit_code = completed.returncode
+
+    if not exit_code:
+        log_info("ret={} cmd: {}".format(exit_code, cmd))
+    else:
+        log_err("failed: ret={} cmd={}".format(exit_code, cmd))
+
+    out_text = completed.stdout
+    if out_text:
+        log_info("stdout: {}".format(str(out_text)))
+
+    err_text = completed.stderr
+    if err_text:
+        log_info("stderr: {}".format(str(err_text)))
+
+    return exit_code
+
+
+def get_dname(path_name):
+    """Return the last component of `path_name` after normalisation (so a trailing "/" is ignored)."""
+    normalized = os.path.normpath(path_name)
+    _parent, leaf = os.path.split(normalized)
+    return leaf
+
+
+def do_mnt(dirs):
+    """
+    Mount a writable overlay on top of each directory in `dirs`.
+
+    UPPER_DIR and WORK_DIR are created first; if UPPER_DIR already exists the
+    overlay is assumed to be in place and nothing is done. Each directory is
+    then mounted over itself as an overlay named "overlay_<basename>", using
+    the directory as lowerdir. Mounting stops at the first failing command.
+
+    Returns 0 on success; 1 if already mounted or the helper directories
+    cannot be created; otherwise the exit code of the failing mount command.
+    """
+    if os.path.exists(UPPER_DIR):
+        log_err("Already mounted")
+        return 1
+
+    for i in (UPPER_DIR, WORK_DIR):
+        try:
+            os.mkdir(i)
+        except OSError as error:
+            # Include the OS reason so the syslog alert is actionable.
+            log_err("Failed to create {}, error: {}".format(i, error))
+            return 1
+
+    # Pre-set so that an empty `dirs` cannot leave `ret` unbound below.
+    ret = 0
+    # NOTE(review): every mount is given the same upperdir/workdir. The kernel
+    # overlayfs documentation discourages sharing these between mounts, so a
+    # second directory may fail to mount - confirm on target kernels and
+    # consider per-directory sub-directories.
+    for d in dirs:
+        ret = run_cmd("mount -t overlay overlay_{} -o lowerdir={},"
+        "upperdir={},workdir={} {}".format(
+            get_dname(d), d, UPPER_DIR, WORK_DIR, d))
+        if ret:
+            break
+
+    if ret:
+        log_err("Failed to mount {} as Read-Write".format(dirs))
+    else:
+        log_info("{} are mounted as Read-Write".format(dirs))
+    return ret
+
+
+def is_mounted(dirs):
+    """
+    Tell whether an overlay created by this tool exists for any of `dirs`.
+
+    Returns False straight away when UPPER_DIR does not exist. Otherwise
+    MOUNTS_FILE is scanned and True is returned as soon as the first field of
+    a line equals "overlay_<basename>" for one of the given directories.
+    """
+    if not os.path.exists(UPPER_DIR):
+        return False
+
+    onames = {"overlay_{}".format(get_dname(d)) for d in dirs}
+
+    with open(MOUNTS_FILE, "r") as s:
+        for ln in s:
+            fields = ln.split()
+            if not fields:
+                # Blank line: nothing to match (indexing [0] would raise).
+                continue
+            n = fields[0]
+            if n in onames:
+                log_debug("Mount exists for {}".format(n))
+                return True
+    return False
+
+
+def do_check(skip_mount, dirs):
+    """
+    Check `dirs` for writability and, unless `skip_mount` is set, overlay-mount
+    them when they are not writable.
+
+    Whenever no mount failure occurred and an overlay mount for `dirs` is
+    present (freshly made or pre-existing), a "READ-ONLY" error is logged so
+    the condition keeps being reported.
+
+    Returns 0 when nothing had to be mounted or mounting succeeded, otherwise
+    the non-zero result of do_mnt().
+    """
+    writable = test_writable(dirs)
+    if writable or skip_mount:
+        ret = 0
+    else:
+        ret = do_mnt(dirs)
+
+    if ret:
+        return ret
+
+    # Report the overlay even if the dirs are writable now.
+    if is_mounted(dirs):
+        log_err("READ-ONLY: Mounted {} to make Read-Write".format(dirs))
+    return ret
+
+
+def main():
+    """Parse the command line, run the check and return its result as the exit status."""
+    parser = argparse.ArgumentParser(
+        description="check disk for Read-Write and mount etc & home as Read-Write")
+    parser.add_argument(
+        '-s', "--skip-mount", action='store_true', default=False,
+        help="Skip mounting /etc & /home as Read-Write")
+    parser.add_argument(
+        '-d', "--dirs", default="/etc,/home",
+        help="dirs to mount")
+    opts = parser.parse_args()
+
+    # --dirs is a comma-separated list of directories.
+    dir_list = opts.dirs.split(",")
+    return do_check(opts.skip_mount, dir_list)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/setup.py b/setup.py
@@ -81,6 +81,7 @@
         'scripts/db_migrator.py',
         'scripts/decode-syseeprom',
         'scripts/dropcheck',
+        'scripts/disk_check.py',
         'scripts/dropconfig',
         'scripts/dropstat',
         'scripts/dump_nat_entries.py',

diff --git a/tests/disk_check_test.py b/tests/disk_check_test.py
@@ -0,0 +1,161 @@
+import sys
+import syslog
+from unittest.mock import patch
+import pytest
+
+sys.path.append("scripts")
+import disk_check
+
+# Point the script at a scratch file instead of the real /proc/mounts;
+# mount_file() rewrites it for every test case.
+disk_check.MOUNTS_FILE = "/tmp/proc_mounts"
+
+# Test cases, run in insertion order by TestDiskCheck.test_readonly.
+# Keys used by the test code:
+#   "desc"       - human-readable description (not read by the test code).
+#   "args"       - value patched into sys.argv before calling disk_check.main().
+#   "err"        - optional; expected LOG_ERR messages joined with "|".
+#   "upperdir"   - optional; swapped into disk_check.UPPER_DIR for the run.
+#   "workdir"    - optional; swapped into disk_check.WORK_DIR for the run.
+#   "mounts"     - optional; text written to the fake mounts file (default "").
+#   "cmds"       - optional; expected list of commands given to subprocess.run
+#                  (default: none).
+#   "expect_ret" - optional; expected return value of main() (default 0).
+#   "proc"       - optional; list of per-command result overrides read by
+#                  mock_subproc_run ("ret", "stdout", "stderr").
+test_data = {
+    "0": {
+        "desc": "All good as /tmp is read-write",
+        "args": ["", "-d", "/tmp"],
+        "err": ""
+    },
+    "1": {
+        "desc": "Not good as /tmpx is not read-write; But fix skipped",
+        "args": ["", "-d", "/tmpx", "-s"],
+        "err": "/tmpx is not read-write"
+    },
+    "2": {
+        "desc": "Not good as /tmpx is not read-write; expect mount",
+        "args": ["", "-d", "/tmpx"],
+        "upperdir": "/tmp/tmpx",
+        "workdir": "/tmp/tmpy",
+        "mounts": "overlay_tmpx blahblah",
+        "err": "/tmpx is not read-write|READ-ONLY: Mounted ['/tmpx'] to make Read-Write",
+        "cmds": ['mount -t overlay overlay_tmpx -o lowerdir=/tmpx,upperdir=/tmp/tmpx,workdir=/tmp/tmpy /tmpx']
+    },
+    "3": {
+        "desc": "Not good as /tmpx is not read-write; mount fail as create of upper fails",
+        "args": ["", "-d", "/tmpx"],
+        "upperdir": "/tmpx",
+        "expect_ret": 1
+    },
+    "4": {
+        "desc": "Not good as /tmpx is not read-write; mount fail as upper exist",
+        "args": ["", "-d", "/tmpx"],
+        "upperdir": "/tmp",
+        "err": "/tmpx is not read-write|Already mounted",
+        "expect_ret": 1
+    },
+    "5": {
+        "desc": "/tmp is read-write, but as well mount exists; hence report",
+        "args": ["", "-d", "/tmp"],
+        "upperdir": "/tmp",
+        "mounts": "overlay_tmp blahblah",
+        "err": "READ-ONLY: Mounted ['/tmp'] to make Read-Write"
+    },
+    "6": {
+        "desc": "Test another code path for good case",
+        "args": ["", "-d", "/tmp"],
+        "upperdir": "/tmp"
+    }
+}
+
+# Per-test-case state shared with the mock helpers; reset by init_tc().
+err_data = ""
+cmds = []
+current_tc = None
+
+def mount_file(d):
+    """Overwrite the fake mounts file (disk_check.MOUNTS_FILE) with the text `d`."""
+    with open(disk_check.MOUNTS_FILE, "w") as mounts_fh:
+        mounts_fh.write(d)
+
+
+def report_err_msg(lvl, m):
+    """
+    Stand-in for syslog.syslog: collect LOG_ERR messages into the global
+    `err_data`, separated by "|". Messages at other levels are ignored.
+    """
+    global err_data
+    if lvl != syslog.LOG_ERR:
+        return
+
+    if err_data:
+        err_data = err_data + "|" + m
+    else:
+        err_data = err_data + m
+
+
+class proc:
+    """
+    Minimal stand-in for the object returned by subprocess.run.
+
+    The class-level values are the defaults; a non-empty `proc_upd` dict may
+    override them through its "ret", "stdout" and "stderr" keys.
+    """
+    returncode = 0
+    stdout = None
+    stderr = None
+
+    def __init__(self, proc_upd = None):
+        # Nothing to override: keep the class-level defaults.
+        if not proc_upd:
+            return
+
+        self.returncode = proc_upd.get("ret", 0)
+        self.stdout = proc_upd.get("stdout")
+        self.stderr = proc_upd.get("stderr")
+
+
+def mock_subproc_run(cmd, shell, text, capture_output):
+    """
+    Stand-in for subprocess.run: record `cmd` in the global `cmds` list and
+    return a `proc` result.
+
+    If the current test case has a "proc" list with an entry at the index of
+    this call, that entry overrides the default result; otherwise the default
+    `proc` (return code 0, no output) is returned.
+    """
+    global cmds
+
+    call_index = len(cmds)
+    overrides = current_tc.get("proc", [])
+    upd = overrides[call_index] if len(overrides) > call_index else None
+    cmds.append(cmd)
+
+    return proc(upd)
+
+
+def init_tc(tc):
+    """
+    Reset the shared test state for test case `tc`: clear collected errors and
+    commands, write the case's "mounts" text (default empty) to the fake
+    mounts file, and make `tc` the current test case.
+    """
+    global err_data, cmds, current_tc
+
+    err_data, cmds = "", []
+    mount_file(tc.get("mounts", ""))
+    current_tc = tc
+
+
+def swap_upper(tc):
+    """Exchange tc["upperdir"] with disk_check.UPPER_DIR; a second call swaps them back."""
+    tc["upperdir"], disk_check.UPPER_DIR = disk_check.UPPER_DIR, tc["upperdir"]
+
+
+def swap_work(tc):
+    """
+    Exchange tc["workdir"] with disk_check.WORK_DIR; a second call swaps them
+    back.
+
+    The saved value must go into tc["workdir"]; storing it under "upperdir"
+    would clobber the value swap_upper() needs for its own restore.
+    """
+    tmp_w = tc["workdir"]
+    tc["workdir"] = disk_check.WORK_DIR
+    disk_check.WORK_DIR = tmp_w
+
+
+class TestDiskCheck(object):
+    """Table-driven test of disk_check.main() with syslog and subprocess.run mocked."""
+
+    def setup(self):
+        pass
+
+
+    @patch("disk_check.syslog.syslog")
+    @patch("disk_check.subprocess.run")
+    def test_readonly(self, mock_proc, mock_log):
+        """
+        Run every entry of test_data through disk_check.main() and compare the
+        return value, the collected LOG_ERR messages and the issued commands
+        with the expectations stored in the entry.
+        """
+        global err_data, cmds
+
+        # Route command execution and syslog calls into the recording helpers.
+        mock_proc.side_effect = mock_subproc_run
+        mock_log.side_effect = report_err_msg
+
+        for i, tc in test_data.items():
+            print("-----------Start tc {}---------".format(i))
+            init_tc(tc)
+
+            with patch('sys.argv', tc["args"]):
+                # Install the per-case upper/work dirs into disk_check.
+                if "upperdir" in tc:
+                    swap_upper(tc)
+
+                if "workdir" in tc:
+                    swap_work(tc)
+
+                try:
+                    ret = disk_check.main()
+                finally:
+                    # Swap back even if main() raises, so disk_check's module
+                    # globals do not stay pointed at this case's directories.
+                    if "upperdir" in tc:
+                        # restore
+                        swap_upper(tc)
+
+                    if "workdir" in tc:
+                        # restore
+                        swap_work(tc)
+
+            print("ret = {}".format(ret))
+            print("err_data={}".format(err_data))
+            print("cmds: {}".format(cmds))
+
+            assert ret == tc.get("expect_ret", 0)
+            if  "err" in tc:
+                assert err_data == tc["err"]
+            assert cmds == tc.get("cmds", [])
+            print("-----------End tc {}-----------".format(i))