Executable Monitor Change (#62)

* Use two files descriptors to capture subprocess output. Use golang for the echo server * Added retry logic to the executable monitor, swapping back to python echo server for test * Adding success line check changes. Changing the while loop, printing exit status if we look for it * Some more info about why this is the way it is, and making it so we only print the rest of the log if the run doesn't find an exit condition originally. ---------
FreeRTOS · Jul 11, 2023 · 31f9331 · 31f9331
1 parent 1f59a76
commit 31f9331
Show file tree

Hide file tree

Showing 3 changed files with 122 additions and 69 deletions.
diff --git a/executable-monitor/action.yml b/executable-monitor/action.yml
@@ -14,7 +14,11 @@ inputs:
   timeout-seconds:
     description: 'Maximum amount of time to run the executable. Default is 600.'
     required: false
-    default: 600
+    default: 300
+  retry-attempts:
+    description: 'Number of times to re-launch the binary to check for success.'
+    required: false
+    default: 0
 
 runs:
   using: "composite"
@@ -23,5 +27,6 @@ runs:
         run: pip install -r $GITHUB_ACTION_PATH/requirements.txt
         shell: bash
       - name: Run executable with monitoring script
-        run: python3 $GITHUB_ACTION_PATH/executable-monitor.py --exe-path=${{ inputs.exe-path }} --timeout-seconds=${{ inputs.timeout-seconds }} --success-line="${{ inputs.success-line }}" --log-dir=${{ inputs.log-dir }}
+        run: |
+          python3 $GITHUB_ACTION_PATH/executable-monitor.py --exe-path=${{ inputs.exe-path }} --timeout-seconds=${{ inputs.timeout-seconds }} --success-line="${{ inputs.success-line }}" --log-dir=${{ inputs.log-dir }} --retry-attempts=${{ inputs.retry-attempts }}
         shell: bash
diff --git a/executable-monitor/executable-monitor.py b/executable-monitor/executable-monitor.py
@@ -41,6 +41,10 @@
                         type=int,
                         required=False,
                         help='Exit status that indicates that the executable completed successfully. Required if --success-line is not used.')
+    parser.add_argument('--retry-attempts',
+                        type=int,
+                        required=False,
+                        help='Number of times to attempt re-running the executable if the correct exit condition is not found.')
 
     args = parser.parse_args()
 
@@ -56,6 +60,11 @@
     if not os.path.exists(args.log_dir):
         os.makedirs(args.log_dir, exist_ok = True)
 
+    if not args.retry_attempts:
+        retryAttempts = 0
+    else:
+        retryAttempts = args.retry_attempts
+
     # Convert any relative path (like './') in passed argument to absolute path.
     exe_abs_path = os.path.abspath(args.exe_path)
     log_dir = os.path.abspath(args.log_dir)
@@ -72,76 +81,114 @@
     logging.info(f"Running executable: {exe_abs_path} ")
     logging.info(f"Storing logs in: {log_dir}")
     logging.info(f"Timeout (seconds): {args.timeout_seconds}")
+    logging.info(f"Searching for success line: {args.success_line}")
+    logging.info(f"Will re-try the run {retryAttempts} times")
+    if args.success_exit_status is not None:
+        logging.info("Looking for exit status {0}".format(args.success_exit_status ))
+    for attempts in range(0,retryAttempts + 1):
+
+        # Initialize values
+        success_line = ""
+        timeout_occurred = False
+        exe_exit_status = None
+        exe_exitted = False
+        success_line_found = False
+        exit_condition_met = False
+        wait_for_exit = args.success_exit_status is not None
+
+        # Create two file descriptors. The subprocess writes to one, the parent task reads from the other
+        # This is a workaround to avoid the fact that calling readline() on the stdout of the subprocess is
+        # a blocking call. Where if the subprocess is running but hasn't printed anything, readline will never time out.
+        # The approach uses the underlying file system to not block on data that hasn't been written.
+        WriteOutputFile = open("output.log", "w")
+        ReadOutputFile = open("output.log", "r")
+
+        # Launch the executable
+        exe = subprocess.Popen([exe_abs_path], stdout=WriteOutputFile, stderr=WriteOutputFile, universal_newlines=True, env=os.environ)
 
-    # Initialize values
-    timeout_occurred = False
-    exe_exit_status = None
-    exe_exitted = False
-    success_line_found = False
-    wait_for_exit = args.success_exit_status is not None
-
-    # Launch the executable
-    exe = subprocess.Popen([exe_abs_path], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True)
-
-    cur_time_seconds = time.time()
-    timeout_time_seconds = cur_time_seconds + args.timeout_seconds
-
-    logging.info("START OF DEVICE OUTPUT\n")
-
-    while not (timeout_occurred or exe_exitted or (not wait_for_exit and success_line_found)):
-
-        # Check if executable exitted
-        exe_exit_status = exe.poll()
-        if exe_exit_status is not None:
-            exe_exitted = True
-
-        # Read executable's stdout and write to stdout and logfile
-        exe_stdout_line = exe.stdout.readline()
-        logging.info(exe_stdout_line)
-
-        # Check if the executable printed out it's success line
-        if args.success_line is not None and args.success_line in exe_stdout_line:
-            success_line_found = True
-
-        # Check for timeout
         cur_time_seconds = time.time()
-        if cur_time_seconds >= timeout_time_seconds:
-            timeout_occurred = True
-
-        # Sleep for a short duration between loops to not steal all system resources
-        time.sleep(.1)
-
-    if not exe_exitted:
-        exe.kill()
-
-    # Capture remaining output and check for the successful line
-    for exe_stdout_line in exe.stdout.readlines():
-        logging.info(exe_stdout_line)
-        if args.success_line is not None and args.success_line in exe_stdout_line:
-            success_line_found = True
-
-    logging.info("END OF DEVICE OUTPUT\n")
-
-    logging.info("EXECUTABLE RUN SUMMARY:\n")
+        timeout_time_seconds = cur_time_seconds + args.timeout_seconds
+
+        logging.info("START OF DEVICE OUTPUT\n")
+
+        # While a timeout hasn't happened, the executable is running, and an exit condition has not been met
+        while ( not exit_condition_met ):
+            # Sleep for a short duration between loops to not steal all system resources
+            time.sleep(.05)
+
+            # Check if executable exitted
+            exe_exit_status = exe.poll()
+            if exe_exit_status is not None:
+                logging.info(f"EXECUTABLE CLOSED WITH STATUS: {exe_exit_status}")
+                exe_exitted = True
+                exit_condition_met = True
+
+            # Read executable's stdout and write to stdout and logfile
+            # A potential improvement here would be to do readlines() on the file, then truncate()
+            # This might be cleaner than this approach of reading a single line each loop.
+            exe_stdout_line = ReadOutputFile.readline()
+            if(exe_stdout_line is not None) and (len(exe_stdout_line.strip()) > 1):
+                # Check if the executable printed out its success line
+                if ( args.success_line is not None ) and ( args.success_line in exe_stdout_line ) :
+                    logging.info(f"SUCCESS_LINE_FOUND: {exe_stdout_line}")
+                    success_line_found = True
+                    success_line = exe_stdout_line
+                    if( not wait_for_exit ):
+                        exit_condition_met = True
+                else:
+                    logging.info(exe_stdout_line)
+
+            # Check for timeout
+            cur_time_seconds = time.time()
+            if cur_time_seconds >= timeout_time_seconds:
+                logging.info(f"TIMEOUT OF {args.timeout_seconds} SECONDS HIT")
+                timeout_occurred = True
+                exit_condition_met = True
+
+        if not exe_exitted:
+            logging.info(f"EXECUTABLE DID NOT EXIT, MANUALLY KILLING NOW")
+            exe.kill()
+
+        if not exit_condition_met:
+            logging.info(f"PARSING REST OF LOG")
+            # Capture remaining output and check for the successful line
+            for exe_stdout_line in ReadOutputFile.readlines():
+                logging.info(exe_stdout_line)
+                if args.success_line is not None and args.success_line in exe_stdout_line:
+                    success_line_found = True
+                    success_line = exe_stdout_line
+                    logging.info(f"SUCCESS_LINE_FOUND: {exe_stdout_line}")
+
+        # Close the files
+        WriteOutputFile.close()
+        ReadOutputFile.close()
+
+        logging.info("END OF DEVICE OUTPUT")
+
+        logging.info("EXECUTABLE RUN SUMMARY:\n")
+
+        exit_status = 0
+
+        if args.success_line is not None:
+            if not success_line_found:
+                logging.error("Success Line: Success line not output.\n")
+                exit_status = 1
 
-    exit_status = 0
+        if args.success_exit_status is not None:
+            if exe_exitted:
+                if exe_exit_status != args.success_exit_status:
+                    exit_status = 1
+                logging.info(f"Exit Status: {exe_exit_status}")
+            else:
+                logging.error("Exit Status: Executable did not exit by itself.\n")
+                exit_status = 1
 
-    if args.success_line is not None:
-        if success_line_found:
-            logging.info("Success Line: Found.\n")
-        else:
-            logging.error("Success Line: Success line not output.\n")
-            exit_status = 1
+        if( exit_status == 0 ):
+            logging.info(f"Run found a valid success metric\n")
+            sys.exit(exit_status)
 
-    if args.success_exit_status is not None:
-        if exe_exitted:
-            if exe_exit_status != args.success_exit_status:
-                exit_status = 1
-            logging.info(f"Exit Status: {exe_exit_status}")
-        else:
-            logging.error("Exit Status: Executable did not exit.\n")
-            exe_status = 1
-
+        elif( attempts < retryAttempts ):
+            logging.info(f"Did not succeed, trying re-attempt {attempts+1} of {retryAttempts}\n")
 
-    # Report if executable executed successfully to workflow
+    # Report final exit status if no successful run occured
     sys.exit(exit_status)
diff --git a/localhost-echo-server/local_echo_server.py b/localhost-echo-server/local_echo_server.py
@@ -24,7 +24,8 @@ async def echo_handler(reader, writer):
   args = parser.parse_args()
   logging.basicConfig()
   logger.setLevel(logging.DEBUG)
-  loop = asyncio.get_event_loop()
+  loop = asyncio.new_event_loop()
+  asyncio.set_event_loop(loop)
   factory = asyncio.start_server(
     echo_handler,
     os.environ.get('HOST'),