From 907c4d986ccc131e0d483229edf5dfeee80e7966 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen <128072568+can-anyscale@users.noreply.github.com> Date: Mon, 20 Mar 2023 17:27:22 -0700 Subject: [PATCH] [CI] Fix test result not found in client_runner (#33464) * Fix 'Observed wheel commit () is not expected' issue (https://github.com/ray-project/ray/issues/32156) that has been creeping through many of the CI/CD builds in our pipeline. The existing code uses a pipe to read from a rather large file (>50MB). A pipe, however, has a buffer limit, which by default is measured in kilobytes (https://man7.org/linux/man-pages/man7/pipe.7.html), so what we are looking for might not be there. We can fix this by telling unzip the exact file we are looking for. That file is pretty small, so we should not hit the buffer limit. You might notice that other surprises might still happen with this fix (e.g. many files that match ^__commit__). This sanity check goes back 2 years, to our veteran Kai (https://github.com/ray-project/ray/commit/234b015b426274d461a15345a4d4724a08bc5289), and guards against issues with stale artifacts from previous builds or race conditions between builds. Further investigation of how the Buildkite agent multi-tenancy is set up might or might not simplify this logic further. Signed-off-by: Cuong Nguyen * Improve wheel commit validation error message Signed-off-by: Cuong Nguyen * PR 31978 replaced result_output_json and metrics_output_json with fixed values, but did not update client_runner. GCE tests using client_runner are failing with the following error because of that. Simple fix by reusing the fixed global values. 
> [Errno 2] No such file or directory: '/tmp/tmpr33nmui3' > Traceback (most recent call last): > File "/tmp/release-HS2M44AnMX/release/ray_release/command_runner/client_runner.py", line 122, in _fetch_json > with open(path, "rt") as fp: > FileNotFoundError: [Errno 2] No such file or directory: '/tmp/tmpr33nmui3' Signed-off-by: Cuong Nguyen * remove tempfile import, not used Signed-off-by: Cuong Nguyen --------- Signed-off-by: Cuong Nguyen --- release/ray_release/command_runner/client_runner.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/release/ray_release/command_runner/client_runner.py b/release/ray_release/command_runner/client_runner.py index e6aa4dd9107c..ad011fb67655 100644 --- a/release/ray_release/command_runner/client_runner.py +++ b/release/ray_release/command_runner/client_runner.py @@ -3,7 +3,6 @@ import shlex import subprocess import sys -import tempfile import threading import time from collections import deque @@ -51,8 +50,6 @@ def __init__( super(ClientRunner, self).__init__(cluster_manager, file_manager, working_dir) self.last_logs = None - self.result_output_json = tempfile.mktemp() - self.metrics_output_json = tempfile.mktemp() def prepare_remote_env(self): pass @@ -127,10 +124,10 @@ def _fetch_json(self, path: str) -> Dict[str, Any]: ) from e def fetch_results(self) -> Dict[str, Any]: - return self._fetch_json(self.result_output_json) + return self._fetch_json(self._RESULT_OUTPUT_JSON) def fetch_metrics(self) -> Dict[str, Any]: - return self._fetch_json(self.metrics_output_json) + return self._fetch_json(self._METRICS_OUTPUT_JSON) def fetch_artifact(self): raise NotImplementedError