From d5b12578db8b1fcfd9aa52c563a33fb4ffdcff03 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Thu, 30 Mar 2023 10:28:23 -0700 Subject: [PATCH 001/104] Re-add glue logic for JobManager Signed-off-by: Cuong Nguyen --- release/ray_release/glue.py | 1 + 1 file changed, 1 insertion(+) diff --git a/release/ray_release/glue.py b/release/ray_release/glue.py index 46ef68f45ce5..4c058b66c2c5 100644 --- a/release/ray_release/glue.py +++ b/release/ray_release/glue.py @@ -55,6 +55,7 @@ type_str_to_command_runner = { "command": SDKRunner, "sdk_command": SDKRunner, + "job": JobRunner, "anyscale_job": AnyscaleJobRunner, } From cc6db98b7934b15ef3496a138f8ad9064f4abfe7 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen <128072568+can-anyscale@users.noreply.github.com> Date: Thu, 30 Mar 2023 10:26:12 -0700 Subject: [PATCH 002/104] [CI] clean up things Clean up client runner --- release/ray_release/glue.py | 1 - 1 file changed, 1 deletion(-) diff --git a/release/ray_release/glue.py b/release/ray_release/glue.py index 4c058b66c2c5..46ef68f45ce5 100644 --- a/release/ray_release/glue.py +++ b/release/ray_release/glue.py @@ -55,7 +55,6 @@ type_str_to_command_runner = { "command": SDKRunner, "sdk_command": SDKRunner, - "job": JobRunner, "anyscale_job": AnyscaleJobRunner, } From 768466b67ade32e9f8812273889a11d72b76e599 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Fri, 31 Mar 2023 14:58:28 -0700 Subject: [PATCH 003/104] Add back the job run type. It is still used in the CLI, so don't deprecate it yet. Signed-off-by: Cuong Nguyen --- release/ray_release/glue.py | 1 + 1 file changed, 1 insertion(+) diff --git a/release/ray_release/glue.py b/release/ray_release/glue.py index 46ef68f45ce5..4c058b66c2c5 100644 --- a/release/ray_release/glue.py +++ b/release/ray_release/glue.py @@ -55,6 +55,7 @@ type_str_to_command_runner = { "command": SDKRunner, "sdk_command": SDKRunner, + "job": JobRunner, "anyscale_job": AnyscaleJobRunner, } From 57f499aac2ce187ff0a4a61247128f9859b1ec6d Mon Sep 17 00:00:00 2001 From: Cuong Nguyen <128072568+can-anyscale@users.noreply.github.com> Date: Thu, 30 Mar 2023 10:26:12 -0700 Subject: [PATCH 004/104] [CI] clean up things Clean up client runner --- release/ray_release/glue.py | 1 - 1 file changed, 1 deletion(-) diff --git a/release/ray_release/glue.py b/release/ray_release/glue.py index 4c058b66c2c5..46ef68f45ce5 100644 --- a/release/ray_release/glue.py +++ b/release/ray_release/glue.py @@ -55,7 +55,6 @@ type_str_to_command_runner = { "command": SDKRunner, "sdk_command": SDKRunner, - "job": JobRunner, "anyscale_job": AnyscaleJobRunner, } From 12feefe8108ce1969fe322ce0702f86e999b40a7 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Wed, 5 Apr 2023 09:11:54 -0700 Subject: [PATCH 005/104] Rebase Signed-off-by: Cuong Nguyen --- release/ray_release/glue.py | 1 + 1 file changed, 1 insertion(+) diff --git a/release/ray_release/glue.py b/release/ray_release/glue.py index 46ef68f45ce5..4c058b66c2c5 100644 --- a/release/ray_release/glue.py +++ b/release/ray_release/glue.py @@ -55,6 +55,7 @@ type_str_to_command_runner = { "command": SDKRunner, "sdk_command": SDKRunner, + "job": JobRunner, "anyscale_job": AnyscaleJobRunner, } From 1ccc73e67bf1d6db9dd438eb7fb1d19b09aa2ce0 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen <128072568+can-anyscale@users.noreply.github.com> Date: Thu, 30 Mar 2023 10:26:12 -0700 Subject: [PATCH 006/104] [CI] clean up things Clean up client runner --- release/ray_release/glue.py | 1 - 1 file changed, 1 deletion(-) diff --git a/release/ray_release/glue.py b/release/ray_release/glue.py index 
4c058b66c2c5..46ef68f45ce5 100644 --- a/release/ray_release/glue.py +++ b/release/ray_release/glue.py @@ -55,7 +55,6 @@ type_str_to_command_runner = { "command": SDKRunner, "sdk_command": SDKRunner, - "job": JobRunner, "anyscale_job": AnyscaleJobRunner, } From 0e9f29cdde05635d969d6cbfc80253e202e5c136 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Mon, 3 Apr 2023 13:24:38 -0700 Subject: [PATCH 007/104] Add functions that are free from exceptions Signed-off-by: Cuong Nguyen --- release/ray_release/command_runner/command_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/release/ray_release/command_runner/command_runner.py b/release/ray_release/command_runner/command_runner.py index f82f029bb178..7f632292cb07 100644 --- a/release/ray_release/command_runner/command_runner.py +++ b/release/ray_release/command_runner/command_runner.py @@ -129,7 +129,7 @@ def get_last_logs(self) -> Optional[str]: def get_last_logs_ex(self): raise NotImplementedError - def fetch_results(self) -> Dict[str, Any]: + def fetch_results_ex(self) -> Dict[str, Any]: raise NotImplementedError def fetch_metrics(self) -> Dict[str, Any]: From 708b80f6698726deb819aa8f89eee08cbbc5fffe Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Mon, 10 Apr 2023 16:05:46 -0700 Subject: [PATCH 008/104] Undo changes to fetch_results Signed-off-by: Cuong Nguyen --- release/ray_release/command_runner/command_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/release/ray_release/command_runner/command_runner.py b/release/ray_release/command_runner/command_runner.py index 7f632292cb07..f82f029bb178 100644 --- a/release/ray_release/command_runner/command_runner.py +++ b/release/ray_release/command_runner/command_runner.py @@ -129,7 +129,7 @@ def get_last_logs(self) -> Optional[str]: def get_last_logs_ex(self): raise NotImplementedError - def fetch_results_ex(self) -> Dict[str, Any]: + def fetch_results(self) -> Dict[str, Any]: raise NotImplementedError def fetch_metrics(self) -> Dict[str, Any]: From 6baff80eebbadc4333fe909ac3ef2dbe21c23e9c Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Tue, 4 Apr 2023 10:37:11 -0700 Subject: [PATCH 009/104] Auto-retry for infrastructure errors Signed-off-by: Cuong Nguyen --- release/ray_release/glue.py | 447 ++++++++++++++++------------------ release/ray_release/result.py | 12 + 2 files changed, 225 insertions(+), 234 deletions(-) diff --git a/release/ray_release/glue.py b/release/ray_release/glue.py index 46ef68f45ce5..50494aeb5cf0 100644 --- a/release/ray_release/glue.py +++ b/release/ray_release/glue.py @@ -6,11 +6,9 @@ from ray_release.alerts.handle import handle_result, require_result from ray_release.anyscale_util import get_cluster_name from ray_release.buildkite.output import buildkite_group, buildkite_open_last -from ray_release.cluster_manager.cluster_manager import ClusterManager from ray_release.cluster_manager.full import FullClusterManager from ray_release.cluster_manager.minimal import MinimalClusterManager from ray_release.command_runner.job_runner import JobRunner -from ray_release.command_runner.command_runner import CommandRunner from ray_release.command_runner.anyscale_job_runner import AnyscaleJobRunner from ray_release.command_runner.sdk_runner import SDKRunner from ray_release.config import ( @@ -92,33 +90,43 @@ def _get_extra_tags_from_env() -> dict: return {key.lower(): os.getenv(key, "") for key in env_vars} -def _load_test_configuration( +def run_release_test( test: Test, anyscale_project: str, result: Result, ray_wheels_url: str, 
+ reporters: Optional[List[Reporter]] = None, smoke_test: bool = False, + cluster_id: Optional[str] = None, + cluster_env_id: Optional[str] = None, no_terminate: bool = False, -) -> Tuple[ClusterManager, CommandRunner, str]: +) -> Result: + buildkite_group(":spiral_note_pad: Loading test configuration") + validate_test(test) + logger.info(f"Test config: {test}") - # Populate result paramaters result.wheels_url = ray_wheels_url result.stable = test.get("stable", True) result.smoke_test = smoke_test + buildkite_url = os.getenv("BUILDKITE_BUILD_URL", "") buildkite_job_id = os.getenv("BUILDKITE_JOB_ID", "") + if buildkite_url: buildkite_url += "#" + buildkite_job_id + result.buildkite_url = buildkite_url result.buildkite_job_id = buildkite_job_id - # Setting up working directory working_dir = test["working_dir"] + + old_wd = os.getcwd() new_wd = os.path.join(RELEASE_PACKAGE_DIR, working_dir) os.chdir(new_wd) + start_time = time.monotonic() run_type = test["run"].get("type", DEFAULT_RUN_TYPE) # Workaround while Anyscale Jobs don't support leaving cluster alive @@ -153,6 +161,7 @@ def _load_test_configuration( logger.info(f"Got command runner cls: {command_runner_cls}") logger.info(f"Got file manager cls: {file_manager_cls}") + # Extra tags to be set on resources on cloud provider's side extra_tags = _get_extra_tags_from_env() # We don't need other attributes as they can be derived from the name @@ -176,267 +185,230 @@ def _load_test_configuration( except Exception as e: raise ReleaseTestSetupError(f"Error setting up release test: {e}") from e - return cluster_manager, command_runner, artifact_path + pipeline_exception = None + # non critical for some tests. So separate it from the general one. + fetch_result_exception = None + try: + setup_signal_handling() + # Load configs + cluster_env = load_test_cluster_env(test, ray_wheels_url=ray_wheels_url) + cluster_compute = load_test_cluster_compute(test) + if cluster_env_id: + try: + cluster_manager.cluster_env_id = cluster_env_id + cluster_manager.build_cluster_env() + cluster_manager.fetch_build_info() + logger.info( + "Using overridden cluster environment with ID " + f"{cluster_env_id} and build ID " + f"{cluster_manager.cluster_env_build_id}" + ) + except Exception as e: + raise ClusterEnvCreateError( + f"Could not get existing overridden cluster environment " + f"{cluster_env_id}: {e}" + ) from e + else: + cluster_manager.set_cluster_env(cluster_env) -def _setup_cluster_environment( - test: Test, - result: Result, - cluster_manager: ClusterManager, - ray_wheels_url: str, - cluster_env_id: Optional[str], -) -> Tuple[str, int, int, int, int]: - setup_signal_handling() - # Load configs - cluster_env = load_test_cluster_env(test, ray_wheels_url=ray_wheels_url) - cluster_compute = load_test_cluster_compute(test) - - if cluster_env_id: - try: - cluster_manager.cluster_env_id = cluster_env_id - cluster_manager.build_cluster_env() - cluster_manager.fetch_build_info() - logger.info( - "Using overridden cluster environment with ID " - f"{cluster_env_id} and build ID " - f"{cluster_manager.cluster_env_build_id}" - ) - except Exception as e: - raise ClusterEnvCreateError( - f"Could not get existing overridden cluster environment " - f"{cluster_env_id}: {e}" - ) from e - else: - cluster_manager.set_cluster_env(cluster_env) + # Load some timeouts + build_timeout = int(test["run"].get("build_timeout", DEFAULT_BUILD_TIMEOUT)) + command_timeout = int(test["run"].get("timeout", DEFAULT_COMMAND_TIMEOUT)) + cluster_timeout = int( + 
test["run"].get("session_timeout", DEFAULT_CLUSTER_TIMEOUT) + ) - # Load some timeouts - build_timeout = int(test["run"].get("build_timeout", DEFAULT_BUILD_TIMEOUT)) - command_timeout = int(test["run"].get("timeout", DEFAULT_COMMAND_TIMEOUT)) - cluster_timeout = int(test["run"].get("session_timeout", DEFAULT_CLUSTER_TIMEOUT)) + # Get prepare command timeout, if any + prepare_cmd = test["run"].get("prepare", None) + if prepare_cmd: + prepare_timeout = test["run"].get("prepare_timeout", command_timeout) + else: + prepare_timeout = 0 - # Get prepare command timeout, if any - prepare_cmd = test["run"].get("prepare", None) - if prepare_cmd: - prepare_timeout = test["run"].get("prepare_timeout", command_timeout) - else: - prepare_timeout = 0 + # Base maximum uptime on the combined command and prepare timeouts + command_and_prepare_timeout = command_timeout + prepare_timeout - # Base maximum uptime on the combined command and prepare timeouts - command_and_prepare_timeout = command_timeout + prepare_timeout + # Use default timeout = 0 here if wait_for_nodes is empty. This is to make + # sure we don't inflate the maximum_uptime_minutes too much if we don't wait + # for nodes at all. + # The actual default will be otherwise loaded further down. + wait_timeout = int(test["run"].get("wait_for_nodes", {}).get("timeout", 0)) - # Use default timeout = 0 here if wait_for_nodes is empty. This is to make - # sure we don't inflate the maximum_uptime_minutes too much if we don't wait - # for nodes at all. - # The actual default will be otherwise loaded further down. - wait_timeout = int(test["run"].get("wait_for_nodes", {}).get("timeout", 0)) + autosuspend_mins = test["cluster"].get("autosuspend_mins", None) + if autosuspend_mins: + cluster_manager.autosuspend_minutes = autosuspend_mins + autosuspend_base = autosuspend_mins + else: + cluster_manager.autosuspend_minutes = min( + DEFAULT_AUTOSUSPEND_MINS, + int(command_and_prepare_timeout / 60) + TIMEOUT_BUFFER_MINUTES, + ) + # Maximum uptime should be based on the command timeout, not the + # DEFAULT_AUTOSUSPEND_MINS + autosuspend_base = ( + int(command_and_prepare_timeout / 60) + TIMEOUT_BUFFER_MINUTES + ) - autosuspend_mins = test["cluster"].get("autosuspend_mins", None) - if autosuspend_mins: - cluster_manager.autosuspend_minutes = autosuspend_mins - autosuspend_base = autosuspend_mins - else: - cluster_manager.autosuspend_minutes = min( - DEFAULT_AUTOSUSPEND_MINS, - int(command_and_prepare_timeout / 60) + TIMEOUT_BUFFER_MINUTES, - ) - # Maximum uptime should be based on the command timeout, not the - # DEFAULT_AUTOSUSPEND_MINS - autosuspend_base = ( - int(command_and_prepare_timeout / 60) + TIMEOUT_BUFFER_MINUTES - ) + maximum_uptime_minutes = test["cluster"].get("maximum_uptime_minutes", None) + if maximum_uptime_minutes: + cluster_manager.maximum_uptime_minutes = maximum_uptime_minutes + else: + cluster_manager.maximum_uptime_minutes = ( + autosuspend_base + wait_timeout + TIMEOUT_BUFFER_MINUTES + ) - maximum_uptime_minutes = test["cluster"].get("maximum_uptime_minutes", None) - if maximum_uptime_minutes: - cluster_manager.maximum_uptime_minutes = maximum_uptime_minutes - else: - cluster_manager.maximum_uptime_minutes = ( - autosuspend_base + wait_timeout + TIMEOUT_BUFFER_MINUTES + # Set cluster compute here. Note that this may use timeouts provided + # above. + cluster_manager.set_cluster_compute( + cluster_compute, + extra_tags=extra_tags, ) - # Set cluster compute here. Note that this may use timeouts provided - # above. 
- cluster_manager.set_cluster_compute( - cluster_compute, - extra_tags=result.extra_tags, - ) + buildkite_group(":nut_and_bolt: Setting up local environment") + driver_setup_script = test.get("driver_setup", None) + if driver_setup_script: + try: + run_bash_script(driver_setup_script) + except Exception as e: + raise LocalEnvSetupError(f"Driver setup script failed: {e}") from e - return prepare_cmd, prepare_timeout, build_timeout, cluster_timeout, command_timeout + # Install local dependencies + command_runner.prepare_local_env(ray_wheels_url) + # Re-install anyscale package as local dependencies might have changed + # from local env setup + reinstall_anyscale_dependencies() -def _setup_local_environment( - test: Test, - command_runner: CommandRunner, - ray_wheels_url: str, -) -> None: - driver_setup_script = test.get("driver_setup", None) - if driver_setup_script: - try: - run_bash_script(driver_setup_script) - except Exception as e: - raise LocalEnvSetupError(f"Driver setup script failed: {e}") from e + # Print installed pip packages + buildkite_group(":bulb: Local environment information") + pip_packages = get_pip_packages() + pip_package_string = "\n".join(pip_packages) + logger.info(f"Installed python packages:\n{pip_package_string}") - # Install local dependencies - command_runner.prepare_local_env(ray_wheels_url) + if isinstance(cluster_manager, FullClusterManager): + if not no_terminate: + register_handler( + lambda sig, frame: cluster_manager.terminate_cluster(wait=True) + ) + + # Start cluster + if cluster_id: + buildkite_group(":rocket: Using existing cluster") + # Re-use existing cluster ID for development + cluster_manager.cluster_id = cluster_id + cluster_manager.cluster_name = get_cluster_name(cluster_id) + else: + buildkite_group(":gear: Building cluster environment") - # Re-install anyscale package as local dependencies might have changed - # from local env setup - reinstall_anyscale_dependencies() + if cluster_env_id: + cluster_manager.cluster_env_id = cluster_env_id + cluster_manager.build_configs(timeout=build_timeout) -def _local_environment_information( - result: Result, - cluster_manager: ClusterManager, - command_runner: CommandRunner, - build_timeout: int, - cluster_timeout: int, - no_terminate: bool, - cluster_id: Optional[str], - cluster_env_id: Optional[str], -) -> None: - pip_packages = get_pip_packages() - pip_package_string = "\n".join(pip_packages) - logger.info(f"Installed python packages:\n{pip_package_string}") - - if isinstance(cluster_manager, FullClusterManager): - if not no_terminate: - register_handler( - lambda sig, frame: cluster_manager.terminate_cluster(wait=True) - ) + if isinstance(cluster_manager, FullClusterManager): + buildkite_group(":rocket: Starting up cluster") + cluster_manager.start_cluster(timeout=cluster_timeout) + elif isinstance(command_runner, AnyscaleJobRunner): + command_runner.job_manager.cluster_startup_timeout = cluster_timeout - # Start cluster - if cluster_id: - buildkite_group(":rocket: Using existing cluster") - # Re-use existing cluster ID for development - cluster_manager.cluster_id = cluster_id - cluster_manager.cluster_name = get_cluster_name(cluster_id) - else: - buildkite_group(":gear: Building cluster environment") + result.cluster_url = cluster_manager.get_cluster_url() + result.cluster_id = cluster_manager.cluster_id - if cluster_env_id: - cluster_manager.cluster_env_id = cluster_env_id + # Upload files + buildkite_group(":wrench: Preparing remote environment") + command_runner.prepare_remote_env() - 
cluster_manager.build_configs(timeout=build_timeout) + wait_for_nodes = test["run"].get("wait_for_nodes", None) - if isinstance(cluster_manager, FullClusterManager): - buildkite_group(":rocket: Starting up cluster") - cluster_manager.start_cluster(timeout=cluster_timeout) - elif isinstance(command_runner, AnyscaleJobRunner): - command_runner.job_manager.cluster_startup_timeout = cluster_timeout + if wait_for_nodes: + buildkite_group(":stopwatch: Waiting for nodes to come up") + # Overwrite wait_timeout from above to account for better default + wait_timeout = int( + wait_for_nodes.get("timeout", DEFAULT_WAIT_FOR_NODES_TIMEOUT) + ) + num_nodes = test["run"]["wait_for_nodes"]["num_nodes"] + command_runner.wait_for_nodes(num_nodes, wait_timeout) - result.cluster_url = cluster_manager.get_cluster_url() - result.cluster_id = cluster_manager.cluster_id + if prepare_cmd: + try: + command_runner.run_prepare_command(prepare_cmd, timeout=prepare_timeout) + except CommandError as e: + raise PrepareCommandError(e) + except CommandTimeout as e: + raise PrepareCommandTimeout(e) + buildkite_group(":runner: Running test script") + command = test["run"]["script"] + command_env = {} -def _prepare_remote_environment( - test: Test, - command_runner: CommandRunner, - prepare_cmd: bool, - prepare_timeout: int, -) -> None: - command_runner.prepare_remote_env() - - wait_for_nodes = test["run"].get("wait_for_nodes", None) - - if wait_for_nodes: - buildkite_group(":stopwatch: Waiting for nodes to come up") - # Overwrite wait_timeout from above to account for better default - wait_timeout = int( - wait_for_nodes.get("timeout", DEFAULT_WAIT_FOR_NODES_TIMEOUT) - ) - num_nodes = test["run"]["wait_for_nodes"]["num_nodes"] - command_runner.wait_for_nodes(num_nodes, wait_timeout) + if smoke_test: + command = f"{command} --smoke-test" + command_env["IS_SMOKE_TEST"] = "1" + + is_long_running = test["run"].get("long_running", False) + + start_time_unix = time.time() - if prepare_cmd: try: - command_runner.run_prepare_command(prepare_cmd, timeout=prepare_timeout) + command_runner.run_command( + command, + env=command_env, + timeout=command_timeout, + raise_on_timeout=not is_long_running, + ) + except ( + TestCommandError, + PrepareCommandError, + TestCommandTimeout, + PrepareCommandTimeout, + ) as e: + raise e except CommandError as e: - raise PrepareCommandError(e) + raise TestCommandError(e) except CommandTimeout as e: - raise PrepareCommandTimeout(e) - + if not is_long_running: + # Only raise error if command is not long running + raise TestCommandTimeout(e) -def _running_test_script( - test: Test, - smoke_test: bool, - command_runner: CommandRunner, - command_timeout: int, -) -> None: - command = test["run"]["script"] - command_env = {} - - if smoke_test: - command = f"{command} --smoke-test" - command_env["IS_SMOKE_TEST"] = "1" + buildkite_group(":floppy_disk: Fetching results") + try: + command_results = command_runner.fetch_results() + except Exception as e: + logger.exception(f"Could not fetch results for test command: {e}") + command_results = {} + fetch_result_exception = e - is_long_running = test["run"].get("long_running", False) + if artifact_path: + try: + command_runner.fetch_artifact() + except Exception as e: + logger.error("Could not fetch artifact for test command") + logger.exception(e) - try: - command_runner.run_command( - command, - env=command_env, - timeout=command_timeout, - raise_on_timeout=not is_long_running, - ) - except ( - TestCommandError, - PrepareCommandError, - TestCommandTimeout, - 
PrepareCommandTimeout, - ) as e: - raise e - except CommandError as e: - raise TestCommandError(e) - except CommandTimeout as e: - if not is_long_running: - # Only raise error if command is not long running - raise TestCommandTimeout(e) - - -def _fetching_results( - result: Result, - command_runner: CommandRunner, - artifact_path: Optional[str], - smoke_test: bool, - start_time_unix: int, -) -> Tuple[dict, Exception]: - fetch_result_exception = None - try: - command_results = command_runner.fetch_results() - except Exception as e: - logger.exception(f"Could not fetch results for test command: {e}") - command_results = {} - fetch_result_exception = e + # Postprocess result: + if "last_update" in command_results: + command_results["last_update_diff"] = time.time() - command_results.get( + "last_update", 0.0 + ) - if artifact_path: try: - command_runner.fetch_artifact() + # Logic duplicated in ray_release/command_runner/_anyscale_job_wrapper.py + # Timeout is the time the test took divided by 200 + # (~7 minutes for a 24h test) but no less than 30s + # and no more than 900s + metrics_timeout = max(30, min((time.time() - start_time_unix) / 200, 900)) + command_runner.save_metrics(start_time_unix, timeout=metrics_timeout) + metrics = command_runner.fetch_metrics() except Exception as e: - logger.error("Could not fetch artifact for test command") - logger.exception(e) - - # Postprocess result: - if "last_update" in command_results: - command_results["last_update_diff"] = time.time() - command_results.get( - "last_update", 0.0 - ) - - try: - # Logic duplicated in ray_release/command_runner/_anyscale_job_wrapper.py - # Timeout is the time the test took divided by 200 - # (~7 minutes for a 24h test) but no less than 30s - # and no more than 900s - metrics_timeout = max(30, min((time.time() - start_time_unix) / 200, 900)) - command_runner.save_metrics(start_time_unix, timeout=metrics_timeout) - metrics = command_runner.fetch_metrics() - except Exception as e: - logger.exception(f"Could not fetch metrics for test command: {e}") - metrics = {} + logger.exception(f"Could not fetch metrics for test command: {e}") + metrics = {} - if smoke_test: - command_results["smoke_test"] = True + if smoke_test: + command_results["smoke_test"] = True - result.results = command_results - result.status = "finished" + result.results = command_results + result.status = "finished" return metrics, fetch_result_exception @@ -544,7 +516,10 @@ def run_release_test( if not no_terminate and cluster_manager: buildkite_group(":earth_africa: Terminating cluster") - cluster_manager.terminate_cluster(wait=False) + try: + cluster_manager.terminate_cluster(wait=False) + except Exception as e: + logger.exception(f"Could not terminate cluster: {e}") if hasattr(command_runner, "cleanup"): command_runner.cleanup() @@ -586,8 +561,12 @@ def run_release_test( result.last_logs = traceback.format_exc() buildkite_group(":memo: Reporting results", open=True) - for reporter in reporters or []: - reporter.report_result(test, result) + reporters = reporters or [] + for reporter in reporters: + try: + reporter.report_result(test, result) + except Exception as e: + logger.exception(f"Error reporting results via {type(reporter)}: {e}") if pipeline_exception: raise pipeline_exception diff --git a/release/ray_release/result.py b/release/ray_release/result.py index ed476cf0e734..4c2667b892ff 100644 --- a/release/ray_release/result.py +++ b/release/ray_release/result.py @@ -79,6 +79,18 @@ class ExitCode(enum.Enum): COMMAND_TIMEOUT = 42 PREPARE_ERROR = 43 
+class BuildkiteExitCode(enum.Enum): + """ + Final exit code the test runner passes to buildkite-agent. This exit code is used + to determine job policies, such as automatic retries + """ + SUCCESS = 0 + UNKNOWN = 1 + TRANSIENT_INFRA_ERROR = 10 + INFRA_ERROR = 11 + INFRA_TIMEOUT = 30 + ERROR = 40 + TIMEOUT = 42 def _is_transient_error(result_status: ResultStatus, runtime: int) -> bool: """ From 9103124a88e2d5c54660af84de9ff8f043d695f2 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Tue, 4 Apr 2023 10:59:29 -0700 Subject: [PATCH 010/104] Exit buildkite job using buildkite return code Signed-off-by: Cuong Nguyen --- release/ray_release/buildkite/step.py | 11 +++++++++++ release/ray_release/glue.py | 3 --- release/ray_release/result.py | 1 + release/ray_release/scripts/run_release_test.py | 4 ++-- 4 files changed, 14 insertions(+), 5 deletions(-) diff --git a/release/ray_release/buildkite/step.py b/release/ray_release/buildkite/step.py index a13bde1575d8..9fcae90beebd 100644 --- a/release/ray_release/buildkite/step.py +++ b/release/ray_release/buildkite/step.py @@ -15,6 +15,7 @@ from ray_release.env import DEFAULT_ENVIRONMENT, load_environment from ray_release.template import get_test_env_var from ray_release.util import python_version_str, DeferredEnvVar +from ray_release.result import BuildkiteExitCode DEFAULT_ARTIFACTS_DIR_HOST = "/tmp/ray_release_test_artifacts" @@ -121,6 +122,16 @@ def get_step( if test.get("run", {}).get("type") == "client": step["agents"]["queue"] = str(RELEASE_QUEUE_CLIENT) + # Auto-retry on transient infra error (according to result.BuildkiteExitCode) + step["retry"] = { + "automatic": [ + { + "exit_status": BuildkiteExitCode.TRANSIENT_INFRA_ERROR, + "limit": 2, + } + ] + } + # If a test is not stable, allow to soft fail stable = test.get("stable", True) if not stable: diff --git a/release/ray_release/glue.py b/release/ray_release/glue.py index 50494aeb5cf0..339ad99c4e68 100644 --- a/release/ray_release/glue.py +++ b/release/ray_release/glue.py @@ -568,7 +568,4 @@ def run_release_test( except Exception as e: logger.exception(f"Error reporting results via {type(reporter)}: {e}") - if pipeline_exception: - raise pipeline_exception - return result diff --git a/release/ray_release/result.py b/release/ray_release/result.py index 4c2667b892ff..7c8397d29a09 100644 --- a/release/ray_release/result.py +++ b/release/ray_release/result.py @@ -31,6 +31,7 @@ class Result: stable: bool = True smoke_test: bool = False + buildkite_return_code: BuildkiteExitCode.SUCCESS buildkite_url: Optional[str] = None wheels_url: Optional[str] = None cluster_url: Optional[str] = None diff --git a/release/ray_release/scripts/run_release_test.py b/release/ray_release/scripts/run_release_test.py index 449dee26557d..c7de484b382a 100644 --- a/release/ray_release/scripts/run_release_test.py +++ b/release/ray_release/scripts/run_release_test.py @@ -166,9 +166,9 @@ def main( return_code = e.exit_code.value logger.info( f"Release test pipeline for test {test['name']} completed. 
" - f"Returning with exit code = {return_code}" + f"Returning with exit code = {result.return_code}" ) - sys.exit(return_code) + sys.exit(result.buildkite_return_code) if __name__ == "__main__": From 88317be744ecd8436a275bd6436b7771c48197f0 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Tue, 4 Apr 2023 11:16:38 -0700 Subject: [PATCH 011/104] Handle everything through result exceptions Signed-off-by: Cuong Nguyen --- release/ray_release/glue.py | 447 +++++++++++++++++++----------------- 1 file changed, 234 insertions(+), 213 deletions(-) diff --git a/release/ray_release/glue.py b/release/ray_release/glue.py index 339ad99c4e68..7232e9b81273 100644 --- a/release/ray_release/glue.py +++ b/release/ray_release/glue.py @@ -6,9 +6,11 @@ from ray_release.alerts.handle import handle_result, require_result from ray_release.anyscale_util import get_cluster_name from ray_release.buildkite.output import buildkite_group, buildkite_open_last +from ray_release.cluster_manager.cluster_manager import ClusterManager from ray_release.cluster_manager.full import FullClusterManager from ray_release.cluster_manager.minimal import MinimalClusterManager from ray_release.command_runner.job_runner import JobRunner +from ray_release.command_runner.command_runner import CommandRunner from ray_release.command_runner.anyscale_job_runner import AnyscaleJobRunner from ray_release.command_runner.sdk_runner import SDKRunner from ray_release.config import ( @@ -90,43 +92,33 @@ def _get_extra_tags_from_env() -> dict: return {key.lower(): os.getenv(key, "") for key in env_vars} -def run_release_test( +def _load_test_configuration( test: Test, anyscale_project: str, result: Result, ray_wheels_url: str, - reporters: Optional[List[Reporter]] = None, smoke_test: bool = False, - cluster_id: Optional[str] = None, - cluster_env_id: Optional[str] = None, no_terminate: bool = False, -) -> Result: - buildkite_group(":spiral_note_pad: Loading test configuration") - +) -> Tuple[ClusterManager, CommandRunner, str]: validate_test(test) - logger.info(f"Test config: {test}") + # Populate result paramaters result.wheels_url = ray_wheels_url result.stable = test.get("stable", True) result.smoke_test = smoke_test - buildkite_url = os.getenv("BUILDKITE_BUILD_URL", "") buildkite_job_id = os.getenv("BUILDKITE_JOB_ID", "") - if buildkite_url: buildkite_url += "#" + buildkite_job_id - result.buildkite_url = buildkite_url result.buildkite_job_id = buildkite_job_id + # Setting up working directory working_dir = test["working_dir"] - - old_wd = os.getcwd() new_wd = os.path.join(RELEASE_PACKAGE_DIR, working_dir) os.chdir(new_wd) - start_time = time.monotonic() run_type = test["run"].get("type", DEFAULT_RUN_TYPE) # Workaround while Anyscale Jobs don't support leaving cluster alive @@ -161,7 +153,6 @@ def run_release_test( logger.info(f"Got command runner cls: {command_runner_cls}") logger.info(f"Got file manager cls: {file_manager_cls}") - # Extra tags to be set on resources on cloud provider's side extra_tags = _get_extra_tags_from_env() # We don't need other attributes as they can be derived from the name @@ -185,230 +176,267 @@ def run_release_test( except Exception as e: raise ReleaseTestSetupError(f"Error setting up release test: {e}") from e - pipeline_exception = None - # non critical for some tests. So separate it from the general one. 
- fetch_result_exception = None - try: - setup_signal_handling() - # Load configs - cluster_env = load_test_cluster_env(test, ray_wheels_url=ray_wheels_url) - cluster_compute = load_test_cluster_compute(test) - - if cluster_env_id: - try: - cluster_manager.cluster_env_id = cluster_env_id - cluster_manager.build_cluster_env() - cluster_manager.fetch_build_info() - logger.info( - "Using overridden cluster environment with ID " - f"{cluster_env_id} and build ID " - f"{cluster_manager.cluster_env_build_id}" - ) - except Exception as e: - raise ClusterEnvCreateError( - f"Could not get existing overridden cluster environment " - f"{cluster_env_id}: {e}" - ) from e - else: - cluster_manager.set_cluster_env(cluster_env) + return cluster_manager, command_runner, artifact_path - # Load some timeouts - build_timeout = int(test["run"].get("build_timeout", DEFAULT_BUILD_TIMEOUT)) - command_timeout = int(test["run"].get("timeout", DEFAULT_COMMAND_TIMEOUT)) - cluster_timeout = int( - test["run"].get("session_timeout", DEFAULT_CLUSTER_TIMEOUT) - ) - # Get prepare command timeout, if any - prepare_cmd = test["run"].get("prepare", None) - if prepare_cmd: - prepare_timeout = test["run"].get("prepare_timeout", command_timeout) - else: - prepare_timeout = 0 +def _setup_cluster_environment( + test: Test, + result: Result, + cluster_manager: ClusterManager, + ray_wheels_url: str, + cluster_env_id: Optional[str], +) -> Tuple[str, int, int, int, int]: + setup_signal_handling() + # Load configs + cluster_env = load_test_cluster_env(test, ray_wheels_url=ray_wheels_url) + cluster_compute = load_test_cluster_compute(test) + + if cluster_env_id: + try: + cluster_manager.cluster_env_id = cluster_env_id + cluster_manager.build_cluster_env() + cluster_manager.fetch_build_info() + logger.info( + "Using overridden cluster environment with ID " + f"{cluster_env_id} and build ID " + f"{cluster_manager.cluster_env_build_id}" + ) + except Exception as e: + raise ClusterEnvCreateError( + f"Could not get existing overridden cluster environment " + f"{cluster_env_id}: {e}" + ) from e + else: + cluster_manager.set_cluster_env(cluster_env) - # Base maximum uptime on the combined command and prepare timeouts - command_and_prepare_timeout = command_timeout + prepare_timeout + # Load some timeouts + build_timeout = int(test["run"].get("build_timeout", DEFAULT_BUILD_TIMEOUT)) + command_timeout = int(test["run"].get("timeout", DEFAULT_COMMAND_TIMEOUT)) + cluster_timeout = int(test["run"].get("session_timeout", DEFAULT_CLUSTER_TIMEOUT)) - # Use default timeout = 0 here if wait_for_nodes is empty. This is to make - # sure we don't inflate the maximum_uptime_minutes too much if we don't wait - # for nodes at all. - # The actual default will be otherwise loaded further down. 
- wait_timeout = int(test["run"].get("wait_for_nodes", {}).get("timeout", 0)) + # Get prepare command timeout, if any + prepare_cmd = test["run"].get("prepare", None) + if prepare_cmd: + prepare_timeout = test["run"].get("prepare_timeout", command_timeout) + else: + prepare_timeout = 0 - autosuspend_mins = test["cluster"].get("autosuspend_mins", None) - if autosuspend_mins: - cluster_manager.autosuspend_minutes = autosuspend_mins - autosuspend_base = autosuspend_mins - else: - cluster_manager.autosuspend_minutes = min( - DEFAULT_AUTOSUSPEND_MINS, - int(command_and_prepare_timeout / 60) + TIMEOUT_BUFFER_MINUTES, - ) - # Maximum uptime should be based on the command timeout, not the - # DEFAULT_AUTOSUSPEND_MINS - autosuspend_base = ( - int(command_and_prepare_timeout / 60) + TIMEOUT_BUFFER_MINUTES - ) + # Base maximum uptime on the combined command and prepare timeouts + command_and_prepare_timeout = command_timeout + prepare_timeout - maximum_uptime_minutes = test["cluster"].get("maximum_uptime_minutes", None) - if maximum_uptime_minutes: - cluster_manager.maximum_uptime_minutes = maximum_uptime_minutes - else: - cluster_manager.maximum_uptime_minutes = ( - autosuspend_base + wait_timeout + TIMEOUT_BUFFER_MINUTES - ) + # Use default timeout = 0 here if wait_for_nodes is empty. This is to make + # sure we don't inflate the maximum_uptime_minutes too much if we don't wait + # for nodes at all. + # The actual default will be otherwise loaded further down. + wait_timeout = int(test["run"].get("wait_for_nodes", {}).get("timeout", 0)) - # Set cluster compute here. Note that this may use timeouts provided - # above. - cluster_manager.set_cluster_compute( - cluster_compute, - extra_tags=extra_tags, + autosuspend_mins = test["cluster"].get("autosuspend_mins", None) + if autosuspend_mins: + cluster_manager.autosuspend_minutes = autosuspend_mins + autosuspend_base = autosuspend_mins + else: + cluster_manager.autosuspend_minutes = min( + DEFAULT_AUTOSUSPEND_MINS, + int(command_and_prepare_timeout / 60) + TIMEOUT_BUFFER_MINUTES, + ) + # Maximum uptime should be based on the command timeout, not the + # DEFAULT_AUTOSUSPEND_MINS + autosuspend_base = ( + int(command_and_prepare_timeout / 60) + TIMEOUT_BUFFER_MINUTES ) - buildkite_group(":nut_and_bolt: Setting up local environment") - driver_setup_script = test.get("driver_setup", None) - if driver_setup_script: - try: - run_bash_script(driver_setup_script) - except Exception as e: - raise LocalEnvSetupError(f"Driver setup script failed: {e}") from e - - # Install local dependencies - command_runner.prepare_local_env(ray_wheels_url) + maximum_uptime_minutes = test["cluster"].get("maximum_uptime_minutes", None) + if maximum_uptime_minutes: + cluster_manager.maximum_uptime_minutes = maximum_uptime_minutes + else: + cluster_manager.maximum_uptime_minutes = ( + autosuspend_base + wait_timeout + TIMEOUT_BUFFER_MINUTES + ) - # Re-install anyscale package as local dependencies might have changed - # from local env setup - reinstall_anyscale_dependencies() + # Set cluster compute here. Note that this may use timeouts provided + # above. 
+ cluster_manager.set_cluster_compute( + cluster_compute, + extra_tags=result.extra_tags, + ) - # Print installed pip packages - buildkite_group(":bulb: Local environment information") - pip_packages = get_pip_packages() - pip_package_string = "\n".join(pip_packages) - logger.info(f"Installed python packages:\n{pip_package_string}") + return prepare_cmd, prepare_timeout, build_timeout, cluster_timeout, command_timeout - if isinstance(cluster_manager, FullClusterManager): - if not no_terminate: - register_handler( - lambda sig, frame: cluster_manager.terminate_cluster(wait=True) - ) - - # Start cluster - if cluster_id: - buildkite_group(":rocket: Using existing cluster") - # Re-use existing cluster ID for development - cluster_manager.cluster_id = cluster_id - cluster_manager.cluster_name = get_cluster_name(cluster_id) - else: - buildkite_group(":gear: Building cluster environment") - if cluster_env_id: - cluster_manager.cluster_env_id = cluster_env_id +def _setup_local_environment( + test: Test, + command_runner: CommandRunner, + ray_wheels_url: str, +) -> None: + driver_setup_script = test.get("driver_setup", None) + if driver_setup_script: + try: + run_bash_script(driver_setup_script) + except Exception as e: + raise LocalEnvSetupError(f"Driver setup script failed: {e}") from e - cluster_manager.build_configs(timeout=build_timeout) + # Install local dependencies + command_runner.prepare_local_env(ray_wheels_url) - if isinstance(cluster_manager, FullClusterManager): - buildkite_group(":rocket: Starting up cluster") - cluster_manager.start_cluster(timeout=cluster_timeout) - elif isinstance(command_runner, AnyscaleJobRunner): - command_runner.job_manager.cluster_startup_timeout = cluster_timeout + # Re-install anyscale package as local dependencies might have changed + # from local env setup + reinstall_anyscale_dependencies() - result.cluster_url = cluster_manager.get_cluster_url() - result.cluster_id = cluster_manager.cluster_id - # Upload files - buildkite_group(":wrench: Preparing remote environment") - command_runner.prepare_remote_env() +def _local_environment_information( + result: Result, + cluster_manager: ClusterManager, + command_runner: CommandRunner, + build_timeout: int, + cluster_timeout: int, + no_terminate: bool, + cluster_id: Optional[str], + cluster_env_id: Optional[str], +) -> None: + pip_packages = get_pip_packages() + pip_package_string = "\n".join(pip_packages) + logger.info(f"Installed python packages:\n{pip_package_string}") + + if isinstance(cluster_manager, FullClusterManager): + if not no_terminate: + register_handler( + lambda sig, frame: cluster_manager.terminate_cluster(wait=True) + ) - wait_for_nodes = test["run"].get("wait_for_nodes", None) + # Start cluster + if cluster_id: + buildkite_group(":rocket: Using existing cluster") + # Re-use existing cluster ID for development + cluster_manager.cluster_id = cluster_id + cluster_manager.cluster_name = get_cluster_name(cluster_id) + else: + buildkite_group(":gear: Building cluster environment") - if wait_for_nodes: - buildkite_group(":stopwatch: Waiting for nodes to come up") - # Overwrite wait_timeout from above to account for better default - wait_timeout = int( - wait_for_nodes.get("timeout", DEFAULT_WAIT_FOR_NODES_TIMEOUT) - ) - num_nodes = test["run"]["wait_for_nodes"]["num_nodes"] - command_runner.wait_for_nodes(num_nodes, wait_timeout) + if cluster_env_id: + cluster_manager.cluster_env_id = cluster_env_id - if prepare_cmd: - try: - command_runner.run_prepare_command(prepare_cmd, timeout=prepare_timeout) - 
except CommandError as e: - raise PrepareCommandError(e) - except CommandTimeout as e: - raise PrepareCommandTimeout(e) + cluster_manager.build_configs(timeout=build_timeout) - buildkite_group(":runner: Running test script") - command = test["run"]["script"] - command_env = {} + if isinstance(cluster_manager, FullClusterManager): + buildkite_group(":rocket: Starting up cluster") + cluster_manager.start_cluster(timeout=cluster_timeout) + elif isinstance(command_runner, AnyscaleJobRunner): + command_runner.job_manager.cluster_startup_timeout = cluster_timeout - if smoke_test: - command = f"{command} --smoke-test" - command_env["IS_SMOKE_TEST"] = "1" + result.cluster_url = cluster_manager.get_cluster_url() + result.cluster_id = cluster_manager.cluster_id - is_long_running = test["run"].get("long_running", False) - start_time_unix = time.time() +def _prepare_remote_environment( + test: Test, + command_runner: CommandRunner, + prepare_cmd: bool, + prepare_timeout: int, +) -> None: + command_runner.prepare_remote_env() + + wait_for_nodes = test["run"].get("wait_for_nodes", None) + + if wait_for_nodes: + buildkite_group(":stopwatch: Waiting for nodes to come up") + # Overwrite wait_timeout from above to account for better default + wait_timeout = int( + wait_for_nodes.get("timeout", DEFAULT_WAIT_FOR_NODES_TIMEOUT) + ) + num_nodes = test["run"]["wait_for_nodes"]["num_nodes"] + command_runner.wait_for_nodes(num_nodes, wait_timeout) + if prepare_cmd: try: - command_runner.run_command( - command, - env=command_env, - timeout=command_timeout, - raise_on_timeout=not is_long_running, - ) - except ( - TestCommandError, - PrepareCommandError, - TestCommandTimeout, - PrepareCommandTimeout, - ) as e: - raise e + command_runner.run_prepare_command(prepare_cmd, timeout=prepare_timeout) except CommandError as e: - raise TestCommandError(e) + raise PrepareCommandError(e) except CommandTimeout as e: - if not is_long_running: - # Only raise error if command is not long running - raise TestCommandTimeout(e) + raise PrepareCommandTimeout(e) - buildkite_group(":floppy_disk: Fetching results") - try: - command_results = command_runner.fetch_results() - except Exception as e: - logger.exception(f"Could not fetch results for test command: {e}") - command_results = {} - fetch_result_exception = e - if artifact_path: - try: - command_runner.fetch_artifact() - except Exception as e: - logger.error("Could not fetch artifact for test command") - logger.exception(e) +def _running_test_script( + test: Test, + smoke_test: bool, + command_runner: CommandRunner, + command_timeout: int, +) -> None: + command = test["run"]["script"] + command_env = {} - # Postprocess result: - if "last_update" in command_results: - command_results["last_update_diff"] = time.time() - command_results.get( - "last_update", 0.0 - ) + if smoke_test: + command = f"{command} --smoke-test" + command_env["IS_SMOKE_TEST"] = "1" + + is_long_running = test["run"].get("long_running", False) + + try: + command_runner.run_command( + command, + env=command_env, + timeout=command_timeout, + raise_on_timeout=not is_long_running, + ) + except ( + TestCommandError, + PrepareCommandError, + TestCommandTimeout, + PrepareCommandTimeout, + ) as e: + raise e + except CommandError as e: + raise TestCommandError(e) + except CommandTimeout as e: + if not is_long_running: + # Only raise error if command is not long running + raise TestCommandTimeout(e) + + +def _fetching_results( + result: Result, + command_runner: CommandRunner, + artifact_path: Optional[str], + smoke_test: 
bool, + start_time_unix: int, +) -> Tuple[dict, Exception]: + fetch_result_exception = None + try: + command_results = command_runner.fetch_results() + except Exception as e: + logger.exception(f"Could not fetch results for test command: {e}") + command_results = {} + fetch_result_exception = e + if artifact_path: try: - # Logic duplicated in ray_release/command_runner/_anyscale_job_wrapper.py - # Timeout is the time the test took divided by 200 - # (~7 minutes for a 24h test) but no less than 30s - # and no more than 900s - metrics_timeout = max(30, min((time.time() - start_time_unix) / 200, 900)) - command_runner.save_metrics(start_time_unix, timeout=metrics_timeout) - metrics = command_runner.fetch_metrics() + command_runner.fetch_artifact() except Exception as e: - logger.exception(f"Could not fetch metrics for test command: {e}") - metrics = {} + logger.error("Could not fetch artifact for test command") + logger.exception(e) + + # Postprocess result: + if "last_update" in command_results: + command_results["last_update_diff"] = time.time() - command_results.get( + "last_update", 0.0 + ) + + try: + # Logic duplicated in ray_release/command_runner/_anyscale_job_wrapper.py + # Timeout is the time the test took divided by 200 + # (~7 minutes for a 24h test) but no less than 30s + # and no more than 900s + metrics_timeout = max(30, min((time.time() - start_time_unix) / 200, 900)) + command_runner.save_metrics(start_time_unix, timeout=metrics_timeout) + metrics = command_runner.fetch_metrics() + except Exception as e: + logger.exception(f"Could not fetch metrics for test command: {e}") + metrics = {} - if smoke_test: - command_results["smoke_test"] = True + if smoke_test: + command_results["smoke_test"] = True - result.results = command_results - result.status = "finished" + result.results = command_results + result.status = "finished" return metrics, fetch_result_exception @@ -516,10 +544,7 @@ def run_release_test( if not no_terminate and cluster_manager: buildkite_group(":earth_africa: Terminating cluster") - try: - cluster_manager.terminate_cluster(wait=False) - except Exception as e: - logger.exception(f"Could not terminate cluster: {e}") + cluster_manager.terminate_cluster(wait=False) if hasattr(command_runner, "cleanup"): command_runner.cleanup() @@ -561,11 +586,7 @@ def run_release_test( result.last_logs = traceback.format_exc() buildkite_group(":memo: Reporting results", open=True) - reporters = reporters or [] - for reporter in reporters: - try: - reporter.report_result(test, result) - except Exception as e: - logger.exception(f"Error reporting results via {type(reporter)}: {e}") + for reporter in reporters or []: + reporter.report_result(test, result) return result From 62b530e4d7f6e69b51b20770aeea92b6f0fcdd97 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Tue, 4 Apr 2023 11:19:03 -0700 Subject: [PATCH 012/104] Throw and retry on purpose Signed-off-by: Cuong Nguyen --- release/ray_release/glue.py | 1 + 1 file changed, 1 insertion(+) diff --git a/release/ray_release/glue.py b/release/ray_release/glue.py index 7232e9b81273..66a95831b529 100644 --- a/release/ray_release/glue.py +++ b/release/ray_release/glue.py @@ -460,6 +460,7 @@ def run_release_test( # non critical for some tests. So separate it from the general one. 
fetch_result_exception = None try: + raise ReleaseTestConfigError() buildkite_group(":spiral_note_pad: Loading test configuration") cluster_manager, command_runner, artifact_path = _load_test_configuration( test, From 4742040a7700b152689a24d587b309003ff6d62c Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Tue, 4 Apr 2023 11:25:05 -0700 Subject: [PATCH 013/104] Fix things Signed-off-by: Cuong Nguyen --- release/ray_release/result.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/release/ray_release/result.py b/release/ray_release/result.py index 7c8397d29a09..eaa9d57a8ef8 100644 --- a/release/ray_release/result.py +++ b/release/ray_release/result.py @@ -25,13 +25,13 @@ class Result: status: str = ResultStatus.UNKNOWN.value return_code: int = 0 + buildkite_return_code: int = BuildkiteExitCode.SUCCESS.value last_logs: Optional[str] = None runtime: Optional[float] = None stable: bool = True smoke_test: bool = False - buildkite_return_code: BuildkiteExitCode.SUCCESS buildkite_url: Optional[str] = None wheels_url: Optional[str] = None cluster_url: Optional[str] = None From dbe354cd839ee92c49f229955939889452334d1b Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Tue, 4 Apr 2023 11:32:26 -0700 Subject: [PATCH 014/104] Need to use value of enum Signed-off-by: Cuong Nguyen --- release/ray_release/buildkite/step.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/release/ray_release/buildkite/step.py b/release/ray_release/buildkite/step.py index 9fcae90beebd..533190f4bb70 100644 --- a/release/ray_release/buildkite/step.py +++ b/release/ray_release/buildkite/step.py @@ -126,7 +126,7 @@ def get_step( step["retry"] = { "automatic": [ { - "exit_status": BuildkiteExitCode.TRANSIENT_INFRA_ERROR, + "exit_status": BuildkiteExitCode.TRANSIENT_INFRA_ERROR.value, "limit": 2, } ] From f2a69ef3f00d8ffe46b767b77bed17c9950ab4b0 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Wed, 5 Apr 2023 10:34:03 -0700 Subject: [PATCH 015/104] out of testing mode Signed-off-by: Cuong Nguyen --- release/ray_release/glue.py | 1 - 1 file changed, 1 deletion(-) diff --git a/release/ray_release/glue.py b/release/ray_release/glue.py index 66a95831b529..7232e9b81273 100644 --- a/release/ray_release/glue.py +++ b/release/ray_release/glue.py @@ -460,7 +460,6 @@ def run_release_test( # non critical for some tests. So separate it from the general one. 
fetch_result_exception = None try: - raise ReleaseTestConfigError() buildkite_group(":spiral_note_pad: Loading test configuration") cluster_manager, command_runner, artifact_path = _load_test_configuration( test, From fea4e57d56695388291944b905701b106fcdac47 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Wed, 5 Apr 2023 10:36:42 -0700 Subject: [PATCH 016/104] Name consistency Signed-off-by: Cuong Nguyen --- release/ray_release/result.py | 2 +- release/ray_release/scripts/run_release_test.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/release/ray_release/result.py b/release/ray_release/result.py index eaa9d57a8ef8..5f575b28831c 100644 --- a/release/ray_release/result.py +++ b/release/ray_release/result.py @@ -25,7 +25,7 @@ class Result: status: str = ResultStatus.UNKNOWN.value return_code: int = 0 - buildkite_return_code: int = BuildkiteExitCode.SUCCESS.value + buildkite_exit_code: int = BuildkiteExitCode.SUCCESS.value last_logs: Optional[str] = None runtime: Optional[float] = None diff --git a/release/ray_release/scripts/run_release_test.py b/release/ray_release/scripts/run_release_test.py index c7de484b382a..d2d1c836e565 100644 --- a/release/ray_release/scripts/run_release_test.py +++ b/release/ray_release/scripts/run_release_test.py @@ -168,7 +168,7 @@ def main( f"Release test pipeline for test {test['name']} completed. " f"Returning with exit code = {result.return_code}" ) - sys.exit(result.buildkite_return_code) + sys.exit(result.buildkite_exit_code) if __name__ == "__main__": From 92a82c0e19233ccdc9fc7dfff424d14847881a1c Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Wed, 5 Apr 2023 14:03:45 -0700 Subject: [PATCH 017/104] Fix lints Signed-off-by: Cuong Nguyen --- release/ray_release/buildkite/step.py | 8 ++++---- release/ray_release/result.py | 2 ++ release/ray_release/scripts/run_release_test.py | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/release/ray_release/buildkite/step.py b/release/ray_release/buildkite/step.py index 533190f4bb70..d258ee4f286f 100644 --- a/release/ray_release/buildkite/step.py +++ b/release/ray_release/buildkite/step.py @@ -125,10 +125,10 @@ def get_step( # Auto-retry on transient infra error (according to result.BuildkiteExitCode) step["retry"] = { "automatic": [ - { - "exit_status": BuildkiteExitCode.TRANSIENT_INFRA_ERROR.value, - "limit": 2, - } + { + "exit_status": BuildkiteExitCode.TRANSIENT_INFRA_ERROR.value, + "limit": 2, + } ] } diff --git a/release/ray_release/result.py b/release/ray_release/result.py index 5f575b28831c..e190695c45c3 100644 --- a/release/ray_release/result.py +++ b/release/ray_release/result.py @@ -4,6 +4,7 @@ from typing import Optional, Dict, Tuple + class ResultStatus(enum.Enum): """ Overall status of the result test run @@ -19,6 +20,7 @@ class ResultStatus(enum.Enum): TIMEOUT = "timeout" + @dataclass class Result: results: Optional[Dict] = None diff --git a/release/ray_release/scripts/run_release_test.py b/release/ray_release/scripts/run_release_test.py index d2d1c836e565..a3a7aabb7cda 100644 --- a/release/ray_release/scripts/run_release_test.py +++ b/release/ray_release/scripts/run_release_test.py @@ -14,7 +14,7 @@ read_and_validate_release_test_collection, ) from ray_release.env import DEFAULT_ENVIRONMENT, load_environment, populate_os_env -from ray_release.exception import ReleaseTestCLIError, ReleaseTestError +from ray_release.exception import ReleaseTestCLIError from ray_release.glue import run_release_test from ray_release.logger import logger from 
ray_release.reporter.artifacts import ArtifactsReporter From 88ef3ef78d008be60379fde63bf30758e4dd3b00 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Wed, 5 Apr 2023 16:27:22 -0700 Subject: [PATCH 018/104] Fix unit tests Signed-off-by: Cuong Nguyen --- release/ray_release/tests/test_glue.py | 109 ++++++++----------------- 1 file changed, 36 insertions(+), 73 deletions(-) diff --git a/release/ray_release/tests/test_glue.py b/release/ray_release/tests/test_glue.py index 1cf9cdcf1dd2..874b7a376b3e 100644 --- a/release/ray_release/tests/test_glue.py +++ b/release/ray_release/tests/test_glue.py @@ -24,19 +24,13 @@ ClusterEnvBuildError, ClusterEnvBuildTimeout, ClusterEnvCreateError, - ClusterCreationError, ClusterStartupError, ClusterStartupTimeout, RemoteEnvSetupError, CommandError, - PrepareCommandError, CommandTimeout, - PrepareCommandTimeout, - TestCommandError, - TestCommandTimeout, FetchResultError, LogsError, - ResultsAlert, ClusterNodesWaitTimeout, ) from ray_release.file_manager.file_manager import FileManager @@ -251,7 +245,7 @@ def _succeed_until(self, until: str): self.mock_alert_return = None - def _run(self, result: Result, **kwargs): + def _run(self, result: Result, **kwargs) -> Result: run_release_test( test=self.test, anyscale_project=self.anyscale_project, @@ -267,26 +261,23 @@ def testInvalidClusterEnv(self): with patch( "ray_release.glue.load_test_cluster_env", _fail_on_call(ReleaseTestConfigError), - ), self.assertRaises(ReleaseTestConfigError): + ): self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) # Fails because file not found os.unlink(os.path.join(self.tempdir, "cluster_env.yaml")) - with self.assertRaisesRegex(ReleaseTestConfigError, "Path not found"): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) # Fails because invalid jinja template self.writeClusterEnv("{{ INVALID") - with self.assertRaisesRegex(ReleaseTestConfigError, "yaml template"): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) # Fails because invalid json self.writeClusterEnv("{'test': true, 'fail}") - with self.assertRaisesRegex(ReleaseTestConfigError, "quoted scalar"): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) def testInvalidClusterCompute(self): @@ -295,26 +286,23 @@ def testInvalidClusterCompute(self): with patch( "ray_release.glue.load_test_cluster_compute", _fail_on_call(ReleaseTestConfigError), - ), self.assertRaises(ReleaseTestConfigError): + ): self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) # Fails because file not found os.unlink(os.path.join(self.tempdir, "cluster_compute.yaml")) - with self.assertRaisesRegex(ReleaseTestConfigError, "Path not found"): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) # Fails because invalid jinja template self.writeClusterCompute("{{ INVALID") - with self.assertRaisesRegex(ReleaseTestConfigError, "yaml template"): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) # Fails because invalid json self.writeClusterCompute("{'test': true, 'fail}") - with self.assertRaisesRegex(ReleaseTestConfigError, "quoted scalar"): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) @@ -323,9 +311,8 @@ def testAutomaticClusterEnvVariables(self): 
self._succeed_until("local_env") - with self.assertRaises(LocalEnvSetupError): - self._run(result) - + self._run(result) + self.assertEqual(result.return_code, LocalEnvSetupError().exit_code.value) cluster_manager = self.instances["cluster_manager"] command_timeout = self.test["run"].get("timeout", DEFAULT_COMMAND_TIMEOUT) @@ -362,8 +349,7 @@ def testInvalidPrepareLocalEnv(self): self.command_runner_return["prepare_local_env"] = _fail_on_call( LocalEnvSetupError ) - with self.assertRaises(LocalEnvSetupError): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.LOCAL_ENV_SETUP_ERROR.value) def testDriverSetupFails(self): @@ -371,8 +357,7 @@ def testDriverSetupFails(self): self._succeed_until("local_env") - with self.assertRaises(LocalEnvSetupError): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.LOCAL_ENV_SETUP_ERROR.value) def testInvalidClusterIdOverride(self): @@ -382,16 +367,14 @@ def testInvalidClusterIdOverride(self): self.sdk.returns["get_cluster_environment"] = None - with self.assertRaises(ClusterEnvCreateError): - self._run(result, cluster_env_id="existing") + self._run(result, cluster_env_id="existing") self.sdk.returns["get_cluster_environment"] = APIDict( result=APIDict(config_json={"overridden": True}) ) - with self.assertRaises(Exception) as cm: # Fail somewhere else - self._run(result, cluster_env_id="existing") - self.assertNotIsInstance(cm.exception, ClusterEnvCreateError) + self._run(result, cluster_env_id="existing") + self.assertNotEqual(result.return_code, ClusterEnvCreateError().exit_code) def testBuildConfigFailsClusterCompute(self): result = Result() @@ -402,16 +385,14 @@ def testBuildConfigFailsClusterCompute(self): self.command_runner_return["prepare_local_env"] = None # Fails because API response faulty - with self.assertRaisesRegex(ClusterComputeCreateError, "Unexpected"): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_RESOURCE_ERROR.value) # Fails for random cluster compute reason self.cluster_manager_return["create_cluster_compute"] = _fail_on_call( ClusterComputeCreateError, "Known" ) - with self.assertRaisesRegex(ClusterComputeCreateError, "Known"): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_RESOURCE_ERROR.value) def testBuildConfigFailsClusterEnv(self): @@ -419,17 +400,14 @@ def testBuildConfigFailsClusterEnv(self): self._succeed_until("cluster_compute") - # Fails because API response faulty - with self.assertRaisesRegex(ClusterEnvCreateError, "Unexpected"): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_RESOURCE_ERROR.value) # Fails for random cluster env create reason self.cluster_manager_return["create_cluster_env"] = _fail_on_call( ClusterEnvCreateError, "Known" ) - with self.assertRaisesRegex(ClusterEnvCreateError, "Known"): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_RESOURCE_ERROR.value) # Now, succeed creation but fail on cluster env build @@ -438,16 +416,14 @@ def testBuildConfigFailsClusterEnv(self): self.cluster_manager_return["build_cluster_env"] = _fail_on_call( ClusterEnvBuildError ) - with self.assertRaises(ClusterEnvBuildError): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_ENV_BUILD_ERROR.value) # Now, fail on cluster env timeout self.cluster_manager_return["build_cluster_env"] = _fail_on_call( ClusterEnvBuildTimeout ) - with 
self.assertRaises(ClusterEnvBuildTimeout): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_ENV_BUILD_TIMEOUT.value) def testStartClusterFails(self): @@ -456,8 +432,7 @@ def testStartClusterFails(self): self._succeed_until("cluster_env") # Fails because API response faulty - with self.assertRaises(ClusterCreationError): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_RESOURCE_ERROR.value) self.cluster_manager_return["cluster_id"] = "valid" @@ -466,8 +441,7 @@ def testStartClusterFails(self): self.cluster_manager_return["start_cluster"] = _fail_on_call( ClusterStartupError ) - with self.assertRaises(ClusterStartupError): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_STARTUP_ERROR.value) # Ensure cluster was terminated @@ -477,8 +451,7 @@ def testStartClusterFails(self): self.cluster_manager_return["start_cluster"] = _fail_on_call( ClusterStartupTimeout ) - with self.assertRaises(ClusterStartupTimeout): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_STARTUP_TIMEOUT.value) # Ensure cluster was terminated @@ -492,8 +465,7 @@ def testPrepareRemoteEnvFails(self): self.command_runner_return["prepare_remote_env"] = _fail_on_call( RemoteEnvSetupError ) - with self.assertRaises(RemoteEnvSetupError): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.REMOTE_ENV_SETUP_ERROR.value) # Ensure cluster was terminated @@ -508,8 +480,7 @@ def testWaitForNodesFails(self): self.command_runner_return["wait_for_nodes"] = _fail_on_call( ClusterNodesWaitTimeout ) - with self.assertRaises(ClusterNodesWaitTimeout): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_WAIT_TIMEOUT.value) # Ensure cluster was terminated @@ -522,16 +493,14 @@ def testPrepareCommandFails(self): # Prepare command fails self.command_runner_return["run_prepare_command"] = _fail_on_call(CommandError) - with self.assertRaises(PrepareCommandError): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.PREPARE_ERROR.value) # Prepare command times out self.command_runner_return["run_prepare_command"] = _fail_on_call( CommandTimeout ) - with self.assertRaises(PrepareCommandTimeout): - self._run(result) + self._run(result) # Special case: Prepare commands are usually waiting for nodes # (this may change in the future!) 
self.assertEqual(result.return_code, ExitCode.CLUSTER_WAIT_TIMEOUT.value) @@ -546,14 +515,12 @@ def testTestCommandFails(self): # Test command fails self.command_runner_return["run_command"] = _fail_on_call(CommandError) - with self.assertRaises(TestCommandError): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.COMMAND_ERROR.value) # Test command times out self.command_runner_return["run_command"] = _fail_on_call(CommandTimeout) - with self.assertRaises(TestCommandTimeout): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.COMMAND_TIMEOUT.value) # Ensure cluster was terminated @@ -566,8 +533,7 @@ def testTestCommandTimeoutLongRunning(self): # Test command times out self.command_runner_return["run_command"] = _fail_on_call(CommandTimeout) - with self.assertRaises(TestCommandTimeout): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.COMMAND_TIMEOUT.value) # But now set test to long running @@ -616,10 +582,9 @@ def testFetchResultFailsReqNonEmptyResult(self): self._succeed_until("test_command") self.command_runner_return["fetch_results"] = _fail_on_call(FetchResultError) - with self.assertRaisesRegex(FetchResultError, "Fail"): - with self.assertLogs(logger, "ERROR") as cm: - self._run(result) - self.assertTrue(any("Could not fetch results" in o for o in cm.output)) + with self.assertLogs(logger, "ERROR") as cm: + self._run(result) + self.assertTrue(any("Could not fetch results" in o for o in cm.output)) self.assertEqual(result.return_code, ExitCode.FETCH_RESULT_ERROR.value) self.assertEqual(result.status, "infra_error") @@ -649,9 +614,7 @@ def testAlertFails(self): self.mock_alert_return = "Alert raised" - with self.assertRaises(ResultsAlert): - self._run(result) - + self._run(result) self.assertEqual(result.return_code, ExitCode.COMMAND_ALERT.value) self.assertEqual(result.status, "error") From a9d5638b3a296591182ecfddae82255321623644 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Thu, 6 Apr 2023 10:46:34 -0700 Subject: [PATCH 019/104] Raise an error for testing Signed-off-by: Cuong Nguyen --- release/ray_release/glue.py | 1 + 1 file changed, 1 insertion(+) diff --git a/release/ray_release/glue.py b/release/ray_release/glue.py index 7232e9b81273..66a95831b529 100644 --- a/release/ray_release/glue.py +++ b/release/ray_release/glue.py @@ -460,6 +460,7 @@ def run_release_test( # non critical for some tests. So separate it from the general one. fetch_result_exception = None try: + raise ReleaseTestConfigError() buildkite_group(":spiral_note_pad: Loading test configuration") cluster_manager, command_runner, artifact_path = _load_test_configuration( test, From 41b770cf3762635e50ddc97374eeef6630e57e04 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Thu, 6 Apr 2023 10:47:24 -0700 Subject: [PATCH 020/104] Undo debugging code Signed-off-by: Cuong Nguyen --- release/ray_release/glue.py | 1 - 1 file changed, 1 deletion(-) diff --git a/release/ray_release/glue.py b/release/ray_release/glue.py index 66a95831b529..7232e9b81273 100644 --- a/release/ray_release/glue.py +++ b/release/ray_release/glue.py @@ -460,7 +460,6 @@ def run_release_test( # non critical for some tests. So separate it from the general one. 
fetch_result_exception = None try: - raise ReleaseTestConfigError() buildkite_group(":spiral_note_pad: Loading test configuration") cluster_manager, command_runner, artifact_path = _load_test_configuration( test, From f900f101f11900d10ad32be154efaa3c21144586 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Thu, 13 Apr 2023 11:11:10 -0700 Subject: [PATCH 021/104] Move retry logic to sh file Signed-off-by: Cuong Nguyen --- release/ray_release/buildkite/step.py | 12 +----------- release/ray_release/result.py | 1 - release/ray_release/scripts/run_release_test.py | 2 +- 3 files changed, 2 insertions(+), 13 deletions(-) diff --git a/release/ray_release/buildkite/step.py b/release/ray_release/buildkite/step.py index d258ee4f286f..3078be809167 100644 --- a/release/ray_release/buildkite/step.py +++ b/release/ray_release/buildkite/step.py @@ -15,7 +15,7 @@ from ray_release.env import DEFAULT_ENVIRONMENT, load_environment from ray_release.template import get_test_env_var from ray_release.util import python_version_str, DeferredEnvVar -from ray_release.result import BuildkiteExitCode +from ray_release.result import ExitCode DEFAULT_ARTIFACTS_DIR_HOST = "/tmp/ray_release_test_artifacts" @@ -122,16 +122,6 @@ def get_step( if test.get("run", {}).get("type") == "client": step["agents"]["queue"] = str(RELEASE_QUEUE_CLIENT) - # Auto-retry on transient infra error (according to result.BuildkiteExitCode) - step["retry"] = { - "automatic": [ - { - "exit_status": BuildkiteExitCode.TRANSIENT_INFRA_ERROR.value, - "limit": 2, - } - ] - } - # If a test is not stable, allow to soft fail stable = test.get("stable", True) if not stable: diff --git a/release/ray_release/result.py b/release/ray_release/result.py index e190695c45c3..92594fd1545f 100644 --- a/release/ray_release/result.py +++ b/release/ray_release/result.py @@ -27,7 +27,6 @@ class Result: status: str = ResultStatus.UNKNOWN.value return_code: int = 0 - buildkite_exit_code: int = BuildkiteExitCode.SUCCESS.value last_logs: Optional[str] = None runtime: Optional[float] = None diff --git a/release/ray_release/scripts/run_release_test.py b/release/ray_release/scripts/run_release_test.py index a3a7aabb7cda..790ba13315ca 100644 --- a/release/ray_release/scripts/run_release_test.py +++ b/release/ray_release/scripts/run_release_test.py @@ -168,7 +168,7 @@ def main( f"Release test pipeline for test {test['name']} completed. 
" f"Returning with exit code = {result.return_code}" ) - sys.exit(result.buildkite_exit_code) + sys.exit(result.return_code) if __name__ == "__main__": From 5b1b4e2cf1d5405d82bdb5e5df33a56c118a26a0 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Thu, 13 Apr 2023 12:47:18 -0700 Subject: [PATCH 022/104] Rebase Signed-off-by: Cuong Nguyen --- release/ray_release/buildkite/step.py | 1 - 1 file changed, 1 deletion(-) diff --git a/release/ray_release/buildkite/step.py b/release/ray_release/buildkite/step.py index 3078be809167..a13bde1575d8 100644 --- a/release/ray_release/buildkite/step.py +++ b/release/ray_release/buildkite/step.py @@ -15,7 +15,6 @@ from ray_release.env import DEFAULT_ENVIRONMENT, load_environment from ray_release.template import get_test_env_var from ray_release.util import python_version_str, DeferredEnvVar -from ray_release.result import ExitCode DEFAULT_ARTIFACTS_DIR_HOST = "/tmp/ray_release_test_artifacts" From 55636378fdca610664d614592232aaadc1def478 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Thu, 13 Apr 2023 12:49:02 -0700 Subject: [PATCH 023/104] for testing Signed-off-by: Cuong Nguyen --- release/ray_release/glue.py | 1 + 1 file changed, 1 insertion(+) diff --git a/release/ray_release/glue.py b/release/ray_release/glue.py index 7232e9b81273..22b860ba4ad4 100644 --- a/release/ray_release/glue.py +++ b/release/ray_release/glue.py @@ -460,6 +460,7 @@ def run_release_test( # non critical for some tests. So separate it from the general one. fetch_result_exception = None try: + raise ReleaseTestSetupError('hahahah') buildkite_group(":spiral_note_pad: Loading test configuration") cluster_manager, command_runner, artifact_path = _load_test_configuration( test, From a4725382b3defc5efe90268d94206ca1fd4d569b Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Thu, 13 Apr 2023 12:57:40 -0700 Subject: [PATCH 024/104] More refactoring Signed-off-by: Cuong Nguyen --- release/ray_release/scripts/run_release_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/release/ray_release/scripts/run_release_test.py b/release/ray_release/scripts/run_release_test.py index 790ba13315ca..449dee26557d 100644 --- a/release/ray_release/scripts/run_release_test.py +++ b/release/ray_release/scripts/run_release_test.py @@ -14,7 +14,7 @@ read_and_validate_release_test_collection, ) from ray_release.env import DEFAULT_ENVIRONMENT, load_environment, populate_os_env -from ray_release.exception import ReleaseTestCLIError +from ray_release.exception import ReleaseTestCLIError, ReleaseTestError from ray_release.glue import run_release_test from ray_release.logger import logger from ray_release.reporter.artifacts import ArtifactsReporter @@ -166,9 +166,9 @@ def main( return_code = e.exit_code.value logger.info( f"Release test pipeline for test {test['name']} completed. 
" - f"Returning with exit code = {result.return_code}" + f"Returning with exit code = {return_code}" ) - sys.exit(result.return_code) + sys.exit(return_code) if __name__ == "__main__": From 5d4cc6255c9000a37ee8191115c8903482edeb1a Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Thu, 13 Apr 2023 13:04:03 -0700 Subject: [PATCH 025/104] Undo more changes Signed-off-by: Cuong Nguyen --- release/ray_release/glue.py | 4 +- release/ray_release/tests/test_glue.py | 109 +++++++++++++++++-------- 2 files changed, 76 insertions(+), 37 deletions(-) diff --git a/release/ray_release/glue.py b/release/ray_release/glue.py index 22b860ba4ad4..46ef68f45ce5 100644 --- a/release/ray_release/glue.py +++ b/release/ray_release/glue.py @@ -460,7 +460,6 @@ def run_release_test( # non critical for some tests. So separate it from the general one. fetch_result_exception = None try: - raise ReleaseTestSetupError('hahahah') buildkite_group(":spiral_note_pad: Loading test configuration") cluster_manager, command_runner, artifact_path = _load_test_configuration( test, @@ -590,4 +589,7 @@ def run_release_test( for reporter in reporters or []: reporter.report_result(test, result) + if pipeline_exception: + raise pipeline_exception + return result diff --git a/release/ray_release/tests/test_glue.py b/release/ray_release/tests/test_glue.py index 874b7a376b3e..1cf9cdcf1dd2 100644 --- a/release/ray_release/tests/test_glue.py +++ b/release/ray_release/tests/test_glue.py @@ -24,13 +24,19 @@ ClusterEnvBuildError, ClusterEnvBuildTimeout, ClusterEnvCreateError, + ClusterCreationError, ClusterStartupError, ClusterStartupTimeout, RemoteEnvSetupError, CommandError, + PrepareCommandError, CommandTimeout, + PrepareCommandTimeout, + TestCommandError, + TestCommandTimeout, FetchResultError, LogsError, + ResultsAlert, ClusterNodesWaitTimeout, ) from ray_release.file_manager.file_manager import FileManager @@ -245,7 +251,7 @@ def _succeed_until(self, until: str): self.mock_alert_return = None - def _run(self, result: Result, **kwargs) -> Result: + def _run(self, result: Result, **kwargs): run_release_test( test=self.test, anyscale_project=self.anyscale_project, @@ -261,23 +267,26 @@ def testInvalidClusterEnv(self): with patch( "ray_release.glue.load_test_cluster_env", _fail_on_call(ReleaseTestConfigError), - ): + ), self.assertRaises(ReleaseTestConfigError): self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) # Fails because file not found os.unlink(os.path.join(self.tempdir, "cluster_env.yaml")) - self._run(result) + with self.assertRaisesRegex(ReleaseTestConfigError, "Path not found"): + self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) # Fails because invalid jinja template self.writeClusterEnv("{{ INVALID") - self._run(result) + with self.assertRaisesRegex(ReleaseTestConfigError, "yaml template"): + self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) # Fails because invalid json self.writeClusterEnv("{'test': true, 'fail}") - self._run(result) + with self.assertRaisesRegex(ReleaseTestConfigError, "quoted scalar"): + self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) def testInvalidClusterCompute(self): @@ -286,23 +295,26 @@ def testInvalidClusterCompute(self): with patch( "ray_release.glue.load_test_cluster_compute", _fail_on_call(ReleaseTestConfigError), - ): + ), self.assertRaises(ReleaseTestConfigError): self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) # Fails because 
file not found os.unlink(os.path.join(self.tempdir, "cluster_compute.yaml")) - self._run(result) + with self.assertRaisesRegex(ReleaseTestConfigError, "Path not found"): + self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) # Fails because invalid jinja template self.writeClusterCompute("{{ INVALID") - self._run(result) + with self.assertRaisesRegex(ReleaseTestConfigError, "yaml template"): + self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) # Fails because invalid json self.writeClusterCompute("{'test': true, 'fail}") - self._run(result) + with self.assertRaisesRegex(ReleaseTestConfigError, "quoted scalar"): + self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) @@ -311,8 +323,9 @@ def testAutomaticClusterEnvVariables(self): self._succeed_until("local_env") - self._run(result) - self.assertEqual(result.return_code, LocalEnvSetupError().exit_code.value) + with self.assertRaises(LocalEnvSetupError): + self._run(result) + cluster_manager = self.instances["cluster_manager"] command_timeout = self.test["run"].get("timeout", DEFAULT_COMMAND_TIMEOUT) @@ -349,7 +362,8 @@ def testInvalidPrepareLocalEnv(self): self.command_runner_return["prepare_local_env"] = _fail_on_call( LocalEnvSetupError ) - self._run(result) + with self.assertRaises(LocalEnvSetupError): + self._run(result) self.assertEqual(result.return_code, ExitCode.LOCAL_ENV_SETUP_ERROR.value) def testDriverSetupFails(self): @@ -357,7 +371,8 @@ def testDriverSetupFails(self): self._succeed_until("local_env") - self._run(result) + with self.assertRaises(LocalEnvSetupError): + self._run(result) self.assertEqual(result.return_code, ExitCode.LOCAL_ENV_SETUP_ERROR.value) def testInvalidClusterIdOverride(self): @@ -367,14 +382,16 @@ def testInvalidClusterIdOverride(self): self.sdk.returns["get_cluster_environment"] = None - self._run(result, cluster_env_id="existing") + with self.assertRaises(ClusterEnvCreateError): + self._run(result, cluster_env_id="existing") self.sdk.returns["get_cluster_environment"] = APIDict( result=APIDict(config_json={"overridden": True}) ) - self._run(result, cluster_env_id="existing") - self.assertNotEqual(result.return_code, ClusterEnvCreateError().exit_code) + with self.assertRaises(Exception) as cm: # Fail somewhere else + self._run(result, cluster_env_id="existing") + self.assertNotIsInstance(cm.exception, ClusterEnvCreateError) def testBuildConfigFailsClusterCompute(self): result = Result() @@ -385,14 +402,16 @@ def testBuildConfigFailsClusterCompute(self): self.command_runner_return["prepare_local_env"] = None # Fails because API response faulty - self._run(result) + with self.assertRaisesRegex(ClusterComputeCreateError, "Unexpected"): + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_RESOURCE_ERROR.value) # Fails for random cluster compute reason self.cluster_manager_return["create_cluster_compute"] = _fail_on_call( ClusterComputeCreateError, "Known" ) - self._run(result) + with self.assertRaisesRegex(ClusterComputeCreateError, "Known"): + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_RESOURCE_ERROR.value) def testBuildConfigFailsClusterEnv(self): @@ -400,14 +419,17 @@ def testBuildConfigFailsClusterEnv(self): self._succeed_until("cluster_compute") - self._run(result) + # Fails because API response faulty + with self.assertRaisesRegex(ClusterEnvCreateError, "Unexpected"): + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_RESOURCE_ERROR.value) 
# Fails for random cluster env create reason self.cluster_manager_return["create_cluster_env"] = _fail_on_call( ClusterEnvCreateError, "Known" ) - self._run(result) + with self.assertRaisesRegex(ClusterEnvCreateError, "Known"): + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_RESOURCE_ERROR.value) # Now, succeed creation but fail on cluster env build @@ -416,14 +438,16 @@ def testBuildConfigFailsClusterEnv(self): self.cluster_manager_return["build_cluster_env"] = _fail_on_call( ClusterEnvBuildError ) - self._run(result) + with self.assertRaises(ClusterEnvBuildError): + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_ENV_BUILD_ERROR.value) # Now, fail on cluster env timeout self.cluster_manager_return["build_cluster_env"] = _fail_on_call( ClusterEnvBuildTimeout ) - self._run(result) + with self.assertRaises(ClusterEnvBuildTimeout): + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_ENV_BUILD_TIMEOUT.value) def testStartClusterFails(self): @@ -432,7 +456,8 @@ def testStartClusterFails(self): self._succeed_until("cluster_env") # Fails because API response faulty - self._run(result) + with self.assertRaises(ClusterCreationError): + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_RESOURCE_ERROR.value) self.cluster_manager_return["cluster_id"] = "valid" @@ -441,7 +466,8 @@ def testStartClusterFails(self): self.cluster_manager_return["start_cluster"] = _fail_on_call( ClusterStartupError ) - self._run(result) + with self.assertRaises(ClusterStartupError): + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_STARTUP_ERROR.value) # Ensure cluster was terminated @@ -451,7 +477,8 @@ def testStartClusterFails(self): self.cluster_manager_return["start_cluster"] = _fail_on_call( ClusterStartupTimeout ) - self._run(result) + with self.assertRaises(ClusterStartupTimeout): + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_STARTUP_TIMEOUT.value) # Ensure cluster was terminated @@ -465,7 +492,8 @@ def testPrepareRemoteEnvFails(self): self.command_runner_return["prepare_remote_env"] = _fail_on_call( RemoteEnvSetupError ) - self._run(result) + with self.assertRaises(RemoteEnvSetupError): + self._run(result) self.assertEqual(result.return_code, ExitCode.REMOTE_ENV_SETUP_ERROR.value) # Ensure cluster was terminated @@ -480,7 +508,8 @@ def testWaitForNodesFails(self): self.command_runner_return["wait_for_nodes"] = _fail_on_call( ClusterNodesWaitTimeout ) - self._run(result) + with self.assertRaises(ClusterNodesWaitTimeout): + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_WAIT_TIMEOUT.value) # Ensure cluster was terminated @@ -493,14 +522,16 @@ def testPrepareCommandFails(self): # Prepare command fails self.command_runner_return["run_prepare_command"] = _fail_on_call(CommandError) - self._run(result) + with self.assertRaises(PrepareCommandError): + self._run(result) self.assertEqual(result.return_code, ExitCode.PREPARE_ERROR.value) # Prepare command times out self.command_runner_return["run_prepare_command"] = _fail_on_call( CommandTimeout ) - self._run(result) + with self.assertRaises(PrepareCommandTimeout): + self._run(result) # Special case: Prepare commands are usually waiting for nodes # (this may change in the future!) 
self.assertEqual(result.return_code, ExitCode.CLUSTER_WAIT_TIMEOUT.value) @@ -515,12 +546,14 @@ def testTestCommandFails(self): # Test command fails self.command_runner_return["run_command"] = _fail_on_call(CommandError) - self._run(result) + with self.assertRaises(TestCommandError): + self._run(result) self.assertEqual(result.return_code, ExitCode.COMMAND_ERROR.value) # Test command times out self.command_runner_return["run_command"] = _fail_on_call(CommandTimeout) - self._run(result) + with self.assertRaises(TestCommandTimeout): + self._run(result) self.assertEqual(result.return_code, ExitCode.COMMAND_TIMEOUT.value) # Ensure cluster was terminated @@ -533,7 +566,8 @@ def testTestCommandTimeoutLongRunning(self): # Test command times out self.command_runner_return["run_command"] = _fail_on_call(CommandTimeout) - self._run(result) + with self.assertRaises(TestCommandTimeout): + self._run(result) self.assertEqual(result.return_code, ExitCode.COMMAND_TIMEOUT.value) # But now set test to long running @@ -582,9 +616,10 @@ def testFetchResultFailsReqNonEmptyResult(self): self._succeed_until("test_command") self.command_runner_return["fetch_results"] = _fail_on_call(FetchResultError) - with self.assertLogs(logger, "ERROR") as cm: - self._run(result) - self.assertTrue(any("Could not fetch results" in o for o in cm.output)) + with self.assertRaisesRegex(FetchResultError, "Fail"): + with self.assertLogs(logger, "ERROR") as cm: + self._run(result) + self.assertTrue(any("Could not fetch results" in o for o in cm.output)) self.assertEqual(result.return_code, ExitCode.FETCH_RESULT_ERROR.value) self.assertEqual(result.status, "infra_error") @@ -614,7 +649,9 @@ def testAlertFails(self): self.mock_alert_return = "Alert raised" - self._run(result) + with self.assertRaises(ResultsAlert): + self._run(result) + self.assertEqual(result.return_code, ExitCode.COMMAND_ALERT.value) self.assertEqual(result.status, "error") From e0b1e9eecbceb72109aeffe916b66343c53e9028 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Thu, 13 Apr 2023 13:06:52 -0700 Subject: [PATCH 026/104] For testing Signed-off-by: Cuong Nguyen --- release/ray_release/glue.py | 1 + release/ray_release/scripts/run_release_test.py | 1 + 2 files changed, 2 insertions(+) diff --git a/release/ray_release/glue.py b/release/ray_release/glue.py index 46ef68f45ce5..5873cd100286 100644 --- a/release/ray_release/glue.py +++ b/release/ray_release/glue.py @@ -460,6 +460,7 @@ def run_release_test( # non critical for some tests. So separate it from the general one. fetch_result_exception = None try: + raise ReleaseTestSetupError('hahahah') buildkite_group(":spiral_note_pad: Loading test configuration") cluster_manager, command_runner, artifact_path = _load_test_configuration( test, diff --git a/release/ray_release/scripts/run_release_test.py b/release/ray_release/scripts/run_release_test.py index 449dee26557d..b259e5d3bfc9 100644 --- a/release/ray_release/scripts/run_release_test.py +++ b/release/ray_release/scripts/run_release_test.py @@ -164,6 +164,7 @@ def main( except ReleaseTestError as e: logger.exception(e) return_code = e.exit_code.value + logger.info( f"Release test pipeline for test {test['name']} completed. 
" f"Returning with exit code = {return_code}" From 5160e0268e7eda5fb70746583f5b5c2adc1d4ea7 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Thu, 13 Apr 2023 13:39:48 -0700 Subject: [PATCH 027/104] Remove debugging info Signed-off-by: Cuong Nguyen --- release/ray_release/glue.py | 1 - 1 file changed, 1 deletion(-) diff --git a/release/ray_release/glue.py b/release/ray_release/glue.py index 5873cd100286..46ef68f45ce5 100644 --- a/release/ray_release/glue.py +++ b/release/ray_release/glue.py @@ -460,7 +460,6 @@ def run_release_test( # non critical for some tests. So separate it from the general one. fetch_result_exception = None try: - raise ReleaseTestSetupError('hahahah') buildkite_group(":spiral_note_pad: Loading test configuration") cluster_manager, command_runner, artifact_path = _load_test_configuration( test, From a95c675050bc7fac57b6731b27529f289f3058d8 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Fri, 14 Apr 2023 11:28:55 -0700 Subject: [PATCH 028/104] Fix tests Signed-off-by: Cuong Nguyen --- release/ray_release/tests/test_glue.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/release/ray_release/tests/test_glue.py b/release/ray_release/tests/test_glue.py index 1cf9cdcf1dd2..fd35efa77916 100644 --- a/release/ray_release/tests/test_glue.py +++ b/release/ray_release/tests/test_glue.py @@ -621,7 +621,7 @@ def testFetchResultFailsReqNonEmptyResult(self): self._run(result) self.assertTrue(any("Could not fetch results" in o for o in cm.output)) self.assertEqual(result.return_code, ExitCode.FETCH_RESULT_ERROR.value) - self.assertEqual(result.status, "infra_error") + self.assertEqual(result.status, "transient_infra_error") # Ensure cluster was terminated, no matter what self.assertGreaterEqual(self.sdk.call_counter["terminate_cluster"], 1) From e0249d2f892cda945b19a91bcf7087b40d0d2492 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Tue, 4 Apr 2023 10:59:29 -0700 Subject: [PATCH 029/104] Exit buildkite job using buildkite return code Signed-off-by: Cuong Nguyen --- release/ray_release/result.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/release/ray_release/result.py b/release/ray_release/result.py index 92594fd1545f..189f543fb143 100644 --- a/release/ray_release/result.py +++ b/release/ray_release/result.py @@ -1,5 +1,6 @@ import enum import os +import os from dataclasses import dataclass from typing import Optional, Dict, Tuple @@ -33,6 +34,7 @@ class Result: stable: bool = True smoke_test: bool = False + buildkite_return_code: BuildkiteExitCode.SUCCESS buildkite_url: Optional[str] = None wheels_url: Optional[str] = None cluster_url: Optional[str] = None From 229f9f179868fc3d722c0e281a3adc85550d19e1 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Wed, 5 Apr 2023 14:01:54 -0700 Subject: [PATCH 030/104] Fix lints Signed-off-by: Cuong Nguyen --- release/ray_release/result.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/release/ray_release/result.py b/release/ray_release/result.py index 189f543fb143..94644fb6ea35 100644 --- a/release/ray_release/result.py +++ b/release/ray_release/result.py @@ -83,18 +83,21 @@ class ExitCode(enum.Enum): COMMAND_TIMEOUT = 42 PREPARE_ERROR = 43 -class BuildkiteExitCode(enum.Enum): - """ - Final exit code the test runner passes to buildkite-agent. 
This exit code is used - to determine job policies, such as automatic retries - """ - SUCCESS = 0 - UNKNOWN = 1 - TRANSIENT_INFRA_ERROR = 10 - INFRA_ERROR = 11 - INFRA_TIMEOUT = 30 - ERROR = 40 - TIMEOUT = 42 + +def _is_transient_error(runtime: int) -> bool: + """ + Classify whether an infra-failure issue is a transient issue. This is based on + the status of its previous retries, and its runtime. + """ + retry_count = int(os.environ.get("BUILDKITE_RETRY_COUNT", "0")) + if retry_count > 0: + # Already retried at least once and failed again, not a transient issue + return False + if runtime > 30 * 60: + # Take too long to run + return False + return True + def _is_transient_error(result_status: ResultStatus, runtime: int) -> bool: """ From 2952e2dafa9221a8c6679712833b595aeb4cbe03 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Fri, 7 Apr 2023 21:50:30 -0700 Subject: [PATCH 031/104] Rebase Signed-off-by: Cuong Nguyen --- release/ray_release/result.py | 1 - 1 file changed, 1 deletion(-) diff --git a/release/ray_release/result.py b/release/ray_release/result.py index 94644fb6ea35..dac9ebd56b1b 100644 --- a/release/ray_release/result.py +++ b/release/ray_release/result.py @@ -34,7 +34,6 @@ class Result: stable: bool = True smoke_test: bool = False - buildkite_return_code: BuildkiteExitCode.SUCCESS buildkite_url: Optional[str] = None wheels_url: Optional[str] = None cluster_url: Optional[str] = None From a764f76f38012174610edae6527a50bc042b6dca Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Mon, 10 Apr 2023 10:03:08 -0700 Subject: [PATCH 032/104] Rebase Signed-off-by: Cuong Nguyen --- release/ray_release/result.py | 1 - 1 file changed, 1 deletion(-) diff --git a/release/ray_release/result.py b/release/ray_release/result.py index dac9ebd56b1b..e2ab72092ec2 100644 --- a/release/ray_release/result.py +++ b/release/ray_release/result.py @@ -1,6 +1,5 @@ import enum import os -import os from dataclasses import dataclass from typing import Optional, Dict, Tuple From e5b433fd1adf4c064571bf500b806018435b50dd Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Thu, 13 Apr 2023 13:58:27 -0700 Subject: [PATCH 033/104] Rebase Signed-off-by: Cuong Nguyen --- release/ray_release/result.py | 14 +++++++++----- release/run_release_test.sh | 2 ++ 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/release/ray_release/result.py b/release/ray_release/result.py index e2ab72092ec2..2efd3bb832c4 100644 --- a/release/ray_release/result.py +++ b/release/ray_release/result.py @@ -82,16 +82,20 @@ class ExitCode(enum.Enum): PREPARE_ERROR = 43 -def _is_transient_error(runtime: int) -> bool: +def _is_transient_error(result_status: ResultStatus, runtime: int) -> bool: """ Classify whether an infra-failure issue is a transient issue. This is based on the status of its previous retries, and its runtime. 
""" - retry_count = int(os.environ.get("BUILDKITE_RETRY_COUNT", "0")) - if retry_count > 0: - # Already retried at least once and failed again, not a transient issue + if result_status not in [ResultStatus.INFRA_ERROR, ResultStatus.INFRA_TIMEOUT]: + # Not even an infra failure + return False + retry_count = int(os.environ.get("BUILDKITE_RETRY_COUNT", 0)) + max_retry = int(os.environ.get("BUILDKITE_MAX_RETRIES", 1)) + if retry_count >= max_retry: + # Already reach retry limit return False - if runtime > 30 * 60: + if runtime > os.environ.get("BUILDKITE_TIME_LIMIT_FOR_RETRY", 0): # Take too long to run return False return True diff --git a/release/run_release_test.sh b/release/run_release_test.sh index 52b157a80c8f..95243e2b8826 100755 --- a/release/run_release_test.sh +++ b/release/run_release_test.sh @@ -133,11 +133,13 @@ while [ "$RETRY_NUM" -lt "$MAX_RETRIES" ]; do START=$(date +%s) set +e + START=`date +%s` trap _term SIGINT SIGTERM python "${RAY_TEST_SCRIPT}" "$@" & proc=$! wait "$proc" + END=`date +%s` EXIT_CODE=$? set -e From dec0ff67449d31ae0b4521b59e953ad1b5364afc Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Thu, 13 Apr 2023 14:05:38 -0700 Subject: [PATCH 034/104] debugging Signed-off-by: Cuong Nguyen --- release/ray_release/glue.py | 1 + 1 file changed, 1 insertion(+) diff --git a/release/ray_release/glue.py b/release/ray_release/glue.py index 46ef68f45ce5..2c81abd343dd 100644 --- a/release/ray_release/glue.py +++ b/release/ray_release/glue.py @@ -460,6 +460,7 @@ def run_release_test( # non critical for some tests. So separate it from the general one. fetch_result_exception = None try: + raise ReleaseTestSetupError('hahaha') buildkite_group(":spiral_note_pad: Loading test configuration") cluster_manager, command_runner, artifact_path = _load_test_configuration( test, From a753fcf702c486557ea776a8003c640d537be9b9 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Thu, 13 Apr 2023 15:11:26 -0700 Subject: [PATCH 035/104] fix sh Signed-off-by: Cuong Nguyen --- release/ray_release/result.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/release/ray_release/result.py b/release/ray_release/result.py index 2efd3bb832c4..6c3d5c1e76d8 100644 --- a/release/ray_release/result.py +++ b/release/ray_release/result.py @@ -95,7 +95,7 @@ def _is_transient_error(result_status: ResultStatus, runtime: int) -> bool: if retry_count >= max_retry: # Already reach retry limit return False - if runtime > os.environ.get("BUILDKITE_TIME_LIMIT_FOR_RETRY", 0): + if runtime > int(os.environ.get("BUILDKITE_TIME_LIMIT_FOR_RETRY", 0)): # Take too long to run return False return True From fdef99f97e4b14b014023fc7f0f7521b8882c2e1 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Thu, 13 Apr 2023 16:32:51 -0700 Subject: [PATCH 036/104] Fix sh again Signed-off-by: Cuong Nguyen --- release/run_release_test.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/release/run_release_test.sh b/release/run_release_test.sh index 95243e2b8826..52b157a80c8f 100755 --- a/release/run_release_test.sh +++ b/release/run_release_test.sh @@ -133,13 +133,11 @@ while [ "$RETRY_NUM" -lt "$MAX_RETRIES" ]; do START=$(date +%s) set +e - START=`date +%s` trap _term SIGINT SIGTERM python "${RAY_TEST_SCRIPT}" "$@" & proc=$! wait "$proc" - END=`date +%s` EXIT_CODE=$? 
set -e From bb5a57d97dc80772ef6d34b3f2df546c51d6ebea Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Thu, 13 Apr 2023 16:42:25 -0700 Subject: [PATCH 037/104] Remove debugging information Signed-off-by: Cuong Nguyen --- release/ray_release/glue.py | 1 - 1 file changed, 1 deletion(-) diff --git a/release/ray_release/glue.py b/release/ray_release/glue.py index 2c81abd343dd..46ef68f45ce5 100644 --- a/release/ray_release/glue.py +++ b/release/ray_release/glue.py @@ -460,7 +460,6 @@ def run_release_test( # non critical for some tests. So separate it from the general one. fetch_result_exception = None try: - raise ReleaseTestSetupError('hahaha') buildkite_group(":spiral_note_pad: Loading test configuration") cluster_manager, command_runner, artifact_path = _load_test_configuration( test, From 03f084806549bee91b7d2cecf0d5eb4f6ecf37dd Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Mon, 17 Apr 2023 12:42:17 -0700 Subject: [PATCH 038/104] Rebase Signed-off-by: Cuong Nguyen --- release/ray_release/result.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/release/ray_release/result.py b/release/ray_release/result.py index 6c3d5c1e76d8..5c7869d7d149 100644 --- a/release/ray_release/result.py +++ b/release/ray_release/result.py @@ -4,7 +4,6 @@ from typing import Optional, Dict, Tuple - class ResultStatus(enum.Enum): """ Overall status of the result test run @@ -20,7 +19,6 @@ class ResultStatus(enum.Enum): TIMEOUT = "timeout" - @dataclass class Result: results: Optional[Dict] = None @@ -88,7 +86,7 @@ def _is_transient_error(result_status: ResultStatus, runtime: int) -> bool: the status of its previous retries, and its runtime. """ if result_status not in [ResultStatus.INFRA_ERROR, ResultStatus.INFRA_TIMEOUT]: - # Not even an infra failure + # Not even an infra failure return False retry_count = int(os.environ.get("BUILDKITE_RETRY_COUNT", 0)) max_retry = int(os.environ.get("BUILDKITE_MAX_RETRIES", 1)) From 72fc32aeaa2c93e7f1715a4214a48c72a572cfa6 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Mon, 17 Apr 2023 14:10:50 -0700 Subject: [PATCH 039/104] Rebase Signed-off-by: Cuong Nguyen --- release/ray_release/tests/test_glue.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/release/ray_release/tests/test_glue.py b/release/ray_release/tests/test_glue.py index fd35efa77916..1cf9cdcf1dd2 100644 --- a/release/ray_release/tests/test_glue.py +++ b/release/ray_release/tests/test_glue.py @@ -621,7 +621,7 @@ def testFetchResultFailsReqNonEmptyResult(self): self._run(result) self.assertTrue(any("Could not fetch results" in o for o in cm.output)) self.assertEqual(result.return_code, ExitCode.FETCH_RESULT_ERROR.value) - self.assertEqual(result.status, "transient_infra_error") + self.assertEqual(result.status, "infra_error") # Ensure cluster was terminated, no matter what self.assertGreaterEqual(self.sdk.call_counter["terminate_cluster"], 1) From eda6a88f81ed0295c4a1a6d98fd1dd1d706b094e Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Tue, 4 Apr 2023 10:37:11 -0700 Subject: [PATCH 040/104] Auto-retry for infrastructure errors Signed-off-by: Cuong Nguyen --- release/ray_release/glue.py | 447 ++++++++++++++++------------------ release/ray_release/result.py | 12 + 2 files changed, 225 insertions(+), 234 deletions(-) diff --git a/release/ray_release/glue.py b/release/ray_release/glue.py index 46ef68f45ce5..50494aeb5cf0 100644 --- a/release/ray_release/glue.py +++ b/release/ray_release/glue.py @@ -6,11 +6,9 @@ from ray_release.alerts.handle import handle_result, require_result 
from ray_release.anyscale_util import get_cluster_name from ray_release.buildkite.output import buildkite_group, buildkite_open_last -from ray_release.cluster_manager.cluster_manager import ClusterManager from ray_release.cluster_manager.full import FullClusterManager from ray_release.cluster_manager.minimal import MinimalClusterManager from ray_release.command_runner.job_runner import JobRunner -from ray_release.command_runner.command_runner import CommandRunner from ray_release.command_runner.anyscale_job_runner import AnyscaleJobRunner from ray_release.command_runner.sdk_runner import SDKRunner from ray_release.config import ( @@ -92,33 +90,43 @@ def _get_extra_tags_from_env() -> dict: return {key.lower(): os.getenv(key, "") for key in env_vars} -def _load_test_configuration( +def run_release_test( test: Test, anyscale_project: str, result: Result, ray_wheels_url: str, + reporters: Optional[List[Reporter]] = None, smoke_test: bool = False, + cluster_id: Optional[str] = None, + cluster_env_id: Optional[str] = None, no_terminate: bool = False, -) -> Tuple[ClusterManager, CommandRunner, str]: +) -> Result: + buildkite_group(":spiral_note_pad: Loading test configuration") + validate_test(test) + logger.info(f"Test config: {test}") - # Populate result paramaters result.wheels_url = ray_wheels_url result.stable = test.get("stable", True) result.smoke_test = smoke_test + buildkite_url = os.getenv("BUILDKITE_BUILD_URL", "") buildkite_job_id = os.getenv("BUILDKITE_JOB_ID", "") + if buildkite_url: buildkite_url += "#" + buildkite_job_id + result.buildkite_url = buildkite_url result.buildkite_job_id = buildkite_job_id - # Setting up working directory working_dir = test["working_dir"] + + old_wd = os.getcwd() new_wd = os.path.join(RELEASE_PACKAGE_DIR, working_dir) os.chdir(new_wd) + start_time = time.monotonic() run_type = test["run"].get("type", DEFAULT_RUN_TYPE) # Workaround while Anyscale Jobs don't support leaving cluster alive @@ -153,6 +161,7 @@ def _load_test_configuration( logger.info(f"Got command runner cls: {command_runner_cls}") logger.info(f"Got file manager cls: {file_manager_cls}") + # Extra tags to be set on resources on cloud provider's side extra_tags = _get_extra_tags_from_env() # We don't need other attributes as they can be derived from the name @@ -176,267 +185,230 @@ def _load_test_configuration( except Exception as e: raise ReleaseTestSetupError(f"Error setting up release test: {e}") from e - return cluster_manager, command_runner, artifact_path + pipeline_exception = None + # non critical for some tests. So separate it from the general one. 
+ fetch_result_exception = None + try: + setup_signal_handling() + # Load configs + cluster_env = load_test_cluster_env(test, ray_wheels_url=ray_wheels_url) + cluster_compute = load_test_cluster_compute(test) + if cluster_env_id: + try: + cluster_manager.cluster_env_id = cluster_env_id + cluster_manager.build_cluster_env() + cluster_manager.fetch_build_info() + logger.info( + "Using overridden cluster environment with ID " + f"{cluster_env_id} and build ID " + f"{cluster_manager.cluster_env_build_id}" + ) + except Exception as e: + raise ClusterEnvCreateError( + f"Could not get existing overridden cluster environment " + f"{cluster_env_id}: {e}" + ) from e + else: + cluster_manager.set_cluster_env(cluster_env) -def _setup_cluster_environment( - test: Test, - result: Result, - cluster_manager: ClusterManager, - ray_wheels_url: str, - cluster_env_id: Optional[str], -) -> Tuple[str, int, int, int, int]: - setup_signal_handling() - # Load configs - cluster_env = load_test_cluster_env(test, ray_wheels_url=ray_wheels_url) - cluster_compute = load_test_cluster_compute(test) - - if cluster_env_id: - try: - cluster_manager.cluster_env_id = cluster_env_id - cluster_manager.build_cluster_env() - cluster_manager.fetch_build_info() - logger.info( - "Using overridden cluster environment with ID " - f"{cluster_env_id} and build ID " - f"{cluster_manager.cluster_env_build_id}" - ) - except Exception as e: - raise ClusterEnvCreateError( - f"Could not get existing overridden cluster environment " - f"{cluster_env_id}: {e}" - ) from e - else: - cluster_manager.set_cluster_env(cluster_env) + # Load some timeouts + build_timeout = int(test["run"].get("build_timeout", DEFAULT_BUILD_TIMEOUT)) + command_timeout = int(test["run"].get("timeout", DEFAULT_COMMAND_TIMEOUT)) + cluster_timeout = int( + test["run"].get("session_timeout", DEFAULT_CLUSTER_TIMEOUT) + ) - # Load some timeouts - build_timeout = int(test["run"].get("build_timeout", DEFAULT_BUILD_TIMEOUT)) - command_timeout = int(test["run"].get("timeout", DEFAULT_COMMAND_TIMEOUT)) - cluster_timeout = int(test["run"].get("session_timeout", DEFAULT_CLUSTER_TIMEOUT)) + # Get prepare command timeout, if any + prepare_cmd = test["run"].get("prepare", None) + if prepare_cmd: + prepare_timeout = test["run"].get("prepare_timeout", command_timeout) + else: + prepare_timeout = 0 - # Get prepare command timeout, if any - prepare_cmd = test["run"].get("prepare", None) - if prepare_cmd: - prepare_timeout = test["run"].get("prepare_timeout", command_timeout) - else: - prepare_timeout = 0 + # Base maximum uptime on the combined command and prepare timeouts + command_and_prepare_timeout = command_timeout + prepare_timeout - # Base maximum uptime on the combined command and prepare timeouts - command_and_prepare_timeout = command_timeout + prepare_timeout + # Use default timeout = 0 here if wait_for_nodes is empty. This is to make + # sure we don't inflate the maximum_uptime_minutes too much if we don't wait + # for nodes at all. + # The actual default will be otherwise loaded further down. + wait_timeout = int(test["run"].get("wait_for_nodes", {}).get("timeout", 0)) - # Use default timeout = 0 here if wait_for_nodes is empty. This is to make - # sure we don't inflate the maximum_uptime_minutes too much if we don't wait - # for nodes at all. - # The actual default will be otherwise loaded further down. 
- wait_timeout = int(test["run"].get("wait_for_nodes", {}).get("timeout", 0)) + autosuspend_mins = test["cluster"].get("autosuspend_mins", None) + if autosuspend_mins: + cluster_manager.autosuspend_minutes = autosuspend_mins + autosuspend_base = autosuspend_mins + else: + cluster_manager.autosuspend_minutes = min( + DEFAULT_AUTOSUSPEND_MINS, + int(command_and_prepare_timeout / 60) + TIMEOUT_BUFFER_MINUTES, + ) + # Maximum uptime should be based on the command timeout, not the + # DEFAULT_AUTOSUSPEND_MINS + autosuspend_base = ( + int(command_and_prepare_timeout / 60) + TIMEOUT_BUFFER_MINUTES + ) - autosuspend_mins = test["cluster"].get("autosuspend_mins", None) - if autosuspend_mins: - cluster_manager.autosuspend_minutes = autosuspend_mins - autosuspend_base = autosuspend_mins - else: - cluster_manager.autosuspend_minutes = min( - DEFAULT_AUTOSUSPEND_MINS, - int(command_and_prepare_timeout / 60) + TIMEOUT_BUFFER_MINUTES, - ) - # Maximum uptime should be based on the command timeout, not the - # DEFAULT_AUTOSUSPEND_MINS - autosuspend_base = ( - int(command_and_prepare_timeout / 60) + TIMEOUT_BUFFER_MINUTES - ) + maximum_uptime_minutes = test["cluster"].get("maximum_uptime_minutes", None) + if maximum_uptime_minutes: + cluster_manager.maximum_uptime_minutes = maximum_uptime_minutes + else: + cluster_manager.maximum_uptime_minutes = ( + autosuspend_base + wait_timeout + TIMEOUT_BUFFER_MINUTES + ) - maximum_uptime_minutes = test["cluster"].get("maximum_uptime_minutes", None) - if maximum_uptime_minutes: - cluster_manager.maximum_uptime_minutes = maximum_uptime_minutes - else: - cluster_manager.maximum_uptime_minutes = ( - autosuspend_base + wait_timeout + TIMEOUT_BUFFER_MINUTES + # Set cluster compute here. Note that this may use timeouts provided + # above. + cluster_manager.set_cluster_compute( + cluster_compute, + extra_tags=extra_tags, ) - # Set cluster compute here. Note that this may use timeouts provided - # above. 
- cluster_manager.set_cluster_compute( - cluster_compute, - extra_tags=result.extra_tags, - ) + buildkite_group(":nut_and_bolt: Setting up local environment") + driver_setup_script = test.get("driver_setup", None) + if driver_setup_script: + try: + run_bash_script(driver_setup_script) + except Exception as e: + raise LocalEnvSetupError(f"Driver setup script failed: {e}") from e - return prepare_cmd, prepare_timeout, build_timeout, cluster_timeout, command_timeout + # Install local dependencies + command_runner.prepare_local_env(ray_wheels_url) + # Re-install anyscale package as local dependencies might have changed + # from local env setup + reinstall_anyscale_dependencies() -def _setup_local_environment( - test: Test, - command_runner: CommandRunner, - ray_wheels_url: str, -) -> None: - driver_setup_script = test.get("driver_setup", None) - if driver_setup_script: - try: - run_bash_script(driver_setup_script) - except Exception as e: - raise LocalEnvSetupError(f"Driver setup script failed: {e}") from e + # Print installed pip packages + buildkite_group(":bulb: Local environment information") + pip_packages = get_pip_packages() + pip_package_string = "\n".join(pip_packages) + logger.info(f"Installed python packages:\n{pip_package_string}") - # Install local dependencies - command_runner.prepare_local_env(ray_wheels_url) + if isinstance(cluster_manager, FullClusterManager): + if not no_terminate: + register_handler( + lambda sig, frame: cluster_manager.terminate_cluster(wait=True) + ) + + # Start cluster + if cluster_id: + buildkite_group(":rocket: Using existing cluster") + # Re-use existing cluster ID for development + cluster_manager.cluster_id = cluster_id + cluster_manager.cluster_name = get_cluster_name(cluster_id) + else: + buildkite_group(":gear: Building cluster environment") - # Re-install anyscale package as local dependencies might have changed - # from local env setup - reinstall_anyscale_dependencies() + if cluster_env_id: + cluster_manager.cluster_env_id = cluster_env_id + cluster_manager.build_configs(timeout=build_timeout) -def _local_environment_information( - result: Result, - cluster_manager: ClusterManager, - command_runner: CommandRunner, - build_timeout: int, - cluster_timeout: int, - no_terminate: bool, - cluster_id: Optional[str], - cluster_env_id: Optional[str], -) -> None: - pip_packages = get_pip_packages() - pip_package_string = "\n".join(pip_packages) - logger.info(f"Installed python packages:\n{pip_package_string}") - - if isinstance(cluster_manager, FullClusterManager): - if not no_terminate: - register_handler( - lambda sig, frame: cluster_manager.terminate_cluster(wait=True) - ) + if isinstance(cluster_manager, FullClusterManager): + buildkite_group(":rocket: Starting up cluster") + cluster_manager.start_cluster(timeout=cluster_timeout) + elif isinstance(command_runner, AnyscaleJobRunner): + command_runner.job_manager.cluster_startup_timeout = cluster_timeout - # Start cluster - if cluster_id: - buildkite_group(":rocket: Using existing cluster") - # Re-use existing cluster ID for development - cluster_manager.cluster_id = cluster_id - cluster_manager.cluster_name = get_cluster_name(cluster_id) - else: - buildkite_group(":gear: Building cluster environment") + result.cluster_url = cluster_manager.get_cluster_url() + result.cluster_id = cluster_manager.cluster_id - if cluster_env_id: - cluster_manager.cluster_env_id = cluster_env_id + # Upload files + buildkite_group(":wrench: Preparing remote environment") + command_runner.prepare_remote_env() - 
cluster_manager.build_configs(timeout=build_timeout) + wait_for_nodes = test["run"].get("wait_for_nodes", None) - if isinstance(cluster_manager, FullClusterManager): - buildkite_group(":rocket: Starting up cluster") - cluster_manager.start_cluster(timeout=cluster_timeout) - elif isinstance(command_runner, AnyscaleJobRunner): - command_runner.job_manager.cluster_startup_timeout = cluster_timeout + if wait_for_nodes: + buildkite_group(":stopwatch: Waiting for nodes to come up") + # Overwrite wait_timeout from above to account for better default + wait_timeout = int( + wait_for_nodes.get("timeout", DEFAULT_WAIT_FOR_NODES_TIMEOUT) + ) + num_nodes = test["run"]["wait_for_nodes"]["num_nodes"] + command_runner.wait_for_nodes(num_nodes, wait_timeout) - result.cluster_url = cluster_manager.get_cluster_url() - result.cluster_id = cluster_manager.cluster_id + if prepare_cmd: + try: + command_runner.run_prepare_command(prepare_cmd, timeout=prepare_timeout) + except CommandError as e: + raise PrepareCommandError(e) + except CommandTimeout as e: + raise PrepareCommandTimeout(e) + buildkite_group(":runner: Running test script") + command = test["run"]["script"] + command_env = {} -def _prepare_remote_environment( - test: Test, - command_runner: CommandRunner, - prepare_cmd: bool, - prepare_timeout: int, -) -> None: - command_runner.prepare_remote_env() - - wait_for_nodes = test["run"].get("wait_for_nodes", None) - - if wait_for_nodes: - buildkite_group(":stopwatch: Waiting for nodes to come up") - # Overwrite wait_timeout from above to account for better default - wait_timeout = int( - wait_for_nodes.get("timeout", DEFAULT_WAIT_FOR_NODES_TIMEOUT) - ) - num_nodes = test["run"]["wait_for_nodes"]["num_nodes"] - command_runner.wait_for_nodes(num_nodes, wait_timeout) + if smoke_test: + command = f"{command} --smoke-test" + command_env["IS_SMOKE_TEST"] = "1" + + is_long_running = test["run"].get("long_running", False) + + start_time_unix = time.time() - if prepare_cmd: try: - command_runner.run_prepare_command(prepare_cmd, timeout=prepare_timeout) + command_runner.run_command( + command, + env=command_env, + timeout=command_timeout, + raise_on_timeout=not is_long_running, + ) + except ( + TestCommandError, + PrepareCommandError, + TestCommandTimeout, + PrepareCommandTimeout, + ) as e: + raise e except CommandError as e: - raise PrepareCommandError(e) + raise TestCommandError(e) except CommandTimeout as e: - raise PrepareCommandTimeout(e) - + if not is_long_running: + # Only raise error if command is not long running + raise TestCommandTimeout(e) -def _running_test_script( - test: Test, - smoke_test: bool, - command_runner: CommandRunner, - command_timeout: int, -) -> None: - command = test["run"]["script"] - command_env = {} - - if smoke_test: - command = f"{command} --smoke-test" - command_env["IS_SMOKE_TEST"] = "1" + buildkite_group(":floppy_disk: Fetching results") + try: + command_results = command_runner.fetch_results() + except Exception as e: + logger.exception(f"Could not fetch results for test command: {e}") + command_results = {} + fetch_result_exception = e - is_long_running = test["run"].get("long_running", False) + if artifact_path: + try: + command_runner.fetch_artifact() + except Exception as e: + logger.error("Could not fetch artifact for test command") + logger.exception(e) - try: - command_runner.run_command( - command, - env=command_env, - timeout=command_timeout, - raise_on_timeout=not is_long_running, - ) - except ( - TestCommandError, - PrepareCommandError, - TestCommandTimeout, - 
PrepareCommandTimeout, - ) as e: - raise e - except CommandError as e: - raise TestCommandError(e) - except CommandTimeout as e: - if not is_long_running: - # Only raise error if command is not long running - raise TestCommandTimeout(e) - - -def _fetching_results( - result: Result, - command_runner: CommandRunner, - artifact_path: Optional[str], - smoke_test: bool, - start_time_unix: int, -) -> Tuple[dict, Exception]: - fetch_result_exception = None - try: - command_results = command_runner.fetch_results() - except Exception as e: - logger.exception(f"Could not fetch results for test command: {e}") - command_results = {} - fetch_result_exception = e + # Postprocess result: + if "last_update" in command_results: + command_results["last_update_diff"] = time.time() - command_results.get( + "last_update", 0.0 + ) - if artifact_path: try: - command_runner.fetch_artifact() + # Logic duplicated in ray_release/command_runner/_anyscale_job_wrapper.py + # Timeout is the time the test took divided by 200 + # (~7 minutes for a 24h test) but no less than 30s + # and no more than 900s + metrics_timeout = max(30, min((time.time() - start_time_unix) / 200, 900)) + command_runner.save_metrics(start_time_unix, timeout=metrics_timeout) + metrics = command_runner.fetch_metrics() except Exception as e: - logger.error("Could not fetch artifact for test command") - logger.exception(e) - - # Postprocess result: - if "last_update" in command_results: - command_results["last_update_diff"] = time.time() - command_results.get( - "last_update", 0.0 - ) - - try: - # Logic duplicated in ray_release/command_runner/_anyscale_job_wrapper.py - # Timeout is the time the test took divided by 200 - # (~7 minutes for a 24h test) but no less than 30s - # and no more than 900s - metrics_timeout = max(30, min((time.time() - start_time_unix) / 200, 900)) - command_runner.save_metrics(start_time_unix, timeout=metrics_timeout) - metrics = command_runner.fetch_metrics() - except Exception as e: - logger.exception(f"Could not fetch metrics for test command: {e}") - metrics = {} + logger.exception(f"Could not fetch metrics for test command: {e}") + metrics = {} - if smoke_test: - command_results["smoke_test"] = True + if smoke_test: + command_results["smoke_test"] = True - result.results = command_results - result.status = "finished" + result.results = command_results + result.status = "finished" return metrics, fetch_result_exception @@ -544,7 +516,10 @@ def run_release_test( if not no_terminate and cluster_manager: buildkite_group(":earth_africa: Terminating cluster") - cluster_manager.terminate_cluster(wait=False) + try: + cluster_manager.terminate_cluster(wait=False) + except Exception as e: + logger.exception(f"Could not terminate cluster: {e}") if hasattr(command_runner, "cleanup"): command_runner.cleanup() @@ -586,8 +561,12 @@ def run_release_test( result.last_logs = traceback.format_exc() buildkite_group(":memo: Reporting results", open=True) - for reporter in reporters or []: - reporter.report_result(test, result) + reporters = reporters or [] + for reporter in reporters: + try: + reporter.report_result(test, result) + except Exception as e: + logger.exception(f"Error reporting results via {type(reporter)}: {e}") if pipeline_exception: raise pipeline_exception diff --git a/release/ray_release/result.py b/release/ray_release/result.py index 5c7869d7d149..7ae796be84d6 100644 --- a/release/ray_release/result.py +++ b/release/ray_release/result.py @@ -79,6 +79,18 @@ class ExitCode(enum.Enum): COMMAND_TIMEOUT = 42 PREPARE_ERROR = 43 
+class BuildkiteExitCode(enum.Enum): + """ + Final exit code the test runner passes to buildkite-agent. This exit code is used + to determine job policies, such as automatic retries + """ + SUCCESS = 0 + UNKNOWN = 1 + TRANSIENT_INFRA_ERROR = 10 + INFRA_ERROR = 11 + INFRA_TIMEOUT = 30 + ERROR = 40 + TIMEOUT = 42 def _is_transient_error(result_status: ResultStatus, runtime: int) -> bool: """ From 2568b0238d7de6e61a1a111862e64d9d17c8ab9b Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Tue, 4 Apr 2023 10:59:29 -0700 Subject: [PATCH 041/104] Exit buildkite job using buildkite return code Signed-off-by: Cuong Nguyen --- release/ray_release/buildkite/step.py | 11 +++++++ release/ray_release/glue.py | 3 -- release/ray_release/result.py | 1 + .../ray_release/scripts/run_release_test.py | 32 ++++++++----------- 4 files changed, 25 insertions(+), 22 deletions(-) diff --git a/release/ray_release/buildkite/step.py b/release/ray_release/buildkite/step.py index a13bde1575d8..9fcae90beebd 100644 --- a/release/ray_release/buildkite/step.py +++ b/release/ray_release/buildkite/step.py @@ -15,6 +15,7 @@ from ray_release.env import DEFAULT_ENVIRONMENT, load_environment from ray_release.template import get_test_env_var from ray_release.util import python_version_str, DeferredEnvVar +from ray_release.result import BuildkiteExitCode DEFAULT_ARTIFACTS_DIR_HOST = "/tmp/ray_release_test_artifacts" @@ -121,6 +122,16 @@ def get_step( if test.get("run", {}).get("type") == "client": step["agents"]["queue"] = str(RELEASE_QUEUE_CLIENT) + # Auto-retry on transient infra error (according to result.BuildkiteExitCode) + step["retry"] = { + "automatic": [ + { + "exit_status": BuildkiteExitCode.TRANSIENT_INFRA_ERROR, + "limit": 2, + } + ] + } + # If a test is not stable, allow to soft fail stable = test.get("stable", True) if not stable: diff --git a/release/ray_release/glue.py b/release/ray_release/glue.py index 50494aeb5cf0..339ad99c4e68 100644 --- a/release/ray_release/glue.py +++ b/release/ray_release/glue.py @@ -568,7 +568,4 @@ def run_release_test( except Exception as e: logger.exception(f"Error reporting results via {type(reporter)}: {e}") - if pipeline_exception: - raise pipeline_exception - return result diff --git a/release/ray_release/result.py b/release/ray_release/result.py index 7ae796be84d6..a1dab9b42d16 100644 --- a/release/ray_release/result.py +++ b/release/ray_release/result.py @@ -31,6 +31,7 @@ class Result: stable: bool = True smoke_test: bool = False + buildkite_return_code: BuildkiteExitCode.SUCCESS buildkite_url: Optional[str] = None wheels_url: Optional[str] = None cluster_url: Optional[str] = None diff --git a/release/ray_release/scripts/run_release_test.py b/release/ray_release/scripts/run_release_test.py index b259e5d3bfc9..0be02dde3923 100644 --- a/release/ray_release/scripts/run_release_test.py +++ b/release/ray_release/scripts/run_release_test.py @@ -148,28 +148,22 @@ def main( if report: reporters.append(DBReporter()) - try: - result = run_release_test( - test, - anyscale_project=anyscale_project, - result=result, - ray_wheels_url=ray_wheels_url, - reporters=reporters, - smoke_test=smoke_test, - cluster_id=cluster_id, - cluster_env_id=cluster_env_id, - no_terminate=no_terminate, - ) - return_code = result.return_code - except ReleaseTestError as e: - logger.exception(e) - return_code = e.exit_code.value - + result = run_release_test( + test, + anyscale_project=anyscale_project, + result=result, + ray_wheels_url=ray_wheels_url, + reporters=reporters, + smoke_test=smoke_test, + 
cluster_id=cluster_id, + cluster_env_id=cluster_env_id, + no_terminate=no_terminate, + ) logger.info( f"Release test pipeline for test {test['name']} completed. " - f"Returning with exit code = {return_code}" + f"Returning with exit code = {result.return_code}" ) - sys.exit(return_code) + sys.exit(result.buildkite_return_code) if __name__ == "__main__": From 12ce6097a1672271d74164d8b2d6e29120e27f03 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Tue, 4 Apr 2023 11:16:38 -0700 Subject: [PATCH 042/104] Handle everything through result exceptions Signed-off-by: Cuong Nguyen --- release/ray_release/glue.py | 447 +++++++++++++++++++----------------- 1 file changed, 234 insertions(+), 213 deletions(-) diff --git a/release/ray_release/glue.py b/release/ray_release/glue.py index 339ad99c4e68..7232e9b81273 100644 --- a/release/ray_release/glue.py +++ b/release/ray_release/glue.py @@ -6,9 +6,11 @@ from ray_release.alerts.handle import handle_result, require_result from ray_release.anyscale_util import get_cluster_name from ray_release.buildkite.output import buildkite_group, buildkite_open_last +from ray_release.cluster_manager.cluster_manager import ClusterManager from ray_release.cluster_manager.full import FullClusterManager from ray_release.cluster_manager.minimal import MinimalClusterManager from ray_release.command_runner.job_runner import JobRunner +from ray_release.command_runner.command_runner import CommandRunner from ray_release.command_runner.anyscale_job_runner import AnyscaleJobRunner from ray_release.command_runner.sdk_runner import SDKRunner from ray_release.config import ( @@ -90,43 +92,33 @@ def _get_extra_tags_from_env() -> dict: return {key.lower(): os.getenv(key, "") for key in env_vars} -def run_release_test( +def _load_test_configuration( test: Test, anyscale_project: str, result: Result, ray_wheels_url: str, - reporters: Optional[List[Reporter]] = None, smoke_test: bool = False, - cluster_id: Optional[str] = None, - cluster_env_id: Optional[str] = None, no_terminate: bool = False, -) -> Result: - buildkite_group(":spiral_note_pad: Loading test configuration") - +) -> Tuple[ClusterManager, CommandRunner, str]: validate_test(test) - logger.info(f"Test config: {test}") + # Populate result paramaters result.wheels_url = ray_wheels_url result.stable = test.get("stable", True) result.smoke_test = smoke_test - buildkite_url = os.getenv("BUILDKITE_BUILD_URL", "") buildkite_job_id = os.getenv("BUILDKITE_JOB_ID", "") - if buildkite_url: buildkite_url += "#" + buildkite_job_id - result.buildkite_url = buildkite_url result.buildkite_job_id = buildkite_job_id + # Setting up working directory working_dir = test["working_dir"] - - old_wd = os.getcwd() new_wd = os.path.join(RELEASE_PACKAGE_DIR, working_dir) os.chdir(new_wd) - start_time = time.monotonic() run_type = test["run"].get("type", DEFAULT_RUN_TYPE) # Workaround while Anyscale Jobs don't support leaving cluster alive @@ -161,7 +153,6 @@ def run_release_test( logger.info(f"Got command runner cls: {command_runner_cls}") logger.info(f"Got file manager cls: {file_manager_cls}") - # Extra tags to be set on resources on cloud provider's side extra_tags = _get_extra_tags_from_env() # We don't need other attributes as they can be derived from the name @@ -185,230 +176,267 @@ def run_release_test( except Exception as e: raise ReleaseTestSetupError(f"Error setting up release test: {e}") from e - pipeline_exception = None - # non critical for some tests. So separate it from the general one. 
- fetch_result_exception = None - try: - setup_signal_handling() - # Load configs - cluster_env = load_test_cluster_env(test, ray_wheels_url=ray_wheels_url) - cluster_compute = load_test_cluster_compute(test) - - if cluster_env_id: - try: - cluster_manager.cluster_env_id = cluster_env_id - cluster_manager.build_cluster_env() - cluster_manager.fetch_build_info() - logger.info( - "Using overridden cluster environment with ID " - f"{cluster_env_id} and build ID " - f"{cluster_manager.cluster_env_build_id}" - ) - except Exception as e: - raise ClusterEnvCreateError( - f"Could not get existing overridden cluster environment " - f"{cluster_env_id}: {e}" - ) from e - else: - cluster_manager.set_cluster_env(cluster_env) + return cluster_manager, command_runner, artifact_path - # Load some timeouts - build_timeout = int(test["run"].get("build_timeout", DEFAULT_BUILD_TIMEOUT)) - command_timeout = int(test["run"].get("timeout", DEFAULT_COMMAND_TIMEOUT)) - cluster_timeout = int( - test["run"].get("session_timeout", DEFAULT_CLUSTER_TIMEOUT) - ) - # Get prepare command timeout, if any - prepare_cmd = test["run"].get("prepare", None) - if prepare_cmd: - prepare_timeout = test["run"].get("prepare_timeout", command_timeout) - else: - prepare_timeout = 0 +def _setup_cluster_environment( + test: Test, + result: Result, + cluster_manager: ClusterManager, + ray_wheels_url: str, + cluster_env_id: Optional[str], +) -> Tuple[str, int, int, int, int]: + setup_signal_handling() + # Load configs + cluster_env = load_test_cluster_env(test, ray_wheels_url=ray_wheels_url) + cluster_compute = load_test_cluster_compute(test) + + if cluster_env_id: + try: + cluster_manager.cluster_env_id = cluster_env_id + cluster_manager.build_cluster_env() + cluster_manager.fetch_build_info() + logger.info( + "Using overridden cluster environment with ID " + f"{cluster_env_id} and build ID " + f"{cluster_manager.cluster_env_build_id}" + ) + except Exception as e: + raise ClusterEnvCreateError( + f"Could not get existing overridden cluster environment " + f"{cluster_env_id}: {e}" + ) from e + else: + cluster_manager.set_cluster_env(cluster_env) - # Base maximum uptime on the combined command and prepare timeouts - command_and_prepare_timeout = command_timeout + prepare_timeout + # Load some timeouts + build_timeout = int(test["run"].get("build_timeout", DEFAULT_BUILD_TIMEOUT)) + command_timeout = int(test["run"].get("timeout", DEFAULT_COMMAND_TIMEOUT)) + cluster_timeout = int(test["run"].get("session_timeout", DEFAULT_CLUSTER_TIMEOUT)) - # Use default timeout = 0 here if wait_for_nodes is empty. This is to make - # sure we don't inflate the maximum_uptime_minutes too much if we don't wait - # for nodes at all. - # The actual default will be otherwise loaded further down. 
- wait_timeout = int(test["run"].get("wait_for_nodes", {}).get("timeout", 0)) + # Get prepare command timeout, if any + prepare_cmd = test["run"].get("prepare", None) + if prepare_cmd: + prepare_timeout = test["run"].get("prepare_timeout", command_timeout) + else: + prepare_timeout = 0 - autosuspend_mins = test["cluster"].get("autosuspend_mins", None) - if autosuspend_mins: - cluster_manager.autosuspend_minutes = autosuspend_mins - autosuspend_base = autosuspend_mins - else: - cluster_manager.autosuspend_minutes = min( - DEFAULT_AUTOSUSPEND_MINS, - int(command_and_prepare_timeout / 60) + TIMEOUT_BUFFER_MINUTES, - ) - # Maximum uptime should be based on the command timeout, not the - # DEFAULT_AUTOSUSPEND_MINS - autosuspend_base = ( - int(command_and_prepare_timeout / 60) + TIMEOUT_BUFFER_MINUTES - ) + # Base maximum uptime on the combined command and prepare timeouts + command_and_prepare_timeout = command_timeout + prepare_timeout - maximum_uptime_minutes = test["cluster"].get("maximum_uptime_minutes", None) - if maximum_uptime_minutes: - cluster_manager.maximum_uptime_minutes = maximum_uptime_minutes - else: - cluster_manager.maximum_uptime_minutes = ( - autosuspend_base + wait_timeout + TIMEOUT_BUFFER_MINUTES - ) + # Use default timeout = 0 here if wait_for_nodes is empty. This is to make + # sure we don't inflate the maximum_uptime_minutes too much if we don't wait + # for nodes at all. + # The actual default will be otherwise loaded further down. + wait_timeout = int(test["run"].get("wait_for_nodes", {}).get("timeout", 0)) - # Set cluster compute here. Note that this may use timeouts provided - # above. - cluster_manager.set_cluster_compute( - cluster_compute, - extra_tags=extra_tags, + autosuspend_mins = test["cluster"].get("autosuspend_mins", None) + if autosuspend_mins: + cluster_manager.autosuspend_minutes = autosuspend_mins + autosuspend_base = autosuspend_mins + else: + cluster_manager.autosuspend_minutes = min( + DEFAULT_AUTOSUSPEND_MINS, + int(command_and_prepare_timeout / 60) + TIMEOUT_BUFFER_MINUTES, + ) + # Maximum uptime should be based on the command timeout, not the + # DEFAULT_AUTOSUSPEND_MINS + autosuspend_base = ( + int(command_and_prepare_timeout / 60) + TIMEOUT_BUFFER_MINUTES ) - buildkite_group(":nut_and_bolt: Setting up local environment") - driver_setup_script = test.get("driver_setup", None) - if driver_setup_script: - try: - run_bash_script(driver_setup_script) - except Exception as e: - raise LocalEnvSetupError(f"Driver setup script failed: {e}") from e - - # Install local dependencies - command_runner.prepare_local_env(ray_wheels_url) + maximum_uptime_minutes = test["cluster"].get("maximum_uptime_minutes", None) + if maximum_uptime_minutes: + cluster_manager.maximum_uptime_minutes = maximum_uptime_minutes + else: + cluster_manager.maximum_uptime_minutes = ( + autosuspend_base + wait_timeout + TIMEOUT_BUFFER_MINUTES + ) - # Re-install anyscale package as local dependencies might have changed - # from local env setup - reinstall_anyscale_dependencies() + # Set cluster compute here. Note that this may use timeouts provided + # above. 
+ cluster_manager.set_cluster_compute( + cluster_compute, + extra_tags=result.extra_tags, + ) - # Print installed pip packages - buildkite_group(":bulb: Local environment information") - pip_packages = get_pip_packages() - pip_package_string = "\n".join(pip_packages) - logger.info(f"Installed python packages:\n{pip_package_string}") + return prepare_cmd, prepare_timeout, build_timeout, cluster_timeout, command_timeout - if isinstance(cluster_manager, FullClusterManager): - if not no_terminate: - register_handler( - lambda sig, frame: cluster_manager.terminate_cluster(wait=True) - ) - - # Start cluster - if cluster_id: - buildkite_group(":rocket: Using existing cluster") - # Re-use existing cluster ID for development - cluster_manager.cluster_id = cluster_id - cluster_manager.cluster_name = get_cluster_name(cluster_id) - else: - buildkite_group(":gear: Building cluster environment") - if cluster_env_id: - cluster_manager.cluster_env_id = cluster_env_id +def _setup_local_environment( + test: Test, + command_runner: CommandRunner, + ray_wheels_url: str, +) -> None: + driver_setup_script = test.get("driver_setup", None) + if driver_setup_script: + try: + run_bash_script(driver_setup_script) + except Exception as e: + raise LocalEnvSetupError(f"Driver setup script failed: {e}") from e - cluster_manager.build_configs(timeout=build_timeout) + # Install local dependencies + command_runner.prepare_local_env(ray_wheels_url) - if isinstance(cluster_manager, FullClusterManager): - buildkite_group(":rocket: Starting up cluster") - cluster_manager.start_cluster(timeout=cluster_timeout) - elif isinstance(command_runner, AnyscaleJobRunner): - command_runner.job_manager.cluster_startup_timeout = cluster_timeout + # Re-install anyscale package as local dependencies might have changed + # from local env setup + reinstall_anyscale_dependencies() - result.cluster_url = cluster_manager.get_cluster_url() - result.cluster_id = cluster_manager.cluster_id - # Upload files - buildkite_group(":wrench: Preparing remote environment") - command_runner.prepare_remote_env() +def _local_environment_information( + result: Result, + cluster_manager: ClusterManager, + command_runner: CommandRunner, + build_timeout: int, + cluster_timeout: int, + no_terminate: bool, + cluster_id: Optional[str], + cluster_env_id: Optional[str], +) -> None: + pip_packages = get_pip_packages() + pip_package_string = "\n".join(pip_packages) + logger.info(f"Installed python packages:\n{pip_package_string}") + + if isinstance(cluster_manager, FullClusterManager): + if not no_terminate: + register_handler( + lambda sig, frame: cluster_manager.terminate_cluster(wait=True) + ) - wait_for_nodes = test["run"].get("wait_for_nodes", None) + # Start cluster + if cluster_id: + buildkite_group(":rocket: Using existing cluster") + # Re-use existing cluster ID for development + cluster_manager.cluster_id = cluster_id + cluster_manager.cluster_name = get_cluster_name(cluster_id) + else: + buildkite_group(":gear: Building cluster environment") - if wait_for_nodes: - buildkite_group(":stopwatch: Waiting for nodes to come up") - # Overwrite wait_timeout from above to account for better default - wait_timeout = int( - wait_for_nodes.get("timeout", DEFAULT_WAIT_FOR_NODES_TIMEOUT) - ) - num_nodes = test["run"]["wait_for_nodes"]["num_nodes"] - command_runner.wait_for_nodes(num_nodes, wait_timeout) + if cluster_env_id: + cluster_manager.cluster_env_id = cluster_env_id - if prepare_cmd: - try: - command_runner.run_prepare_command(prepare_cmd, timeout=prepare_timeout) - 
except CommandError as e: - raise PrepareCommandError(e) - except CommandTimeout as e: - raise PrepareCommandTimeout(e) + cluster_manager.build_configs(timeout=build_timeout) - buildkite_group(":runner: Running test script") - command = test["run"]["script"] - command_env = {} + if isinstance(cluster_manager, FullClusterManager): + buildkite_group(":rocket: Starting up cluster") + cluster_manager.start_cluster(timeout=cluster_timeout) + elif isinstance(command_runner, AnyscaleJobRunner): + command_runner.job_manager.cluster_startup_timeout = cluster_timeout - if smoke_test: - command = f"{command} --smoke-test" - command_env["IS_SMOKE_TEST"] = "1" + result.cluster_url = cluster_manager.get_cluster_url() + result.cluster_id = cluster_manager.cluster_id - is_long_running = test["run"].get("long_running", False) - start_time_unix = time.time() +def _prepare_remote_environment( + test: Test, + command_runner: CommandRunner, + prepare_cmd: bool, + prepare_timeout: int, +) -> None: + command_runner.prepare_remote_env() + + wait_for_nodes = test["run"].get("wait_for_nodes", None) + + if wait_for_nodes: + buildkite_group(":stopwatch: Waiting for nodes to come up") + # Overwrite wait_timeout from above to account for better default + wait_timeout = int( + wait_for_nodes.get("timeout", DEFAULT_WAIT_FOR_NODES_TIMEOUT) + ) + num_nodes = test["run"]["wait_for_nodes"]["num_nodes"] + command_runner.wait_for_nodes(num_nodes, wait_timeout) + if prepare_cmd: try: - command_runner.run_command( - command, - env=command_env, - timeout=command_timeout, - raise_on_timeout=not is_long_running, - ) - except ( - TestCommandError, - PrepareCommandError, - TestCommandTimeout, - PrepareCommandTimeout, - ) as e: - raise e + command_runner.run_prepare_command(prepare_cmd, timeout=prepare_timeout) except CommandError as e: - raise TestCommandError(e) + raise PrepareCommandError(e) except CommandTimeout as e: - if not is_long_running: - # Only raise error if command is not long running - raise TestCommandTimeout(e) + raise PrepareCommandTimeout(e) - buildkite_group(":floppy_disk: Fetching results") - try: - command_results = command_runner.fetch_results() - except Exception as e: - logger.exception(f"Could not fetch results for test command: {e}") - command_results = {} - fetch_result_exception = e - if artifact_path: - try: - command_runner.fetch_artifact() - except Exception as e: - logger.error("Could not fetch artifact for test command") - logger.exception(e) +def _running_test_script( + test: Test, + smoke_test: bool, + command_runner: CommandRunner, + command_timeout: int, +) -> None: + command = test["run"]["script"] + command_env = {} - # Postprocess result: - if "last_update" in command_results: - command_results["last_update_diff"] = time.time() - command_results.get( - "last_update", 0.0 - ) + if smoke_test: + command = f"{command} --smoke-test" + command_env["IS_SMOKE_TEST"] = "1" + + is_long_running = test["run"].get("long_running", False) + + try: + command_runner.run_command( + command, + env=command_env, + timeout=command_timeout, + raise_on_timeout=not is_long_running, + ) + except ( + TestCommandError, + PrepareCommandError, + TestCommandTimeout, + PrepareCommandTimeout, + ) as e: + raise e + except CommandError as e: + raise TestCommandError(e) + except CommandTimeout as e: + if not is_long_running: + # Only raise error if command is not long running + raise TestCommandTimeout(e) + + +def _fetching_results( + result: Result, + command_runner: CommandRunner, + artifact_path: Optional[str], + smoke_test: 
bool, + start_time_unix: int, +) -> Tuple[dict, Exception]: + fetch_result_exception = None + try: + command_results = command_runner.fetch_results() + except Exception as e: + logger.exception(f"Could not fetch results for test command: {e}") + command_results = {} + fetch_result_exception = e + if artifact_path: try: - # Logic duplicated in ray_release/command_runner/_anyscale_job_wrapper.py - # Timeout is the time the test took divided by 200 - # (~7 minutes for a 24h test) but no less than 30s - # and no more than 900s - metrics_timeout = max(30, min((time.time() - start_time_unix) / 200, 900)) - command_runner.save_metrics(start_time_unix, timeout=metrics_timeout) - metrics = command_runner.fetch_metrics() + command_runner.fetch_artifact() except Exception as e: - logger.exception(f"Could not fetch metrics for test command: {e}") - metrics = {} + logger.error("Could not fetch artifact for test command") + logger.exception(e) + + # Postprocess result: + if "last_update" in command_results: + command_results["last_update_diff"] = time.time() - command_results.get( + "last_update", 0.0 + ) + + try: + # Logic duplicated in ray_release/command_runner/_anyscale_job_wrapper.py + # Timeout is the time the test took divided by 200 + # (~7 minutes for a 24h test) but no less than 30s + # and no more than 900s + metrics_timeout = max(30, min((time.time() - start_time_unix) / 200, 900)) + command_runner.save_metrics(start_time_unix, timeout=metrics_timeout) + metrics = command_runner.fetch_metrics() + except Exception as e: + logger.exception(f"Could not fetch metrics for test command: {e}") + metrics = {} - if smoke_test: - command_results["smoke_test"] = True + if smoke_test: + command_results["smoke_test"] = True - result.results = command_results - result.status = "finished" + result.results = command_results + result.status = "finished" return metrics, fetch_result_exception @@ -516,10 +544,7 @@ def run_release_test( if not no_terminate and cluster_manager: buildkite_group(":earth_africa: Terminating cluster") - try: - cluster_manager.terminate_cluster(wait=False) - except Exception as e: - logger.exception(f"Could not terminate cluster: {e}") + cluster_manager.terminate_cluster(wait=False) if hasattr(command_runner, "cleanup"): command_runner.cleanup() @@ -561,11 +586,7 @@ def run_release_test( result.last_logs = traceback.format_exc() buildkite_group(":memo: Reporting results", open=True) - reporters = reporters or [] - for reporter in reporters: - try: - reporter.report_result(test, result) - except Exception as e: - logger.exception(f"Error reporting results via {type(reporter)}: {e}") + for reporter in reporters or []: + reporter.report_result(test, result) return result From e07f3e0dac28f6fb769ea9a8cdf95bd547353f0a Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Tue, 4 Apr 2023 11:19:03 -0700 Subject: [PATCH 043/104] Throw and retry on purpose Signed-off-by: Cuong Nguyen --- release/ray_release/glue.py | 1 + 1 file changed, 1 insertion(+) diff --git a/release/ray_release/glue.py b/release/ray_release/glue.py index 7232e9b81273..66a95831b529 100644 --- a/release/ray_release/glue.py +++ b/release/ray_release/glue.py @@ -460,6 +460,7 @@ def run_release_test( # non critical for some tests. So separate it from the general one. 
fetch_result_exception = None try: + raise ReleaseTestConfigError() buildkite_group(":spiral_note_pad: Loading test configuration") cluster_manager, command_runner, artifact_path = _load_test_configuration( test, From ee21c0bade656eb5740670b03d09137f17141ee0 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Tue, 4 Apr 2023 11:25:05 -0700 Subject: [PATCH 044/104] Fix things Signed-off-by: Cuong Nguyen --- release/ray_release/result.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/release/ray_release/result.py b/release/ray_release/result.py index a1dab9b42d16..cc97f9aae7c2 100644 --- a/release/ray_release/result.py +++ b/release/ray_release/result.py @@ -25,13 +25,13 @@ class Result: status: str = ResultStatus.UNKNOWN.value return_code: int = 0 + buildkite_return_code: int = BuildkiteExitCode.SUCCESS.value last_logs: Optional[str] = None runtime: Optional[float] = None stable: bool = True smoke_test: bool = False - buildkite_return_code: BuildkiteExitCode.SUCCESS buildkite_url: Optional[str] = None wheels_url: Optional[str] = None cluster_url: Optional[str] = None From 86bffd9fd53bef3e456d3899fb97f098f322d8eb Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Wed, 5 Apr 2023 10:36:42 -0700 Subject: [PATCH 045/104] Name consistency Signed-off-by: Cuong Nguyen --- release/ray_release/result.py | 2 +- release/ray_release/scripts/run_release_test.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/release/ray_release/result.py b/release/ray_release/result.py index cc97f9aae7c2..3e986d4a3a6d 100644 --- a/release/ray_release/result.py +++ b/release/ray_release/result.py @@ -25,7 +25,7 @@ class Result: status: str = ResultStatus.UNKNOWN.value return_code: int = 0 - buildkite_return_code: int = BuildkiteExitCode.SUCCESS.value + buildkite_exit_code: int = BuildkiteExitCode.SUCCESS.value last_logs: Optional[str] = None runtime: Optional[float] = None diff --git a/release/ray_release/scripts/run_release_test.py b/release/ray_release/scripts/run_release_test.py index 0be02dde3923..d8ec4791f16f 100644 --- a/release/ray_release/scripts/run_release_test.py +++ b/release/ray_release/scripts/run_release_test.py @@ -163,7 +163,7 @@ def main( f"Release test pipeline for test {test['name']} completed. 
" f"Returning with exit code = {result.return_code}" ) - sys.exit(result.buildkite_return_code) + sys.exit(result.buildkite_exit_code) if __name__ == "__main__": From 0b2ab0fb715ab7d4024e6ae1436f9cee7c9de792 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Wed, 5 Apr 2023 14:03:45 -0700 Subject: [PATCH 046/104] Fix lints Signed-off-by: Cuong Nguyen --- release/ray_release/result.py | 2 ++ release/ray_release/scripts/run_release_test.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/release/ray_release/result.py b/release/ray_release/result.py index 3e986d4a3a6d..722bc38c6773 100644 --- a/release/ray_release/result.py +++ b/release/ray_release/result.py @@ -4,6 +4,7 @@ from typing import Optional, Dict, Tuple + class ResultStatus(enum.Enum): """ Overall status of the result test run @@ -19,6 +20,7 @@ class ResultStatus(enum.Enum): TIMEOUT = "timeout" + @dataclass class Result: results: Optional[Dict] = None diff --git a/release/ray_release/scripts/run_release_test.py b/release/ray_release/scripts/run_release_test.py index d8ec4791f16f..b2581ccf8cdc 100644 --- a/release/ray_release/scripts/run_release_test.py +++ b/release/ray_release/scripts/run_release_test.py @@ -14,7 +14,7 @@ read_and_validate_release_test_collection, ) from ray_release.env import DEFAULT_ENVIRONMENT, load_environment, populate_os_env -from ray_release.exception import ReleaseTestCLIError, ReleaseTestError +from ray_release.exception import ReleaseTestCLIError from ray_release.glue import run_release_test from ray_release.logger import logger from ray_release.reporter.artifacts import ArtifactsReporter From 3320f85b30740d1b9db4d8792fb2f09fecf6cd62 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Wed, 5 Apr 2023 16:27:22 -0700 Subject: [PATCH 047/104] Fix unit tests Signed-off-by: Cuong Nguyen --- release/ray_release/tests/test_glue.py | 111 +++++++++---------------- 1 file changed, 37 insertions(+), 74 deletions(-) diff --git a/release/ray_release/tests/test_glue.py b/release/ray_release/tests/test_glue.py index 1cf9cdcf1dd2..232d0456ee9c 100644 --- a/release/ray_release/tests/test_glue.py +++ b/release/ray_release/tests/test_glue.py @@ -24,19 +24,13 @@ ClusterEnvBuildError, ClusterEnvBuildTimeout, ClusterEnvCreateError, - ClusterCreationError, ClusterStartupError, ClusterStartupTimeout, RemoteEnvSetupError, CommandError, - PrepareCommandError, CommandTimeout, - PrepareCommandTimeout, - TestCommandError, - TestCommandTimeout, FetchResultError, LogsError, - ResultsAlert, ClusterNodesWaitTimeout, ) from ray_release.file_manager.file_manager import FileManager @@ -251,7 +245,7 @@ def _succeed_until(self, until: str): self.mock_alert_return = None - def _run(self, result: Result, **kwargs): + def _run(self, result: Result, **kwargs) -> Result: run_release_test( test=self.test, anyscale_project=self.anyscale_project, @@ -267,26 +261,23 @@ def testInvalidClusterEnv(self): with patch( "ray_release.glue.load_test_cluster_env", _fail_on_call(ReleaseTestConfigError), - ), self.assertRaises(ReleaseTestConfigError): + ): self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) # Fails because file not found os.unlink(os.path.join(self.tempdir, "cluster_env.yaml")) - with self.assertRaisesRegex(ReleaseTestConfigError, "Path not found"): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) # Fails because invalid jinja template self.writeClusterEnv("{{ INVALID") - with self.assertRaisesRegex(ReleaseTestConfigError, "yaml template"): 
- self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) # Fails because invalid json self.writeClusterEnv("{'test': true, 'fail}") - with self.assertRaisesRegex(ReleaseTestConfigError, "quoted scalar"): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) def testInvalidClusterCompute(self): @@ -295,26 +286,23 @@ def testInvalidClusterCompute(self): with patch( "ray_release.glue.load_test_cluster_compute", _fail_on_call(ReleaseTestConfigError), - ), self.assertRaises(ReleaseTestConfigError): + ): self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) # Fails because file not found os.unlink(os.path.join(self.tempdir, "cluster_compute.yaml")) - with self.assertRaisesRegex(ReleaseTestConfigError, "Path not found"): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) # Fails because invalid jinja template self.writeClusterCompute("{{ INVALID") - with self.assertRaisesRegex(ReleaseTestConfigError, "yaml template"): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) # Fails because invalid json self.writeClusterCompute("{'test': true, 'fail}") - with self.assertRaisesRegex(ReleaseTestConfigError, "quoted scalar"): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) @@ -323,9 +311,8 @@ def testAutomaticClusterEnvVariables(self): self._succeed_until("local_env") - with self.assertRaises(LocalEnvSetupError): - self._run(result) - + self._run(result) + self.assertEqual(result.return_code, LocalEnvSetupError().exit_code.value) cluster_manager = self.instances["cluster_manager"] command_timeout = self.test["run"].get("timeout", DEFAULT_COMMAND_TIMEOUT) @@ -362,8 +349,7 @@ def testInvalidPrepareLocalEnv(self): self.command_runner_return["prepare_local_env"] = _fail_on_call( LocalEnvSetupError ) - with self.assertRaises(LocalEnvSetupError): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.LOCAL_ENV_SETUP_ERROR.value) def testDriverSetupFails(self): @@ -371,8 +357,7 @@ def testDriverSetupFails(self): self._succeed_until("local_env") - with self.assertRaises(LocalEnvSetupError): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.LOCAL_ENV_SETUP_ERROR.value) def testInvalidClusterIdOverride(self): @@ -382,16 +367,14 @@ def testInvalidClusterIdOverride(self): self.sdk.returns["get_cluster_environment"] = None - with self.assertRaises(ClusterEnvCreateError): - self._run(result, cluster_env_id="existing") + self._run(result, cluster_env_id="existing") self.sdk.returns["get_cluster_environment"] = APIDict( result=APIDict(config_json={"overridden": True}) ) - with self.assertRaises(Exception) as cm: # Fail somewhere else - self._run(result, cluster_env_id="existing") - self.assertNotIsInstance(cm.exception, ClusterEnvCreateError) + self._run(result, cluster_env_id="existing") + self.assertNotEqual(result.return_code, ClusterEnvCreateError().exit_code) def testBuildConfigFailsClusterCompute(self): result = Result() @@ -402,16 +385,14 @@ def testBuildConfigFailsClusterCompute(self): self.command_runner_return["prepare_local_env"] = None # Fails because API response faulty - with self.assertRaisesRegex(ClusterComputeCreateError, "Unexpected"): - self._run(result) + self._run(result) self.assertEqual(result.return_code, 
ExitCode.CLUSTER_RESOURCE_ERROR.value) # Fails for random cluster compute reason self.cluster_manager_return["create_cluster_compute"] = _fail_on_call( ClusterComputeCreateError, "Known" ) - with self.assertRaisesRegex(ClusterComputeCreateError, "Known"): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_RESOURCE_ERROR.value) def testBuildConfigFailsClusterEnv(self): @@ -419,17 +400,14 @@ def testBuildConfigFailsClusterEnv(self): self._succeed_until("cluster_compute") - # Fails because API response faulty - with self.assertRaisesRegex(ClusterEnvCreateError, "Unexpected"): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_RESOURCE_ERROR.value) # Fails for random cluster env create reason self.cluster_manager_return["create_cluster_env"] = _fail_on_call( ClusterEnvCreateError, "Known" ) - with self.assertRaisesRegex(ClusterEnvCreateError, "Known"): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_RESOURCE_ERROR.value) # Now, succeed creation but fail on cluster env build @@ -438,16 +416,14 @@ def testBuildConfigFailsClusterEnv(self): self.cluster_manager_return["build_cluster_env"] = _fail_on_call( ClusterEnvBuildError ) - with self.assertRaises(ClusterEnvBuildError): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_ENV_BUILD_ERROR.value) # Now, fail on cluster env timeout self.cluster_manager_return["build_cluster_env"] = _fail_on_call( ClusterEnvBuildTimeout ) - with self.assertRaises(ClusterEnvBuildTimeout): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_ENV_BUILD_TIMEOUT.value) def testStartClusterFails(self): @@ -456,8 +432,7 @@ def testStartClusterFails(self): self._succeed_until("cluster_env") # Fails because API response faulty - with self.assertRaises(ClusterCreationError): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_RESOURCE_ERROR.value) self.cluster_manager_return["cluster_id"] = "valid" @@ -466,8 +441,7 @@ def testStartClusterFails(self): self.cluster_manager_return["start_cluster"] = _fail_on_call( ClusterStartupError ) - with self.assertRaises(ClusterStartupError): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_STARTUP_ERROR.value) # Ensure cluster was terminated @@ -477,8 +451,7 @@ def testStartClusterFails(self): self.cluster_manager_return["start_cluster"] = _fail_on_call( ClusterStartupTimeout ) - with self.assertRaises(ClusterStartupTimeout): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_STARTUP_TIMEOUT.value) # Ensure cluster was terminated @@ -492,8 +465,7 @@ def testPrepareRemoteEnvFails(self): self.command_runner_return["prepare_remote_env"] = _fail_on_call( RemoteEnvSetupError ) - with self.assertRaises(RemoteEnvSetupError): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.REMOTE_ENV_SETUP_ERROR.value) # Ensure cluster was terminated @@ -508,8 +480,7 @@ def testWaitForNodesFails(self): self.command_runner_return["wait_for_nodes"] = _fail_on_call( ClusterNodesWaitTimeout ) - with self.assertRaises(ClusterNodesWaitTimeout): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_WAIT_TIMEOUT.value) # Ensure cluster was terminated @@ -522,16 +493,14 @@ def testPrepareCommandFails(self): # Prepare command fails 
self.command_runner_return["run_prepare_command"] = _fail_on_call(CommandError) - with self.assertRaises(PrepareCommandError): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.PREPARE_ERROR.value) # Prepare command times out self.command_runner_return["run_prepare_command"] = _fail_on_call( CommandTimeout ) - with self.assertRaises(PrepareCommandTimeout): - self._run(result) + self._run(result) # Special case: Prepare commands are usually waiting for nodes # (this may change in the future!) self.assertEqual(result.return_code, ExitCode.CLUSTER_WAIT_TIMEOUT.value) @@ -546,14 +515,12 @@ def testTestCommandFails(self): # Test command fails self.command_runner_return["run_command"] = _fail_on_call(CommandError) - with self.assertRaises(TestCommandError): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.COMMAND_ERROR.value) # Test command times out self.command_runner_return["run_command"] = _fail_on_call(CommandTimeout) - with self.assertRaises(TestCommandTimeout): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.COMMAND_TIMEOUT.value) # Ensure cluster was terminated @@ -566,8 +533,7 @@ def testTestCommandTimeoutLongRunning(self): # Test command times out self.command_runner_return["run_command"] = _fail_on_call(CommandTimeout) - with self.assertRaises(TestCommandTimeout): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.COMMAND_TIMEOUT.value) # But now set test to long running @@ -616,12 +582,11 @@ def testFetchResultFailsReqNonEmptyResult(self): self._succeed_until("test_command") self.command_runner_return["fetch_results"] = _fail_on_call(FetchResultError) - with self.assertRaisesRegex(FetchResultError, "Fail"): - with self.assertLogs(logger, "ERROR") as cm: - self._run(result) - self.assertTrue(any("Could not fetch results" in o for o in cm.output)) + with self.assertLogs(logger, "ERROR") as cm: + self._run(result) + self.assertTrue(any("Could not fetch results" in o for o in cm.output)) self.assertEqual(result.return_code, ExitCode.FETCH_RESULT_ERROR.value) - self.assertEqual(result.status, "infra_error") + self.assertEqual(result.status, "transient_infra_error") # Ensure cluster was terminated, no matter what self.assertGreaterEqual(self.sdk.call_counter["terminate_cluster"], 1) @@ -649,9 +614,7 @@ def testAlertFails(self): self.mock_alert_return = "Alert raised" - with self.assertRaises(ResultsAlert): - self._run(result) - + self._run(result) self.assertEqual(result.return_code, ExitCode.COMMAND_ALERT.value) self.assertEqual(result.status, "error") From 972b88591044ecd899d05cf21237c79985897445 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Thu, 13 Apr 2023 11:11:10 -0700 Subject: [PATCH 048/104] Move retry logic to sh file Signed-off-by: Cuong Nguyen --- release/ray_release/buildkite/step.py | 10 +++++----- release/ray_release/result.py | 1 - release/ray_release/scripts/run_release_test.py | 2 +- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/release/ray_release/buildkite/step.py b/release/ray_release/buildkite/step.py index 9fcae90beebd..5a564797ae3e 100644 --- a/release/ray_release/buildkite/step.py +++ b/release/ray_release/buildkite/step.py @@ -15,7 +15,7 @@ from ray_release.env import DEFAULT_ENVIRONMENT, load_environment from ray_release.template import get_test_env_var from ray_release.util import python_version_str, DeferredEnvVar -from ray_release.result import BuildkiteExitCode +from ray_release.result import 
ExitCode DEFAULT_ARTIFACTS_DIR_HOST = "/tmp/ray_release_test_artifacts" @@ -125,10 +125,10 @@ def get_step( # Auto-retry on transient infra error (according to result.BuildkiteExitCode) step["retry"] = { "automatic": [ - { - "exit_status": BuildkiteExitCode.TRANSIENT_INFRA_ERROR, - "limit": 2, - } + { + "exit_status": BuildkiteExitCode.TRANSIENT_INFRA_ERROR.value, + "limit": 2, + } ] } diff --git a/release/ray_release/result.py b/release/ray_release/result.py index 722bc38c6773..34b74aa82ea7 100644 --- a/release/ray_release/result.py +++ b/release/ray_release/result.py @@ -27,7 +27,6 @@ class Result: status: str = ResultStatus.UNKNOWN.value return_code: int = 0 - buildkite_exit_code: int = BuildkiteExitCode.SUCCESS.value last_logs: Optional[str] = None runtime: Optional[float] = None diff --git a/release/ray_release/scripts/run_release_test.py b/release/ray_release/scripts/run_release_test.py index b2581ccf8cdc..89aad23ab2cd 100644 --- a/release/ray_release/scripts/run_release_test.py +++ b/release/ray_release/scripts/run_release_test.py @@ -163,7 +163,7 @@ def main( f"Release test pipeline for test {test['name']} completed. " f"Returning with exit code = {result.return_code}" ) - sys.exit(result.buildkite_exit_code) + sys.exit(result.return_code) if __name__ == "__main__": From c316ee4a637c2d84b3501cefb9d64061ecea41ab Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Thu, 13 Apr 2023 12:47:18 -0700 Subject: [PATCH 049/104] Rebase Signed-off-by: Cuong Nguyen --- release/ray_release/buildkite/step.py | 1 - 1 file changed, 1 deletion(-) diff --git a/release/ray_release/buildkite/step.py b/release/ray_release/buildkite/step.py index 5a564797ae3e..dffbc88090d5 100644 --- a/release/ray_release/buildkite/step.py +++ b/release/ray_release/buildkite/step.py @@ -15,7 +15,6 @@ from ray_release.env import DEFAULT_ENVIRONMENT, load_environment from ray_release.template import get_test_env_var from ray_release.util import python_version_str, DeferredEnvVar -from ray_release.result import ExitCode DEFAULT_ARTIFACTS_DIR_HOST = "/tmp/ray_release_test_artifacts" From 55676c59d3d7146a47367c82f54da6e41598d056 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Thu, 13 Apr 2023 12:57:40 -0700 Subject: [PATCH 050/104] More refactoring Signed-off-by: Cuong Nguyen --- .../ray_release/scripts/run_release_test.py | 33 +++++++++++-------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/release/ray_release/scripts/run_release_test.py b/release/ray_release/scripts/run_release_test.py index 89aad23ab2cd..449dee26557d 100644 --- a/release/ray_release/scripts/run_release_test.py +++ b/release/ray_release/scripts/run_release_test.py @@ -14,7 +14,7 @@ read_and_validate_release_test_collection, ) from ray_release.env import DEFAULT_ENVIRONMENT, load_environment, populate_os_env -from ray_release.exception import ReleaseTestCLIError +from ray_release.exception import ReleaseTestCLIError, ReleaseTestError from ray_release.glue import run_release_test from ray_release.logger import logger from ray_release.reporter.artifacts import ArtifactsReporter @@ -148,22 +148,27 @@ def main( if report: reporters.append(DBReporter()) - result = run_release_test( - test, - anyscale_project=anyscale_project, - result=result, - ray_wheels_url=ray_wheels_url, - reporters=reporters, - smoke_test=smoke_test, - cluster_id=cluster_id, - cluster_env_id=cluster_env_id, - no_terminate=no_terminate, - ) + try: + result = run_release_test( + test, + anyscale_project=anyscale_project, + result=result, + ray_wheels_url=ray_wheels_url, + 
reporters=reporters, + smoke_test=smoke_test, + cluster_id=cluster_id, + cluster_env_id=cluster_env_id, + no_terminate=no_terminate, + ) + return_code = result.return_code + except ReleaseTestError as e: + logger.exception(e) + return_code = e.exit_code.value logger.info( f"Release test pipeline for test {test['name']} completed. " - f"Returning with exit code = {result.return_code}" + f"Returning with exit code = {return_code}" ) - sys.exit(result.return_code) + sys.exit(return_code) if __name__ == "__main__": From f322ae38a35bd2f15b5a0dd59f1e71537417a185 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Thu, 13 Apr 2023 13:04:03 -0700 Subject: [PATCH 051/104] Undo more changes Signed-off-by: Cuong Nguyen --- release/ray_release/glue.py | 5 +- release/ray_release/tests/test_glue.py | 111 ++++++++++++++++--------- 2 files changed, 78 insertions(+), 38 deletions(-) diff --git a/release/ray_release/glue.py b/release/ray_release/glue.py index 66a95831b529..5873cd100286 100644 --- a/release/ray_release/glue.py +++ b/release/ray_release/glue.py @@ -460,7 +460,7 @@ def run_release_test( # non critical for some tests. So separate it from the general one. fetch_result_exception = None try: - raise ReleaseTestConfigError() + raise ReleaseTestSetupError('hahahah') buildkite_group(":spiral_note_pad: Loading test configuration") cluster_manager, command_runner, artifact_path = _load_test_configuration( test, @@ -590,4 +590,7 @@ def run_release_test( for reporter in reporters or []: reporter.report_result(test, result) + if pipeline_exception: + raise pipeline_exception + return result diff --git a/release/ray_release/tests/test_glue.py b/release/ray_release/tests/test_glue.py index 232d0456ee9c..1cf9cdcf1dd2 100644 --- a/release/ray_release/tests/test_glue.py +++ b/release/ray_release/tests/test_glue.py @@ -24,13 +24,19 @@ ClusterEnvBuildError, ClusterEnvBuildTimeout, ClusterEnvCreateError, + ClusterCreationError, ClusterStartupError, ClusterStartupTimeout, RemoteEnvSetupError, CommandError, + PrepareCommandError, CommandTimeout, + PrepareCommandTimeout, + TestCommandError, + TestCommandTimeout, FetchResultError, LogsError, + ResultsAlert, ClusterNodesWaitTimeout, ) from ray_release.file_manager.file_manager import FileManager @@ -245,7 +251,7 @@ def _succeed_until(self, until: str): self.mock_alert_return = None - def _run(self, result: Result, **kwargs) -> Result: + def _run(self, result: Result, **kwargs): run_release_test( test=self.test, anyscale_project=self.anyscale_project, @@ -261,23 +267,26 @@ def testInvalidClusterEnv(self): with patch( "ray_release.glue.load_test_cluster_env", _fail_on_call(ReleaseTestConfigError), - ): + ), self.assertRaises(ReleaseTestConfigError): self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) # Fails because file not found os.unlink(os.path.join(self.tempdir, "cluster_env.yaml")) - self._run(result) + with self.assertRaisesRegex(ReleaseTestConfigError, "Path not found"): + self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) # Fails because invalid jinja template self.writeClusterEnv("{{ INVALID") - self._run(result) + with self.assertRaisesRegex(ReleaseTestConfigError, "yaml template"): + self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) # Fails because invalid json self.writeClusterEnv("{'test': true, 'fail}") - self._run(result) + with self.assertRaisesRegex(ReleaseTestConfigError, "quoted scalar"): + self._run(result) self.assertEqual(result.return_code, 
ExitCode.CONFIG_ERROR.value) def testInvalidClusterCompute(self): @@ -286,23 +295,26 @@ def testInvalidClusterCompute(self): with patch( "ray_release.glue.load_test_cluster_compute", _fail_on_call(ReleaseTestConfigError), - ): + ), self.assertRaises(ReleaseTestConfigError): self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) # Fails because file not found os.unlink(os.path.join(self.tempdir, "cluster_compute.yaml")) - self._run(result) + with self.assertRaisesRegex(ReleaseTestConfigError, "Path not found"): + self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) # Fails because invalid jinja template self.writeClusterCompute("{{ INVALID") - self._run(result) + with self.assertRaisesRegex(ReleaseTestConfigError, "yaml template"): + self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) # Fails because invalid json self.writeClusterCompute("{'test': true, 'fail}") - self._run(result) + with self.assertRaisesRegex(ReleaseTestConfigError, "quoted scalar"): + self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) @@ -311,8 +323,9 @@ def testAutomaticClusterEnvVariables(self): self._succeed_until("local_env") - self._run(result) - self.assertEqual(result.return_code, LocalEnvSetupError().exit_code.value) + with self.assertRaises(LocalEnvSetupError): + self._run(result) + cluster_manager = self.instances["cluster_manager"] command_timeout = self.test["run"].get("timeout", DEFAULT_COMMAND_TIMEOUT) @@ -349,7 +362,8 @@ def testInvalidPrepareLocalEnv(self): self.command_runner_return["prepare_local_env"] = _fail_on_call( LocalEnvSetupError ) - self._run(result) + with self.assertRaises(LocalEnvSetupError): + self._run(result) self.assertEqual(result.return_code, ExitCode.LOCAL_ENV_SETUP_ERROR.value) def testDriverSetupFails(self): @@ -357,7 +371,8 @@ def testDriverSetupFails(self): self._succeed_until("local_env") - self._run(result) + with self.assertRaises(LocalEnvSetupError): + self._run(result) self.assertEqual(result.return_code, ExitCode.LOCAL_ENV_SETUP_ERROR.value) def testInvalidClusterIdOverride(self): @@ -367,14 +382,16 @@ def testInvalidClusterIdOverride(self): self.sdk.returns["get_cluster_environment"] = None - self._run(result, cluster_env_id="existing") + with self.assertRaises(ClusterEnvCreateError): + self._run(result, cluster_env_id="existing") self.sdk.returns["get_cluster_environment"] = APIDict( result=APIDict(config_json={"overridden": True}) ) - self._run(result, cluster_env_id="existing") - self.assertNotEqual(result.return_code, ClusterEnvCreateError().exit_code) + with self.assertRaises(Exception) as cm: # Fail somewhere else + self._run(result, cluster_env_id="existing") + self.assertNotIsInstance(cm.exception, ClusterEnvCreateError) def testBuildConfigFailsClusterCompute(self): result = Result() @@ -385,14 +402,16 @@ def testBuildConfigFailsClusterCompute(self): self.command_runner_return["prepare_local_env"] = None # Fails because API response faulty - self._run(result) + with self.assertRaisesRegex(ClusterComputeCreateError, "Unexpected"): + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_RESOURCE_ERROR.value) # Fails for random cluster compute reason self.cluster_manager_return["create_cluster_compute"] = _fail_on_call( ClusterComputeCreateError, "Known" ) - self._run(result) + with self.assertRaisesRegex(ClusterComputeCreateError, "Known"): + self._run(result) self.assertEqual(result.return_code, 
ExitCode.CLUSTER_RESOURCE_ERROR.value) def testBuildConfigFailsClusterEnv(self): @@ -400,14 +419,17 @@ def testBuildConfigFailsClusterEnv(self): self._succeed_until("cluster_compute") - self._run(result) + # Fails because API response faulty + with self.assertRaisesRegex(ClusterEnvCreateError, "Unexpected"): + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_RESOURCE_ERROR.value) # Fails for random cluster env create reason self.cluster_manager_return["create_cluster_env"] = _fail_on_call( ClusterEnvCreateError, "Known" ) - self._run(result) + with self.assertRaisesRegex(ClusterEnvCreateError, "Known"): + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_RESOURCE_ERROR.value) # Now, succeed creation but fail on cluster env build @@ -416,14 +438,16 @@ def testBuildConfigFailsClusterEnv(self): self.cluster_manager_return["build_cluster_env"] = _fail_on_call( ClusterEnvBuildError ) - self._run(result) + with self.assertRaises(ClusterEnvBuildError): + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_ENV_BUILD_ERROR.value) # Now, fail on cluster env timeout self.cluster_manager_return["build_cluster_env"] = _fail_on_call( ClusterEnvBuildTimeout ) - self._run(result) + with self.assertRaises(ClusterEnvBuildTimeout): + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_ENV_BUILD_TIMEOUT.value) def testStartClusterFails(self): @@ -432,7 +456,8 @@ def testStartClusterFails(self): self._succeed_until("cluster_env") # Fails because API response faulty - self._run(result) + with self.assertRaises(ClusterCreationError): + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_RESOURCE_ERROR.value) self.cluster_manager_return["cluster_id"] = "valid" @@ -441,7 +466,8 @@ def testStartClusterFails(self): self.cluster_manager_return["start_cluster"] = _fail_on_call( ClusterStartupError ) - self._run(result) + with self.assertRaises(ClusterStartupError): + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_STARTUP_ERROR.value) # Ensure cluster was terminated @@ -451,7 +477,8 @@ def testStartClusterFails(self): self.cluster_manager_return["start_cluster"] = _fail_on_call( ClusterStartupTimeout ) - self._run(result) + with self.assertRaises(ClusterStartupTimeout): + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_STARTUP_TIMEOUT.value) # Ensure cluster was terminated @@ -465,7 +492,8 @@ def testPrepareRemoteEnvFails(self): self.command_runner_return["prepare_remote_env"] = _fail_on_call( RemoteEnvSetupError ) - self._run(result) + with self.assertRaises(RemoteEnvSetupError): + self._run(result) self.assertEqual(result.return_code, ExitCode.REMOTE_ENV_SETUP_ERROR.value) # Ensure cluster was terminated @@ -480,7 +508,8 @@ def testWaitForNodesFails(self): self.command_runner_return["wait_for_nodes"] = _fail_on_call( ClusterNodesWaitTimeout ) - self._run(result) + with self.assertRaises(ClusterNodesWaitTimeout): + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_WAIT_TIMEOUT.value) # Ensure cluster was terminated @@ -493,14 +522,16 @@ def testPrepareCommandFails(self): # Prepare command fails self.command_runner_return["run_prepare_command"] = _fail_on_call(CommandError) - self._run(result) + with self.assertRaises(PrepareCommandError): + self._run(result) self.assertEqual(result.return_code, ExitCode.PREPARE_ERROR.value) # Prepare command times out self.command_runner_return["run_prepare_command"] = _fail_on_call( CommandTimeout ) - 
self._run(result) + with self.assertRaises(PrepareCommandTimeout): + self._run(result) # Special case: Prepare commands are usually waiting for nodes # (this may change in the future!) self.assertEqual(result.return_code, ExitCode.CLUSTER_WAIT_TIMEOUT.value) @@ -515,12 +546,14 @@ def testTestCommandFails(self): # Test command fails self.command_runner_return["run_command"] = _fail_on_call(CommandError) - self._run(result) + with self.assertRaises(TestCommandError): + self._run(result) self.assertEqual(result.return_code, ExitCode.COMMAND_ERROR.value) # Test command times out self.command_runner_return["run_command"] = _fail_on_call(CommandTimeout) - self._run(result) + with self.assertRaises(TestCommandTimeout): + self._run(result) self.assertEqual(result.return_code, ExitCode.COMMAND_TIMEOUT.value) # Ensure cluster was terminated @@ -533,7 +566,8 @@ def testTestCommandTimeoutLongRunning(self): # Test command times out self.command_runner_return["run_command"] = _fail_on_call(CommandTimeout) - self._run(result) + with self.assertRaises(TestCommandTimeout): + self._run(result) self.assertEqual(result.return_code, ExitCode.COMMAND_TIMEOUT.value) # But now set test to long running @@ -582,11 +616,12 @@ def testFetchResultFailsReqNonEmptyResult(self): self._succeed_until("test_command") self.command_runner_return["fetch_results"] = _fail_on_call(FetchResultError) - with self.assertLogs(logger, "ERROR") as cm: - self._run(result) - self.assertTrue(any("Could not fetch results" in o for o in cm.output)) + with self.assertRaisesRegex(FetchResultError, "Fail"): + with self.assertLogs(logger, "ERROR") as cm: + self._run(result) + self.assertTrue(any("Could not fetch results" in o for o in cm.output)) self.assertEqual(result.return_code, ExitCode.FETCH_RESULT_ERROR.value) - self.assertEqual(result.status, "transient_infra_error") + self.assertEqual(result.status, "infra_error") # Ensure cluster was terminated, no matter what self.assertGreaterEqual(self.sdk.call_counter["terminate_cluster"], 1) @@ -614,7 +649,9 @@ def testAlertFails(self): self.mock_alert_return = "Alert raised" - self._run(result) + with self.assertRaises(ResultsAlert): + self._run(result) + self.assertEqual(result.return_code, ExitCode.COMMAND_ALERT.value) self.assertEqual(result.status, "error") From afd9967fc9cb3e7575931a3d1149cb5543c68325 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Fri, 14 Apr 2023 11:28:55 -0700 Subject: [PATCH 052/104] Fix tests Signed-off-by: Cuong Nguyen --- release/ray_release/tests/test_glue.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/release/ray_release/tests/test_glue.py b/release/ray_release/tests/test_glue.py index 1cf9cdcf1dd2..fd35efa77916 100644 --- a/release/ray_release/tests/test_glue.py +++ b/release/ray_release/tests/test_glue.py @@ -621,7 +621,7 @@ def testFetchResultFailsReqNonEmptyResult(self): self._run(result) self.assertTrue(any("Could not fetch results" in o for o in cm.output)) self.assertEqual(result.return_code, ExitCode.FETCH_RESULT_ERROR.value) - self.assertEqual(result.status, "infra_error") + self.assertEqual(result.status, "transient_infra_error") # Ensure cluster was terminated, no matter what self.assertGreaterEqual(self.sdk.call_counter["terminate_cluster"], 1) From 3c9a156dd2ccaa9197a02a7c7b43e8da42b3c8ad Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Tue, 4 Apr 2023 10:59:29 -0700 Subject: [PATCH 053/104] Exit buildkite job using buildkite return code Signed-off-by: Cuong Nguyen --- release/ray_release/result.py | 2 ++ 1 file changed, 2 
insertions(+) diff --git a/release/ray_release/result.py b/release/ray_release/result.py index 34b74aa82ea7..de8b4a611247 100644 --- a/release/ray_release/result.py +++ b/release/ray_release/result.py @@ -1,5 +1,6 @@ import enum import os +import os from dataclasses import dataclass from typing import Optional, Dict, Tuple @@ -33,6 +34,7 @@ class Result: stable: bool = True smoke_test: bool = False + buildkite_return_code: BuildkiteExitCode.SUCCESS buildkite_url: Optional[str] = None wheels_url: Optional[str] = None cluster_url: Optional[str] = None From 32d7f5b033eb23d362c94754573d39c6604d0a32 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Mon, 10 Apr 2023 10:03:08 -0700 Subject: [PATCH 054/104] Rebase Signed-off-by: Cuong Nguyen --- release/ray_release/result.py | 1 - 1 file changed, 1 deletion(-) diff --git a/release/ray_release/result.py b/release/ray_release/result.py index de8b4a611247..8f01eff6ef42 100644 --- a/release/ray_release/result.py +++ b/release/ray_release/result.py @@ -1,6 +1,5 @@ import enum import os -import os from dataclasses import dataclass from typing import Optional, Dict, Tuple From ed6542e968d51a571b48c71669e177f87e745e81 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Wed, 5 Apr 2023 13:58:25 -0700 Subject: [PATCH 055/104] Only jobs that fail fast can have transient errors Signed-off-by: Cuong Nguyen --- release/ray_release/result.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/release/ray_release/result.py b/release/ray_release/result.py index 8f01eff6ef42..b1bcfbcab967 100644 --- a/release/ray_release/result.py +++ b/release/ray_release/result.py @@ -82,18 +82,6 @@ class ExitCode(enum.Enum): COMMAND_TIMEOUT = 42 PREPARE_ERROR = 43 -class BuildkiteExitCode(enum.Enum): - """ - Final exit code the test runner passes to buildkite-agent.
This exit code is used - to determine job policies, such as automatic retries - """ - SUCCESS = 0 - UNKNOWN = 1 - TRANSIENT_INFRA_ERROR = 10 - INFRA_ERROR = 11 - INFRA_TIMEOUT = 30 - ERROR = 40 - TIMEOUT = 42 def _is_transient_error(result_status: ResultStatus, runtime: int) -> bool: """ From 7384a7959e1c0989db2d558edbed1a90e62ab334 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Fri, 7 Apr 2023 21:40:58 -0700 Subject: [PATCH 056/104] Set ray log to stderr Signed-off-by: Cuong Nguyen --- release/ray_release/cluster_manager/cluster_manager.py | 1 + 1 file changed, 1 insertion(+) diff --git a/release/ray_release/cluster_manager/cluster_manager.py b/release/ray_release/cluster_manager/cluster_manager.py index 2294dbfd4c49..5dd350f07d1e 100644 --- a/release/ray_release/cluster_manager/cluster_manager.py +++ b/release/ray_release/cluster_manager/cluster_manager.py @@ -56,6 +56,7 @@ def set_cluster_env(self, cluster_env: Dict[str, Any]): self.cluster_env.setdefault("env_vars", {}) self.cluster_env["env_vars"]["MATCH_AUTOSCALER_AND_RAY_IMAGES"] = "1" self.cluster_env["env_vars"]["RAY_USAGE_STATS_ENABLED"] = "1" + self.cluster_env["env_vars"]["RAY_LOG_TO_STDERR"] = "1" self.cluster_env["env_vars"]["RAY_USAGE_STATS_SOURCE"] = "nightly-tests" self.cluster_env["env_vars"][ "RAY_USAGE_STATS_EXTRA_TAGS" From 0df92636fbc6c2d0597a59d72f6d3ce2c236b163 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Sat, 8 Apr 2023 13:15:43 -0700 Subject: [PATCH 057/104] Get ray logs Signed-off-by: Cuong Nguyen --- release/ray_release/cluster_manager/cluster_manager.py | 1 - release/ray_release/job_manager/anyscale_job_manager.py | 3 +++ release/ray_release/job_manager/job_manager.py | 3 +++ 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/release/ray_release/cluster_manager/cluster_manager.py b/release/ray_release/cluster_manager/cluster_manager.py index 5dd350f07d1e..2294dbfd4c49 100644 --- a/release/ray_release/cluster_manager/cluster_manager.py +++ b/release/ray_release/cluster_manager/cluster_manager.py @@ -56,7 +56,6 @@ def set_cluster_env(self, cluster_env: Dict[str, Any]): self.cluster_env.setdefault("env_vars", {}) self.cluster_env["env_vars"]["MATCH_AUTOSCALER_AND_RAY_IMAGES"] = "1" self.cluster_env["env_vars"]["RAY_USAGE_STATS_ENABLED"] = "1" - self.cluster_env["env_vars"]["RAY_LOG_TO_STDERR"] = "1" self.cluster_env["env_vars"]["RAY_USAGE_STATS_SOURCE"] = "nightly-tests" self.cluster_env["env_vars"][ "RAY_USAGE_STATS_EXTRA_TAGS" diff --git a/release/ray_release/job_manager/anyscale_job_manager.py b/release/ray_release/job_manager/anyscale_job_manager.py index 1b605b412e84..711a999d2e2a 100644 --- a/release/ray_release/job_manager/anyscale_job_manager.py +++ b/release/ray_release/job_manager/anyscale_job_manager.py @@ -45,6 +45,7 @@ def __init__(self, cluster_manager: ClusterManager): self.cluster_manager = cluster_manager self._last_job_result = None self._last_logs = None + self._last_ray_logs = None self.cluster_startup_timeout = 600 def _run_job( @@ -320,6 +321,8 @@ def get_last_logs(self): if self._last_logs: return self._last_logs + return self.get_last_ray_logs() + # TODO: replace with an actual API call. 
def _get_logs(): buf = io.StringIO() diff --git a/release/ray_release/job_manager/job_manager.py b/release/ray_release/job_manager/job_manager.py index 72bc7f24de61..dcd39b0f2e28 100644 --- a/release/ray_release/job_manager/job_manager.py +++ b/release/ray_release/job_manager/job_manager.py @@ -119,3 +119,6 @@ def get_last_logs(self): # return None job_client = self._get_job_client() return job_client.get_job_logs(self.last_job_id) + + def get_last_ray_logs(self): + return None From 630132bbef6c520cdc9a4c108bf315dc34f9fb15 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Sat, 8 Apr 2023 16:52:41 -0700 Subject: [PATCH 058/104] Log job response Signed-off-by: Cuong Nguyen --- release/ray_release/job_manager/anyscale_job_manager.py | 1 + 1 file changed, 1 insertion(+) diff --git a/release/ray_release/job_manager/anyscale_job_manager.py b/release/ray_release/job_manager/anyscale_job_manager.py index 711a999d2e2a..b9a066abb081 100644 --- a/release/ray_release/job_manager/anyscale_job_manager.py +++ b/release/ray_release/job_manager/anyscale_job_manager.py @@ -84,6 +84,7 @@ def _run_job( ), ), ) + logger.info(f'JOB RESPONSE {job_response}') except Exception as e: raise JobStartupFailed( "Error starting job with name " From 6311d2879d8832341b1e0236f81feb29e8a476af Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Sat, 8 Apr 2023 17:25:10 -0700 Subject: [PATCH 059/104] Correctly update last job result Signed-off-by: Cuong Nguyen --- release/ray_release/job_manager/anyscale_job_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/release/ray_release/job_manager/anyscale_job_manager.py b/release/ray_release/job_manager/anyscale_job_manager.py index b9a066abb081..07a3ca9685d8 100644 --- a/release/ray_release/job_manager/anyscale_job_manager.py +++ b/release/ray_release/job_manager/anyscale_job_manager.py @@ -84,7 +84,6 @@ def _run_job( ), ), ) - logger.info(f'JOB RESPONSE {job_response}') except Exception as e: raise JobStartupFailed( "Error starting job with name " @@ -108,6 +107,7 @@ def last_job_result(self): @last_job_result.setter def last_job_result(self, value): + logger.info(f'last job result: {value}') cluster_id = value.state.cluster_id # Set this only once. From 8b70ae73a6139ede1d636c931c795b9dd94fe853 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Sat, 8 Apr 2023 17:44:57 -0700 Subject: [PATCH 060/104] Fix get log group Signed-off-by: Cuong Nguyen --- release/ray_release/job_manager/anyscale_job_manager.py | 1 - 1 file changed, 1 deletion(-) diff --git a/release/ray_release/job_manager/anyscale_job_manager.py b/release/ray_release/job_manager/anyscale_job_manager.py index 07a3ca9685d8..711a999d2e2a 100644 --- a/release/ray_release/job_manager/anyscale_job_manager.py +++ b/release/ray_release/job_manager/anyscale_job_manager.py @@ -107,7 +107,6 @@ def last_job_result(self): @last_job_result.setter def last_job_result(self, value): - logger.info(f'last job result: {value}') cluster_id = value.state.cluster_id # Set this only once.
if self.cluster_manager.cluster_id is None and cluster_id: From 1625377e730057ee3cfb8eaae0eb5c6941dbbccf Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Sat, 8 Apr 2023 20:15:38 -0700 Subject: [PATCH 061/104] Best attempt to get ray error logs on infra failures Signed-off-by: Cuong Nguyen --- release/ray_release/job_manager/anyscale_job_manager.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/release/ray_release/job_manager/anyscale_job_manager.py b/release/ray_release/job_manager/anyscale_job_manager.py index 711a999d2e2a..84e27e6eba45 100644 --- a/release/ray_release/job_manager/anyscale_job_manager.py +++ b/release/ray_release/job_manager/anyscale_job_manager.py @@ -45,7 +45,6 @@ def __init__(self, cluster_manager: ClusterManager): self.cluster_manager = cluster_manager self._last_job_result = None self._last_logs = None - self._last_ray_logs = None self.cluster_startup_timeout = 600 def _run_job( @@ -321,8 +320,6 @@ def get_last_logs(self): if self._last_logs: return self._last_logs - return self.get_last_ray_logs() - # TODO: replace with an actual API call. def _get_logs(): buf = io.StringIO() @@ -346,6 +343,8 @@ def _get_logs(): initial_retry_delay_s=30, max_retries=3, ) + if not ret: + ret = self.get_last_ray_error_logs() if ret and not self.in_progress: self._last_logs = ret return ret From 6a91523f6cf128bdae106cf12bfc78763339f1fe Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Sun, 9 Apr 2023 17:53:34 -0700 Subject: [PATCH 062/104] Rebase Signed-off-by: Cuong Nguyen --- release/ray_release/job_manager/anyscale_job_manager.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/release/ray_release/job_manager/anyscale_job_manager.py b/release/ray_release/job_manager/anyscale_job_manager.py index 84e27e6eba45..1b605b412e84 100644 --- a/release/ray_release/job_manager/anyscale_job_manager.py +++ b/release/ray_release/job_manager/anyscale_job_manager.py @@ -343,8 +343,6 @@ def _get_logs(): initial_retry_delay_s=30, max_retries=3, ) - if not ret: - ret = self.get_last_ray_error_logs() if ret and not self.in_progress: self._last_logs = ret return ret From 7237ef67f971ee715dfb343134177bbca704bd5d Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Mon, 10 Apr 2023 09:48:08 -0700 Subject: [PATCH 063/104] Undo changes to job_manager Signed-off-by: Cuong Nguyen --- release/ray_release/job_manager/job_manager.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/release/ray_release/job_manager/job_manager.py b/release/ray_release/job_manager/job_manager.py index dcd39b0f2e28..72bc7f24de61 100644 --- a/release/ray_release/job_manager/job_manager.py +++ b/release/ray_release/job_manager/job_manager.py @@ -119,6 +119,3 @@ def get_last_logs(self): # return None job_client = self._get_job_client() return job_client.get_job_logs(self.last_job_id) - - def get_last_ray_logs(self): - return None From d1011383c3ecaf8d495eded04b99d225eb1d5a67 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Mon, 10 Apr 2023 14:45:41 -0700 Subject: [PATCH 064/104] Use api to download rather than stream ray log files Signed-off-by: Cuong Nguyen --- release/ray_release/job_manager/anyscale_job_manager.py | 1 - 1 file changed, 1 deletion(-) diff --git a/release/ray_release/job_manager/anyscale_job_manager.py b/release/ray_release/job_manager/anyscale_job_manager.py index 1b605b412e84..0f50b083f7e6 100644 --- a/release/ray_release/job_manager/anyscale_job_manager.py +++ b/release/ray_release/job_manager/anyscale_job_manager.py @@ -37,7 +37,6 @@ HaJobStates.TERMINATED: -3, } - class 
AnyscaleJobManager: def __init__(self, cluster_manager: ClusterManager): self.start_time = None From 0a69db6bba56d0c7a1dd6bb9e6b243c543079d25 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Mon, 10 Apr 2023 15:58:45 -0700 Subject: [PATCH 065/104] Fix lints Signed-off-by: Cuong Nguyen --- release/ray_release/job_manager/anyscale_job_manager.py | 1 + 1 file changed, 1 insertion(+) diff --git a/release/ray_release/job_manager/anyscale_job_manager.py b/release/ray_release/job_manager/anyscale_job_manager.py index 0f50b083f7e6..1b605b412e84 100644 --- a/release/ray_release/job_manager/anyscale_job_manager.py +++ b/release/ray_release/job_manager/anyscale_job_manager.py @@ -37,6 +37,7 @@ HaJobStates.TERMINATED: -3, } + class AnyscaleJobManager: def __init__(self, cluster_manager: ClusterManager): self.start_time = None From 957fde48b2fdea431810a5fbffd16bdea1d33a22 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Mon, 17 Apr 2023 14:22:30 -0700 Subject: [PATCH 066/104] Rebase Signed-off-by: Cuong Nguyen --- release/ray_release/buildkite/step.py | 10 ---------- release/ray_release/glue.py | 1 - release/ray_release/result.py | 1 - release/ray_release/tests/test_glue.py | 2 +- 4 files changed, 1 insertion(+), 13 deletions(-) diff --git a/release/ray_release/buildkite/step.py b/release/ray_release/buildkite/step.py index dffbc88090d5..a13bde1575d8 100644 --- a/release/ray_release/buildkite/step.py +++ b/release/ray_release/buildkite/step.py @@ -121,16 +121,6 @@ def get_step( if test.get("run", {}).get("type") == "client": step["agents"]["queue"] = str(RELEASE_QUEUE_CLIENT) - # Auto-retry on transient infra error (according to result.BuildkiteExitCode) - step["retry"] = { - "automatic": [ - { - "exit_status": BuildkiteExitCode.TRANSIENT_INFRA_ERROR.value, - "limit": 2, - } - ] - } - # If a test is not stable, allow to soft fail stable = test.get("stable", True) if not stable: diff --git a/release/ray_release/glue.py b/release/ray_release/glue.py index 5873cd100286..46ef68f45ce5 100644 --- a/release/ray_release/glue.py +++ b/release/ray_release/glue.py @@ -460,7 +460,6 @@ def run_release_test( # non critical for some tests. So separate it from the general one. 
fetch_result_exception = None try: - raise ReleaseTestSetupError('hahahah') buildkite_group(":spiral_note_pad: Loading test configuration") cluster_manager, command_runner, artifact_path = _load_test_configuration( test, diff --git a/release/ray_release/result.py b/release/ray_release/result.py index b1bcfbcab967..013228438ea8 100644 --- a/release/ray_release/result.py +++ b/release/ray_release/result.py @@ -33,7 +33,6 @@ class Result: stable: bool = True smoke_test: bool = False - buildkite_return_code: BuildkiteExitCode.SUCCESS buildkite_url: Optional[str] = None wheels_url: Optional[str] = None cluster_url: Optional[str] = None diff --git a/release/ray_release/tests/test_glue.py b/release/ray_release/tests/test_glue.py index fd35efa77916..1cf9cdcf1dd2 100644 --- a/release/ray_release/tests/test_glue.py +++ b/release/ray_release/tests/test_glue.py @@ -621,7 +621,7 @@ def testFetchResultFailsReqNonEmptyResult(self): self._run(result) self.assertTrue(any("Could not fetch results" in o for o in cm.output)) self.assertEqual(result.return_code, ExitCode.FETCH_RESULT_ERROR.value) - self.assertEqual(result.status, "transient_infra_error") + self.assertEqual(result.status, "infra_error") # Ensure cluster was terminated, no matter what self.assertGreaterEqual(self.sdk.call_counter["terminate_cluster"], 1) From 04f84b644eb803a5a77cb050c628f0fcf7d6d8cb Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Mon, 17 Apr 2023 14:22:53 -0700 Subject: [PATCH 067/104] Rebase Signed-off-by: Cuong Nguyen --- release/ray_release/result.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/release/ray_release/result.py b/release/ray_release/result.py index 013228438ea8..5c7869d7d149 100644 --- a/release/ray_release/result.py +++ b/release/ray_release/result.py @@ -4,7 +4,6 @@ from typing import Optional, Dict, Tuple - class ResultStatus(enum.Enum): """ Overall status of the result test run @@ -20,7 +19,6 @@ class ResultStatus(enum.Enum): TIMEOUT = "timeout" - @dataclass class Result: results: Optional[Dict] = None From b940df3dbf0af1b39c5aa6d4666acfc0b3d8a604 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Tue, 18 Apr 2023 08:27:07 -0700 Subject: [PATCH 068/104] Rebase Signed-off-by: Cuong Nguyen --- release/ray_release/result.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/release/ray_release/result.py b/release/ray_release/result.py index 5c7869d7d149..ed476cf0e734 100644 --- a/release/ray_release/result.py +++ b/release/ray_release/result.py @@ -99,25 +99,6 @@ def _is_transient_error(result_status: ResultStatus, runtime: int) -> bool: return True -def _is_transient_error(result_status: ResultStatus, runtime: int) -> bool: - """ - Classify whether an infra-failure issue is a transient issue. This is based on - the status of its previous retries, and its runtime. 
- """ - if result_status not in [ResultStatus.INFRA_ERROR, ResultStatus.INFRA_TIMEOUT]: - # Not even an infra failure - return False - retry_count = int(os.environ.get("BUILDKITE_RETRY_COUNT", 0)) - max_retry = int(os.environ.get("BUILDKITE_MAX_RETRIES", 1)) - if retry_count >= max_retry: - # Already reach retry limit - return False - if runtime > int(os.environ.get("BUILDKITE_TIME_LIMIT_FOR_RETRY", 0)): - # Take too long to run - return False - return True - - def handle_exception( e: Exception, run_duration: int ) -> Tuple[ExitCode, ResultStatus, Optional[int]]: From 66eaa384903fec8c583abb401402217e35263dec Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Tue, 18 Apr 2023 13:32:56 -0700 Subject: [PATCH 069/104] Rebase Signed-off-by: Cuong Nguyen --- release/ray_release/job_manager/anyscale_job_manager.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/release/ray_release/job_manager/anyscale_job_manager.py b/release/ray_release/job_manager/anyscale_job_manager.py index 1b605b412e84..eeac25ff48c6 100644 --- a/release/ray_release/job_manager/anyscale_job_manager.py +++ b/release/ray_release/job_manager/anyscale_job_manager.py @@ -262,7 +262,8 @@ def run_and_wait( ) return self._wait_job(timeout) - def _get_ray_error_logs(self) -> Optional[str]: + @staticmethod + def _get_ray_error_logs(cluster_id: str) -> Optional[str]: """ Obtain any ray logs that contain keywords that indicate a crash, such as ERROR or Traceback @@ -333,7 +334,8 @@ def _get_logs(): print("", flush=True) output = buf.getvalue().strip() if "### Starting ###" not in output: - output = self._get_ray_error_logs() + output = AnyscaleJobManager._get_ray_error_logs( + self.cluster_manager.cluster_id) assert output, "No logs fetched" return "\n".join(output.splitlines()[-LAST_LOGS_LENGTH * 3 :]) From 3eac9ad08775573e923db68e6d6a1602f8182488 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Tue, 18 Apr 2023 14:59:38 -0700 Subject: [PATCH 070/104] Rebase Signed-off-by: Cuong Nguyen --- release/ray_release/job_manager/anyscale_job_manager.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/release/ray_release/job_manager/anyscale_job_manager.py b/release/ray_release/job_manager/anyscale_job_manager.py index eeac25ff48c6..1b605b412e84 100644 --- a/release/ray_release/job_manager/anyscale_job_manager.py +++ b/release/ray_release/job_manager/anyscale_job_manager.py @@ -262,8 +262,7 @@ def run_and_wait( ) return self._wait_job(timeout) - @staticmethod - def _get_ray_error_logs(cluster_id: str) -> Optional[str]: + def _get_ray_error_logs(self) -> Optional[str]: """ Obtain any ray logs that contain keywords that indicate a crash, such as ERROR or Traceback @@ -334,8 +333,7 @@ def _get_logs(): print("", flush=True) output = buf.getvalue().strip() if "### Starting ###" not in output: - output = AnyscaleJobManager._get_ray_error_logs( - self.cluster_manager.cluster_id) + output = self._get_ray_error_logs() assert output, "No logs fetched" return "\n".join(output.splitlines()[-LAST_LOGS_LENGTH * 3 :]) From 443b9f712732c8ebeb125b63cfc4b57ca28159aa Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Tue, 4 Apr 2023 10:37:11 -0700 Subject: [PATCH 071/104] Auto-retry for infrastructure errors Signed-off-by: Cuong Nguyen --- release/ray_release/glue.py | 447 ++++++++++++++++------------------ release/ray_release/result.py | 12 + 2 files changed, 225 insertions(+), 234 deletions(-) diff --git a/release/ray_release/glue.py b/release/ray_release/glue.py index 46ef68f45ce5..50494aeb5cf0 100644 --- 
a/release/ray_release/glue.py +++ b/release/ray_release/glue.py @@ -6,11 +6,9 @@ from ray_release.alerts.handle import handle_result, require_result from ray_release.anyscale_util import get_cluster_name from ray_release.buildkite.output import buildkite_group, buildkite_open_last -from ray_release.cluster_manager.cluster_manager import ClusterManager from ray_release.cluster_manager.full import FullClusterManager from ray_release.cluster_manager.minimal import MinimalClusterManager from ray_release.command_runner.job_runner import JobRunner -from ray_release.command_runner.command_runner import CommandRunner from ray_release.command_runner.anyscale_job_runner import AnyscaleJobRunner from ray_release.command_runner.sdk_runner import SDKRunner from ray_release.config import ( @@ -92,33 +90,43 @@ def _get_extra_tags_from_env() -> dict: return {key.lower(): os.getenv(key, "") for key in env_vars} -def _load_test_configuration( +def run_release_test( test: Test, anyscale_project: str, result: Result, ray_wheels_url: str, + reporters: Optional[List[Reporter]] = None, smoke_test: bool = False, + cluster_id: Optional[str] = None, + cluster_env_id: Optional[str] = None, no_terminate: bool = False, -) -> Tuple[ClusterManager, CommandRunner, str]: +) -> Result: + buildkite_group(":spiral_note_pad: Loading test configuration") + validate_test(test) + logger.info(f"Test config: {test}") - # Populate result paramaters result.wheels_url = ray_wheels_url result.stable = test.get("stable", True) result.smoke_test = smoke_test + buildkite_url = os.getenv("BUILDKITE_BUILD_URL", "") buildkite_job_id = os.getenv("BUILDKITE_JOB_ID", "") + if buildkite_url: buildkite_url += "#" + buildkite_job_id + result.buildkite_url = buildkite_url result.buildkite_job_id = buildkite_job_id - # Setting up working directory working_dir = test["working_dir"] + + old_wd = os.getcwd() new_wd = os.path.join(RELEASE_PACKAGE_DIR, working_dir) os.chdir(new_wd) + start_time = time.monotonic() run_type = test["run"].get("type", DEFAULT_RUN_TYPE) # Workaround while Anyscale Jobs don't support leaving cluster alive @@ -153,6 +161,7 @@ def _load_test_configuration( logger.info(f"Got command runner cls: {command_runner_cls}") logger.info(f"Got file manager cls: {file_manager_cls}") + # Extra tags to be set on resources on cloud provider's side extra_tags = _get_extra_tags_from_env() # We don't need other attributes as they can be derived from the name @@ -176,267 +185,230 @@ def _load_test_configuration( except Exception as e: raise ReleaseTestSetupError(f"Error setting up release test: {e}") from e - return cluster_manager, command_runner, artifact_path + pipeline_exception = None + # non critical for some tests. So separate it from the general one. 
+ fetch_result_exception = None + try: + setup_signal_handling() + # Load configs + cluster_env = load_test_cluster_env(test, ray_wheels_url=ray_wheels_url) + cluster_compute = load_test_cluster_compute(test) + if cluster_env_id: + try: + cluster_manager.cluster_env_id = cluster_env_id + cluster_manager.build_cluster_env() + cluster_manager.fetch_build_info() + logger.info( + "Using overridden cluster environment with ID " + f"{cluster_env_id} and build ID " + f"{cluster_manager.cluster_env_build_id}" + ) + except Exception as e: + raise ClusterEnvCreateError( + f"Could not get existing overridden cluster environment " + f"{cluster_env_id}: {e}" + ) from e + else: + cluster_manager.set_cluster_env(cluster_env) -def _setup_cluster_environment( - test: Test, - result: Result, - cluster_manager: ClusterManager, - ray_wheels_url: str, - cluster_env_id: Optional[str], -) -> Tuple[str, int, int, int, int]: - setup_signal_handling() - # Load configs - cluster_env = load_test_cluster_env(test, ray_wheels_url=ray_wheels_url) - cluster_compute = load_test_cluster_compute(test) - - if cluster_env_id: - try: - cluster_manager.cluster_env_id = cluster_env_id - cluster_manager.build_cluster_env() - cluster_manager.fetch_build_info() - logger.info( - "Using overridden cluster environment with ID " - f"{cluster_env_id} and build ID " - f"{cluster_manager.cluster_env_build_id}" - ) - except Exception as e: - raise ClusterEnvCreateError( - f"Could not get existing overridden cluster environment " - f"{cluster_env_id}: {e}" - ) from e - else: - cluster_manager.set_cluster_env(cluster_env) + # Load some timeouts + build_timeout = int(test["run"].get("build_timeout", DEFAULT_BUILD_TIMEOUT)) + command_timeout = int(test["run"].get("timeout", DEFAULT_COMMAND_TIMEOUT)) + cluster_timeout = int( + test["run"].get("session_timeout", DEFAULT_CLUSTER_TIMEOUT) + ) - # Load some timeouts - build_timeout = int(test["run"].get("build_timeout", DEFAULT_BUILD_TIMEOUT)) - command_timeout = int(test["run"].get("timeout", DEFAULT_COMMAND_TIMEOUT)) - cluster_timeout = int(test["run"].get("session_timeout", DEFAULT_CLUSTER_TIMEOUT)) + # Get prepare command timeout, if any + prepare_cmd = test["run"].get("prepare", None) + if prepare_cmd: + prepare_timeout = test["run"].get("prepare_timeout", command_timeout) + else: + prepare_timeout = 0 - # Get prepare command timeout, if any - prepare_cmd = test["run"].get("prepare", None) - if prepare_cmd: - prepare_timeout = test["run"].get("prepare_timeout", command_timeout) - else: - prepare_timeout = 0 + # Base maximum uptime on the combined command and prepare timeouts + command_and_prepare_timeout = command_timeout + prepare_timeout - # Base maximum uptime on the combined command and prepare timeouts - command_and_prepare_timeout = command_timeout + prepare_timeout + # Use default timeout = 0 here if wait_for_nodes is empty. This is to make + # sure we don't inflate the maximum_uptime_minutes too much if we don't wait + # for nodes at all. + # The actual default will be otherwise loaded further down. + wait_timeout = int(test["run"].get("wait_for_nodes", {}).get("timeout", 0)) - # Use default timeout = 0 here if wait_for_nodes is empty. This is to make - # sure we don't inflate the maximum_uptime_minutes too much if we don't wait - # for nodes at all. - # The actual default will be otherwise loaded further down. 
- wait_timeout = int(test["run"].get("wait_for_nodes", {}).get("timeout", 0)) + autosuspend_mins = test["cluster"].get("autosuspend_mins", None) + if autosuspend_mins: + cluster_manager.autosuspend_minutes = autosuspend_mins + autosuspend_base = autosuspend_mins + else: + cluster_manager.autosuspend_minutes = min( + DEFAULT_AUTOSUSPEND_MINS, + int(command_and_prepare_timeout / 60) + TIMEOUT_BUFFER_MINUTES, + ) + # Maximum uptime should be based on the command timeout, not the + # DEFAULT_AUTOSUSPEND_MINS + autosuspend_base = ( + int(command_and_prepare_timeout / 60) + TIMEOUT_BUFFER_MINUTES + ) - autosuspend_mins = test["cluster"].get("autosuspend_mins", None) - if autosuspend_mins: - cluster_manager.autosuspend_minutes = autosuspend_mins - autosuspend_base = autosuspend_mins - else: - cluster_manager.autosuspend_minutes = min( - DEFAULT_AUTOSUSPEND_MINS, - int(command_and_prepare_timeout / 60) + TIMEOUT_BUFFER_MINUTES, - ) - # Maximum uptime should be based on the command timeout, not the - # DEFAULT_AUTOSUSPEND_MINS - autosuspend_base = ( - int(command_and_prepare_timeout / 60) + TIMEOUT_BUFFER_MINUTES - ) + maximum_uptime_minutes = test["cluster"].get("maximum_uptime_minutes", None) + if maximum_uptime_minutes: + cluster_manager.maximum_uptime_minutes = maximum_uptime_minutes + else: + cluster_manager.maximum_uptime_minutes = ( + autosuspend_base + wait_timeout + TIMEOUT_BUFFER_MINUTES + ) - maximum_uptime_minutes = test["cluster"].get("maximum_uptime_minutes", None) - if maximum_uptime_minutes: - cluster_manager.maximum_uptime_minutes = maximum_uptime_minutes - else: - cluster_manager.maximum_uptime_minutes = ( - autosuspend_base + wait_timeout + TIMEOUT_BUFFER_MINUTES + # Set cluster compute here. Note that this may use timeouts provided + # above. + cluster_manager.set_cluster_compute( + cluster_compute, + extra_tags=extra_tags, ) - # Set cluster compute here. Note that this may use timeouts provided - # above. 
- cluster_manager.set_cluster_compute( - cluster_compute, - extra_tags=result.extra_tags, - ) + buildkite_group(":nut_and_bolt: Setting up local environment") + driver_setup_script = test.get("driver_setup", None) + if driver_setup_script: + try: + run_bash_script(driver_setup_script) + except Exception as e: + raise LocalEnvSetupError(f"Driver setup script failed: {e}") from e - return prepare_cmd, prepare_timeout, build_timeout, cluster_timeout, command_timeout + # Install local dependencies + command_runner.prepare_local_env(ray_wheels_url) + # Re-install anyscale package as local dependencies might have changed + # from local env setup + reinstall_anyscale_dependencies() -def _setup_local_environment( - test: Test, - command_runner: CommandRunner, - ray_wheels_url: str, -) -> None: - driver_setup_script = test.get("driver_setup", None) - if driver_setup_script: - try: - run_bash_script(driver_setup_script) - except Exception as e: - raise LocalEnvSetupError(f"Driver setup script failed: {e}") from e + # Print installed pip packages + buildkite_group(":bulb: Local environment information") + pip_packages = get_pip_packages() + pip_package_string = "\n".join(pip_packages) + logger.info(f"Installed python packages:\n{pip_package_string}") - # Install local dependencies - command_runner.prepare_local_env(ray_wheels_url) + if isinstance(cluster_manager, FullClusterManager): + if not no_terminate: + register_handler( + lambda sig, frame: cluster_manager.terminate_cluster(wait=True) + ) + + # Start cluster + if cluster_id: + buildkite_group(":rocket: Using existing cluster") + # Re-use existing cluster ID for development + cluster_manager.cluster_id = cluster_id + cluster_manager.cluster_name = get_cluster_name(cluster_id) + else: + buildkite_group(":gear: Building cluster environment") - # Re-install anyscale package as local dependencies might have changed - # from local env setup - reinstall_anyscale_dependencies() + if cluster_env_id: + cluster_manager.cluster_env_id = cluster_env_id + cluster_manager.build_configs(timeout=build_timeout) -def _local_environment_information( - result: Result, - cluster_manager: ClusterManager, - command_runner: CommandRunner, - build_timeout: int, - cluster_timeout: int, - no_terminate: bool, - cluster_id: Optional[str], - cluster_env_id: Optional[str], -) -> None: - pip_packages = get_pip_packages() - pip_package_string = "\n".join(pip_packages) - logger.info(f"Installed python packages:\n{pip_package_string}") - - if isinstance(cluster_manager, FullClusterManager): - if not no_terminate: - register_handler( - lambda sig, frame: cluster_manager.terminate_cluster(wait=True) - ) + if isinstance(cluster_manager, FullClusterManager): + buildkite_group(":rocket: Starting up cluster") + cluster_manager.start_cluster(timeout=cluster_timeout) + elif isinstance(command_runner, AnyscaleJobRunner): + command_runner.job_manager.cluster_startup_timeout = cluster_timeout - # Start cluster - if cluster_id: - buildkite_group(":rocket: Using existing cluster") - # Re-use existing cluster ID for development - cluster_manager.cluster_id = cluster_id - cluster_manager.cluster_name = get_cluster_name(cluster_id) - else: - buildkite_group(":gear: Building cluster environment") + result.cluster_url = cluster_manager.get_cluster_url() + result.cluster_id = cluster_manager.cluster_id - if cluster_env_id: - cluster_manager.cluster_env_id = cluster_env_id + # Upload files + buildkite_group(":wrench: Preparing remote environment") + command_runner.prepare_remote_env() - 
cluster_manager.build_configs(timeout=build_timeout) + wait_for_nodes = test["run"].get("wait_for_nodes", None) - if isinstance(cluster_manager, FullClusterManager): - buildkite_group(":rocket: Starting up cluster") - cluster_manager.start_cluster(timeout=cluster_timeout) - elif isinstance(command_runner, AnyscaleJobRunner): - command_runner.job_manager.cluster_startup_timeout = cluster_timeout + if wait_for_nodes: + buildkite_group(":stopwatch: Waiting for nodes to come up") + # Overwrite wait_timeout from above to account for better default + wait_timeout = int( + wait_for_nodes.get("timeout", DEFAULT_WAIT_FOR_NODES_TIMEOUT) + ) + num_nodes = test["run"]["wait_for_nodes"]["num_nodes"] + command_runner.wait_for_nodes(num_nodes, wait_timeout) - result.cluster_url = cluster_manager.get_cluster_url() - result.cluster_id = cluster_manager.cluster_id + if prepare_cmd: + try: + command_runner.run_prepare_command(prepare_cmd, timeout=prepare_timeout) + except CommandError as e: + raise PrepareCommandError(e) + except CommandTimeout as e: + raise PrepareCommandTimeout(e) + buildkite_group(":runner: Running test script") + command = test["run"]["script"] + command_env = {} -def _prepare_remote_environment( - test: Test, - command_runner: CommandRunner, - prepare_cmd: bool, - prepare_timeout: int, -) -> None: - command_runner.prepare_remote_env() - - wait_for_nodes = test["run"].get("wait_for_nodes", None) - - if wait_for_nodes: - buildkite_group(":stopwatch: Waiting for nodes to come up") - # Overwrite wait_timeout from above to account for better default - wait_timeout = int( - wait_for_nodes.get("timeout", DEFAULT_WAIT_FOR_NODES_TIMEOUT) - ) - num_nodes = test["run"]["wait_for_nodes"]["num_nodes"] - command_runner.wait_for_nodes(num_nodes, wait_timeout) + if smoke_test: + command = f"{command} --smoke-test" + command_env["IS_SMOKE_TEST"] = "1" + + is_long_running = test["run"].get("long_running", False) + + start_time_unix = time.time() - if prepare_cmd: try: - command_runner.run_prepare_command(prepare_cmd, timeout=prepare_timeout) + command_runner.run_command( + command, + env=command_env, + timeout=command_timeout, + raise_on_timeout=not is_long_running, + ) + except ( + TestCommandError, + PrepareCommandError, + TestCommandTimeout, + PrepareCommandTimeout, + ) as e: + raise e except CommandError as e: - raise PrepareCommandError(e) + raise TestCommandError(e) except CommandTimeout as e: - raise PrepareCommandTimeout(e) - + if not is_long_running: + # Only raise error if command is not long running + raise TestCommandTimeout(e) -def _running_test_script( - test: Test, - smoke_test: bool, - command_runner: CommandRunner, - command_timeout: int, -) -> None: - command = test["run"]["script"] - command_env = {} - - if smoke_test: - command = f"{command} --smoke-test" - command_env["IS_SMOKE_TEST"] = "1" + buildkite_group(":floppy_disk: Fetching results") + try: + command_results = command_runner.fetch_results() + except Exception as e: + logger.exception(f"Could not fetch results for test command: {e}") + command_results = {} + fetch_result_exception = e - is_long_running = test["run"].get("long_running", False) + if artifact_path: + try: + command_runner.fetch_artifact() + except Exception as e: + logger.error("Could not fetch artifact for test command") + logger.exception(e) - try: - command_runner.run_command( - command, - env=command_env, - timeout=command_timeout, - raise_on_timeout=not is_long_running, - ) - except ( - TestCommandError, - PrepareCommandError, - TestCommandTimeout, - 
PrepareCommandTimeout, - ) as e: - raise e - except CommandError as e: - raise TestCommandError(e) - except CommandTimeout as e: - if not is_long_running: - # Only raise error if command is not long running - raise TestCommandTimeout(e) - - -def _fetching_results( - result: Result, - command_runner: CommandRunner, - artifact_path: Optional[str], - smoke_test: bool, - start_time_unix: int, -) -> Tuple[dict, Exception]: - fetch_result_exception = None - try: - command_results = command_runner.fetch_results() - except Exception as e: - logger.exception(f"Could not fetch results for test command: {e}") - command_results = {} - fetch_result_exception = e + # Postprocess result: + if "last_update" in command_results: + command_results["last_update_diff"] = time.time() - command_results.get( + "last_update", 0.0 + ) - if artifact_path: try: - command_runner.fetch_artifact() + # Logic duplicated in ray_release/command_runner/_anyscale_job_wrapper.py + # Timeout is the time the test took divided by 200 + # (~7 minutes for a 24h test) but no less than 30s + # and no more than 900s + metrics_timeout = max(30, min((time.time() - start_time_unix) / 200, 900)) + command_runner.save_metrics(start_time_unix, timeout=metrics_timeout) + metrics = command_runner.fetch_metrics() except Exception as e: - logger.error("Could not fetch artifact for test command") - logger.exception(e) - - # Postprocess result: - if "last_update" in command_results: - command_results["last_update_diff"] = time.time() - command_results.get( - "last_update", 0.0 - ) - - try: - # Logic duplicated in ray_release/command_runner/_anyscale_job_wrapper.py - # Timeout is the time the test took divided by 200 - # (~7 minutes for a 24h test) but no less than 30s - # and no more than 900s - metrics_timeout = max(30, min((time.time() - start_time_unix) / 200, 900)) - command_runner.save_metrics(start_time_unix, timeout=metrics_timeout) - metrics = command_runner.fetch_metrics() - except Exception as e: - logger.exception(f"Could not fetch metrics for test command: {e}") - metrics = {} + logger.exception(f"Could not fetch metrics for test command: {e}") + metrics = {} - if smoke_test: - command_results["smoke_test"] = True + if smoke_test: + command_results["smoke_test"] = True - result.results = command_results - result.status = "finished" + result.results = command_results + result.status = "finished" return metrics, fetch_result_exception @@ -544,7 +516,10 @@ def run_release_test( if not no_terminate and cluster_manager: buildkite_group(":earth_africa: Terminating cluster") - cluster_manager.terminate_cluster(wait=False) + try: + cluster_manager.terminate_cluster(wait=False) + except Exception as e: + logger.exception(f"Could not terminate cluster: {e}") if hasattr(command_runner, "cleanup"): command_runner.cleanup() @@ -586,8 +561,12 @@ def run_release_test( result.last_logs = traceback.format_exc() buildkite_group(":memo: Reporting results", open=True) - for reporter in reporters or []: - reporter.report_result(test, result) + reporters = reporters or [] + for reporter in reporters: + try: + reporter.report_result(test, result) + except Exception as e: + logger.exception(f"Error reporting results via {type(reporter)}: {e}") if pipeline_exception: raise pipeline_exception diff --git a/release/ray_release/result.py b/release/ray_release/result.py index ed476cf0e734..4c2667b892ff 100644 --- a/release/ray_release/result.py +++ b/release/ray_release/result.py @@ -79,6 +79,18 @@ class ExitCode(enum.Enum): COMMAND_TIMEOUT = 42 PREPARE_ERROR = 43 
+class BuildkiteExitCode(enum.Enum): + """ + Final exit code the test runner passes to buildkite-agent. This exit code is used + to determine job policies, such as automatic retries + """ + SUCCESS = 0 + UNKNOWN = 1 + TRANSIENT_INFRA_ERROR = 10 + INFRA_ERROR = 11 + INFRA_TIMEOUT = 30 + ERROR = 40 + TIMEOUT = 42 def _is_transient_error(result_status: ResultStatus, runtime: int) -> bool: """ From 5d04eef369dd80604107768612c87315883ebad0 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Wed, 5 Apr 2023 16:27:22 -0700 Subject: [PATCH 072/104] Fix unit tests Signed-off-by: Cuong Nguyen --- release/ray_release/tests/test_glue.py | 109 ++++++++----------------- 1 file changed, 36 insertions(+), 73 deletions(-) diff --git a/release/ray_release/tests/test_glue.py b/release/ray_release/tests/test_glue.py index 1cf9cdcf1dd2..874b7a376b3e 100644 --- a/release/ray_release/tests/test_glue.py +++ b/release/ray_release/tests/test_glue.py @@ -24,19 +24,13 @@ ClusterEnvBuildError, ClusterEnvBuildTimeout, ClusterEnvCreateError, - ClusterCreationError, ClusterStartupError, ClusterStartupTimeout, RemoteEnvSetupError, CommandError, - PrepareCommandError, CommandTimeout, - PrepareCommandTimeout, - TestCommandError, - TestCommandTimeout, FetchResultError, LogsError, - ResultsAlert, ClusterNodesWaitTimeout, ) from ray_release.file_manager.file_manager import FileManager @@ -251,7 +245,7 @@ def _succeed_until(self, until: str): self.mock_alert_return = None - def _run(self, result: Result, **kwargs): + def _run(self, result: Result, **kwargs) -> Result: run_release_test( test=self.test, anyscale_project=self.anyscale_project, @@ -267,26 +261,23 @@ def testInvalidClusterEnv(self): with patch( "ray_release.glue.load_test_cluster_env", _fail_on_call(ReleaseTestConfigError), - ), self.assertRaises(ReleaseTestConfigError): + ): self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) # Fails because file not found os.unlink(os.path.join(self.tempdir, "cluster_env.yaml")) - with self.assertRaisesRegex(ReleaseTestConfigError, "Path not found"): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) # Fails because invalid jinja template self.writeClusterEnv("{{ INVALID") - with self.assertRaisesRegex(ReleaseTestConfigError, "yaml template"): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) # Fails because invalid json self.writeClusterEnv("{'test': true, 'fail}") - with self.assertRaisesRegex(ReleaseTestConfigError, "quoted scalar"): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) def testInvalidClusterCompute(self): @@ -295,26 +286,23 @@ def testInvalidClusterCompute(self): with patch( "ray_release.glue.load_test_cluster_compute", _fail_on_call(ReleaseTestConfigError), - ), self.assertRaises(ReleaseTestConfigError): + ): self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) # Fails because file not found os.unlink(os.path.join(self.tempdir, "cluster_compute.yaml")) - with self.assertRaisesRegex(ReleaseTestConfigError, "Path not found"): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) # Fails because invalid jinja template self.writeClusterCompute("{{ INVALID") - with self.assertRaisesRegex(ReleaseTestConfigError, "yaml template"): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) # Fails 
because invalid json self.writeClusterCompute("{'test': true, 'fail}") - with self.assertRaisesRegex(ReleaseTestConfigError, "quoted scalar"): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) @@ -323,9 +311,8 @@ def testAutomaticClusterEnvVariables(self): self._succeed_until("local_env") - with self.assertRaises(LocalEnvSetupError): - self._run(result) - + self._run(result) + self.assertEqual(result.return_code, LocalEnvSetupError().exit_code.value) cluster_manager = self.instances["cluster_manager"] command_timeout = self.test["run"].get("timeout", DEFAULT_COMMAND_TIMEOUT) @@ -362,8 +349,7 @@ def testInvalidPrepareLocalEnv(self): self.command_runner_return["prepare_local_env"] = _fail_on_call( LocalEnvSetupError ) - with self.assertRaises(LocalEnvSetupError): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.LOCAL_ENV_SETUP_ERROR.value) def testDriverSetupFails(self): @@ -371,8 +357,7 @@ def testDriverSetupFails(self): self._succeed_until("local_env") - with self.assertRaises(LocalEnvSetupError): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.LOCAL_ENV_SETUP_ERROR.value) def testInvalidClusterIdOverride(self): @@ -382,16 +367,14 @@ def testInvalidClusterIdOverride(self): self.sdk.returns["get_cluster_environment"] = None - with self.assertRaises(ClusterEnvCreateError): - self._run(result, cluster_env_id="existing") + self._run(result, cluster_env_id="existing") self.sdk.returns["get_cluster_environment"] = APIDict( result=APIDict(config_json={"overridden": True}) ) - with self.assertRaises(Exception) as cm: # Fail somewhere else - self._run(result, cluster_env_id="existing") - self.assertNotIsInstance(cm.exception, ClusterEnvCreateError) + self._run(result, cluster_env_id="existing") + self.assertNotEqual(result.return_code, ClusterEnvCreateError().exit_code) def testBuildConfigFailsClusterCompute(self): result = Result() @@ -402,16 +385,14 @@ def testBuildConfigFailsClusterCompute(self): self.command_runner_return["prepare_local_env"] = None # Fails because API response faulty - with self.assertRaisesRegex(ClusterComputeCreateError, "Unexpected"): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_RESOURCE_ERROR.value) # Fails for random cluster compute reason self.cluster_manager_return["create_cluster_compute"] = _fail_on_call( ClusterComputeCreateError, "Known" ) - with self.assertRaisesRegex(ClusterComputeCreateError, "Known"): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_RESOURCE_ERROR.value) def testBuildConfigFailsClusterEnv(self): @@ -419,17 +400,14 @@ def testBuildConfigFailsClusterEnv(self): self._succeed_until("cluster_compute") - # Fails because API response faulty - with self.assertRaisesRegex(ClusterEnvCreateError, "Unexpected"): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_RESOURCE_ERROR.value) # Fails for random cluster env create reason self.cluster_manager_return["create_cluster_env"] = _fail_on_call( ClusterEnvCreateError, "Known" ) - with self.assertRaisesRegex(ClusterEnvCreateError, "Known"): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_RESOURCE_ERROR.value) # Now, succeed creation but fail on cluster env build @@ -438,16 +416,14 @@ def testBuildConfigFailsClusterEnv(self): self.cluster_manager_return["build_cluster_env"] = _fail_on_call( 
ClusterEnvBuildError ) - with self.assertRaises(ClusterEnvBuildError): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_ENV_BUILD_ERROR.value) # Now, fail on cluster env timeout self.cluster_manager_return["build_cluster_env"] = _fail_on_call( ClusterEnvBuildTimeout ) - with self.assertRaises(ClusterEnvBuildTimeout): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_ENV_BUILD_TIMEOUT.value) def testStartClusterFails(self): @@ -456,8 +432,7 @@ def testStartClusterFails(self): self._succeed_until("cluster_env") # Fails because API response faulty - with self.assertRaises(ClusterCreationError): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_RESOURCE_ERROR.value) self.cluster_manager_return["cluster_id"] = "valid" @@ -466,8 +441,7 @@ def testStartClusterFails(self): self.cluster_manager_return["start_cluster"] = _fail_on_call( ClusterStartupError ) - with self.assertRaises(ClusterStartupError): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_STARTUP_ERROR.value) # Ensure cluster was terminated @@ -477,8 +451,7 @@ def testStartClusterFails(self): self.cluster_manager_return["start_cluster"] = _fail_on_call( ClusterStartupTimeout ) - with self.assertRaises(ClusterStartupTimeout): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_STARTUP_TIMEOUT.value) # Ensure cluster was terminated @@ -492,8 +465,7 @@ def testPrepareRemoteEnvFails(self): self.command_runner_return["prepare_remote_env"] = _fail_on_call( RemoteEnvSetupError ) - with self.assertRaises(RemoteEnvSetupError): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.REMOTE_ENV_SETUP_ERROR.value) # Ensure cluster was terminated @@ -508,8 +480,7 @@ def testWaitForNodesFails(self): self.command_runner_return["wait_for_nodes"] = _fail_on_call( ClusterNodesWaitTimeout ) - with self.assertRaises(ClusterNodesWaitTimeout): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_WAIT_TIMEOUT.value) # Ensure cluster was terminated @@ -522,16 +493,14 @@ def testPrepareCommandFails(self): # Prepare command fails self.command_runner_return["run_prepare_command"] = _fail_on_call(CommandError) - with self.assertRaises(PrepareCommandError): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.PREPARE_ERROR.value) # Prepare command times out self.command_runner_return["run_prepare_command"] = _fail_on_call( CommandTimeout ) - with self.assertRaises(PrepareCommandTimeout): - self._run(result) + self._run(result) # Special case: Prepare commands are usually waiting for nodes # (this may change in the future!) 
self.assertEqual(result.return_code, ExitCode.CLUSTER_WAIT_TIMEOUT.value) @@ -546,14 +515,12 @@ def testTestCommandFails(self): # Test command fails self.command_runner_return["run_command"] = _fail_on_call(CommandError) - with self.assertRaises(TestCommandError): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.COMMAND_ERROR.value) # Test command times out self.command_runner_return["run_command"] = _fail_on_call(CommandTimeout) - with self.assertRaises(TestCommandTimeout): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.COMMAND_TIMEOUT.value) # Ensure cluster was terminated @@ -566,8 +533,7 @@ def testTestCommandTimeoutLongRunning(self): # Test command times out self.command_runner_return["run_command"] = _fail_on_call(CommandTimeout) - with self.assertRaises(TestCommandTimeout): - self._run(result) + self._run(result) self.assertEqual(result.return_code, ExitCode.COMMAND_TIMEOUT.value) # But now set test to long running @@ -616,10 +582,9 @@ def testFetchResultFailsReqNonEmptyResult(self): self._succeed_until("test_command") self.command_runner_return["fetch_results"] = _fail_on_call(FetchResultError) - with self.assertRaisesRegex(FetchResultError, "Fail"): - with self.assertLogs(logger, "ERROR") as cm: - self._run(result) - self.assertTrue(any("Could not fetch results" in o for o in cm.output)) + with self.assertLogs(logger, "ERROR") as cm: + self._run(result) + self.assertTrue(any("Could not fetch results" in o for o in cm.output)) self.assertEqual(result.return_code, ExitCode.FETCH_RESULT_ERROR.value) self.assertEqual(result.status, "infra_error") @@ -649,9 +614,7 @@ def testAlertFails(self): self.mock_alert_return = "Alert raised" - with self.assertRaises(ResultsAlert): - self._run(result) - + self._run(result) self.assertEqual(result.return_code, ExitCode.COMMAND_ALERT.value) self.assertEqual(result.status, "error") From 2aba338173ee9b7b0a7e87ca5f80828412b12e6b Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Thu, 13 Apr 2023 13:04:03 -0700 Subject: [PATCH 073/104] Undo more changes Signed-off-by: Cuong Nguyen --- release/ray_release/glue.py | 3 + release/ray_release/tests/test_glue.py | 109 +++++++++++++++++-------- 2 files changed, 76 insertions(+), 36 deletions(-) diff --git a/release/ray_release/glue.py b/release/ray_release/glue.py index 50494aeb5cf0..188b1bab7354 100644 --- a/release/ray_release/glue.py +++ b/release/ray_release/glue.py @@ -571,4 +571,7 @@ def run_release_test( if pipeline_exception: raise pipeline_exception + if pipeline_exception: + raise pipeline_exception + return result diff --git a/release/ray_release/tests/test_glue.py b/release/ray_release/tests/test_glue.py index 874b7a376b3e..1cf9cdcf1dd2 100644 --- a/release/ray_release/tests/test_glue.py +++ b/release/ray_release/tests/test_glue.py @@ -24,13 +24,19 @@ ClusterEnvBuildError, ClusterEnvBuildTimeout, ClusterEnvCreateError, + ClusterCreationError, ClusterStartupError, ClusterStartupTimeout, RemoteEnvSetupError, CommandError, + PrepareCommandError, CommandTimeout, + PrepareCommandTimeout, + TestCommandError, + TestCommandTimeout, FetchResultError, LogsError, + ResultsAlert, ClusterNodesWaitTimeout, ) from ray_release.file_manager.file_manager import FileManager @@ -245,7 +251,7 @@ def _succeed_until(self, until: str): self.mock_alert_return = None - def _run(self, result: Result, **kwargs) -> Result: + def _run(self, result: Result, **kwargs): run_release_test( test=self.test, anyscale_project=self.anyscale_project, @@ -261,23 
+267,26 @@ def testInvalidClusterEnv(self): with patch( "ray_release.glue.load_test_cluster_env", _fail_on_call(ReleaseTestConfigError), - ): + ), self.assertRaises(ReleaseTestConfigError): self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) # Fails because file not found os.unlink(os.path.join(self.tempdir, "cluster_env.yaml")) - self._run(result) + with self.assertRaisesRegex(ReleaseTestConfigError, "Path not found"): + self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) # Fails because invalid jinja template self.writeClusterEnv("{{ INVALID") - self._run(result) + with self.assertRaisesRegex(ReleaseTestConfigError, "yaml template"): + self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) # Fails because invalid json self.writeClusterEnv("{'test': true, 'fail}") - self._run(result) + with self.assertRaisesRegex(ReleaseTestConfigError, "quoted scalar"): + self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) def testInvalidClusterCompute(self): @@ -286,23 +295,26 @@ def testInvalidClusterCompute(self): with patch( "ray_release.glue.load_test_cluster_compute", _fail_on_call(ReleaseTestConfigError), - ): + ), self.assertRaises(ReleaseTestConfigError): self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) # Fails because file not found os.unlink(os.path.join(self.tempdir, "cluster_compute.yaml")) - self._run(result) + with self.assertRaisesRegex(ReleaseTestConfigError, "Path not found"): + self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) # Fails because invalid jinja template self.writeClusterCompute("{{ INVALID") - self._run(result) + with self.assertRaisesRegex(ReleaseTestConfigError, "yaml template"): + self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) # Fails because invalid json self.writeClusterCompute("{'test': true, 'fail}") - self._run(result) + with self.assertRaisesRegex(ReleaseTestConfigError, "quoted scalar"): + self._run(result) self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) @@ -311,8 +323,9 @@ def testAutomaticClusterEnvVariables(self): self._succeed_until("local_env") - self._run(result) - self.assertEqual(result.return_code, LocalEnvSetupError().exit_code.value) + with self.assertRaises(LocalEnvSetupError): + self._run(result) + cluster_manager = self.instances["cluster_manager"] command_timeout = self.test["run"].get("timeout", DEFAULT_COMMAND_TIMEOUT) @@ -349,7 +362,8 @@ def testInvalidPrepareLocalEnv(self): self.command_runner_return["prepare_local_env"] = _fail_on_call( LocalEnvSetupError ) - self._run(result) + with self.assertRaises(LocalEnvSetupError): + self._run(result) self.assertEqual(result.return_code, ExitCode.LOCAL_ENV_SETUP_ERROR.value) def testDriverSetupFails(self): @@ -357,7 +371,8 @@ def testDriverSetupFails(self): self._succeed_until("local_env") - self._run(result) + with self.assertRaises(LocalEnvSetupError): + self._run(result) self.assertEqual(result.return_code, ExitCode.LOCAL_ENV_SETUP_ERROR.value) def testInvalidClusterIdOverride(self): @@ -367,14 +382,16 @@ def testInvalidClusterIdOverride(self): self.sdk.returns["get_cluster_environment"] = None - self._run(result, cluster_env_id="existing") + with self.assertRaises(ClusterEnvCreateError): + self._run(result, cluster_env_id="existing") self.sdk.returns["get_cluster_environment"] = APIDict( result=APIDict(config_json={"overridden": True}) ) - 
self._run(result, cluster_env_id="existing") - self.assertNotEqual(result.return_code, ClusterEnvCreateError().exit_code) + with self.assertRaises(Exception) as cm: # Fail somewhere else + self._run(result, cluster_env_id="existing") + self.assertNotIsInstance(cm.exception, ClusterEnvCreateError) def testBuildConfigFailsClusterCompute(self): result = Result() @@ -385,14 +402,16 @@ def testBuildConfigFailsClusterCompute(self): self.command_runner_return["prepare_local_env"] = None # Fails because API response faulty - self._run(result) + with self.assertRaisesRegex(ClusterComputeCreateError, "Unexpected"): + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_RESOURCE_ERROR.value) # Fails for random cluster compute reason self.cluster_manager_return["create_cluster_compute"] = _fail_on_call( ClusterComputeCreateError, "Known" ) - self._run(result) + with self.assertRaisesRegex(ClusterComputeCreateError, "Known"): + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_RESOURCE_ERROR.value) def testBuildConfigFailsClusterEnv(self): @@ -400,14 +419,17 @@ def testBuildConfigFailsClusterEnv(self): self._succeed_until("cluster_compute") - self._run(result) + # Fails because API response faulty + with self.assertRaisesRegex(ClusterEnvCreateError, "Unexpected"): + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_RESOURCE_ERROR.value) # Fails for random cluster env create reason self.cluster_manager_return["create_cluster_env"] = _fail_on_call( ClusterEnvCreateError, "Known" ) - self._run(result) + with self.assertRaisesRegex(ClusterEnvCreateError, "Known"): + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_RESOURCE_ERROR.value) # Now, succeed creation but fail on cluster env build @@ -416,14 +438,16 @@ def testBuildConfigFailsClusterEnv(self): self.cluster_manager_return["build_cluster_env"] = _fail_on_call( ClusterEnvBuildError ) - self._run(result) + with self.assertRaises(ClusterEnvBuildError): + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_ENV_BUILD_ERROR.value) # Now, fail on cluster env timeout self.cluster_manager_return["build_cluster_env"] = _fail_on_call( ClusterEnvBuildTimeout ) - self._run(result) + with self.assertRaises(ClusterEnvBuildTimeout): + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_ENV_BUILD_TIMEOUT.value) def testStartClusterFails(self): @@ -432,7 +456,8 @@ def testStartClusterFails(self): self._succeed_until("cluster_env") # Fails because API response faulty - self._run(result) + with self.assertRaises(ClusterCreationError): + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_RESOURCE_ERROR.value) self.cluster_manager_return["cluster_id"] = "valid" @@ -441,7 +466,8 @@ def testStartClusterFails(self): self.cluster_manager_return["start_cluster"] = _fail_on_call( ClusterStartupError ) - self._run(result) + with self.assertRaises(ClusterStartupError): + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_STARTUP_ERROR.value) # Ensure cluster was terminated @@ -451,7 +477,8 @@ def testStartClusterFails(self): self.cluster_manager_return["start_cluster"] = _fail_on_call( ClusterStartupTimeout ) - self._run(result) + with self.assertRaises(ClusterStartupTimeout): + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_STARTUP_TIMEOUT.value) # Ensure cluster was terminated @@ -465,7 +492,8 @@ def testPrepareRemoteEnvFails(self): self.command_runner_return["prepare_remote_env"] = 
_fail_on_call( RemoteEnvSetupError ) - self._run(result) + with self.assertRaises(RemoteEnvSetupError): + self._run(result) self.assertEqual(result.return_code, ExitCode.REMOTE_ENV_SETUP_ERROR.value) # Ensure cluster was terminated @@ -480,7 +508,8 @@ def testWaitForNodesFails(self): self.command_runner_return["wait_for_nodes"] = _fail_on_call( ClusterNodesWaitTimeout ) - self._run(result) + with self.assertRaises(ClusterNodesWaitTimeout): + self._run(result) self.assertEqual(result.return_code, ExitCode.CLUSTER_WAIT_TIMEOUT.value) # Ensure cluster was terminated @@ -493,14 +522,16 @@ def testPrepareCommandFails(self): # Prepare command fails self.command_runner_return["run_prepare_command"] = _fail_on_call(CommandError) - self._run(result) + with self.assertRaises(PrepareCommandError): + self._run(result) self.assertEqual(result.return_code, ExitCode.PREPARE_ERROR.value) # Prepare command times out self.command_runner_return["run_prepare_command"] = _fail_on_call( CommandTimeout ) - self._run(result) + with self.assertRaises(PrepareCommandTimeout): + self._run(result) # Special case: Prepare commands are usually waiting for nodes # (this may change in the future!) self.assertEqual(result.return_code, ExitCode.CLUSTER_WAIT_TIMEOUT.value) @@ -515,12 +546,14 @@ def testTestCommandFails(self): # Test command fails self.command_runner_return["run_command"] = _fail_on_call(CommandError) - self._run(result) + with self.assertRaises(TestCommandError): + self._run(result) self.assertEqual(result.return_code, ExitCode.COMMAND_ERROR.value) # Test command times out self.command_runner_return["run_command"] = _fail_on_call(CommandTimeout) - self._run(result) + with self.assertRaises(TestCommandTimeout): + self._run(result) self.assertEqual(result.return_code, ExitCode.COMMAND_TIMEOUT.value) # Ensure cluster was terminated @@ -533,7 +566,8 @@ def testTestCommandTimeoutLongRunning(self): # Test command times out self.command_runner_return["run_command"] = _fail_on_call(CommandTimeout) - self._run(result) + with self.assertRaises(TestCommandTimeout): + self._run(result) self.assertEqual(result.return_code, ExitCode.COMMAND_TIMEOUT.value) # But now set test to long running @@ -582,9 +616,10 @@ def testFetchResultFailsReqNonEmptyResult(self): self._succeed_until("test_command") self.command_runner_return["fetch_results"] = _fail_on_call(FetchResultError) - with self.assertLogs(logger, "ERROR") as cm: - self._run(result) - self.assertTrue(any("Could not fetch results" in o for o in cm.output)) + with self.assertRaisesRegex(FetchResultError, "Fail"): + with self.assertLogs(logger, "ERROR") as cm: + self._run(result) + self.assertTrue(any("Could not fetch results" in o for o in cm.output)) self.assertEqual(result.return_code, ExitCode.FETCH_RESULT_ERROR.value) self.assertEqual(result.status, "infra_error") @@ -614,7 +649,9 @@ def testAlertFails(self): self.mock_alert_return = "Alert raised" - self._run(result) + with self.assertRaises(ResultsAlert): + self._run(result) + self.assertEqual(result.return_code, ExitCode.COMMAND_ALERT.value) self.assertEqual(result.status, "error") From 22dee5f82489ae9ff866be2fa38ebf6048ba5369 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Wed, 5 Apr 2023 14:01:54 -0700 Subject: [PATCH 074/104] Fix lints Signed-off-by: Cuong Nguyen --- release/ray_release/result.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/release/ray_release/result.py b/release/ray_release/result.py index 4c2667b892ff..67370aba6a86 100644 --- 
a/release/ray_release/result.py +++ b/release/ray_release/result.py @@ -79,18 +79,21 @@ class ExitCode(enum.Enum): COMMAND_TIMEOUT = 42 PREPARE_ERROR = 43 -class BuildkiteExitCode(enum.Enum): - """ - Final exit code the test runner passes to buildkite-agent. This exit code is used - to determine job policies, such as automatic retries - """ - SUCCESS = 0 - UNKNOWN = 1 - TRANSIENT_INFRA_ERROR = 10 - INFRA_ERROR = 11 - INFRA_TIMEOUT = 30 - ERROR = 40 - TIMEOUT = 42 + +def _is_transient_error(runtime: int) -> bool: + """ + Classify whether an infra-failure issue is a transient issue. This is based on + the status of its previous retries, and its runtime. + """ + retry_count = int(os.environ.get("BUILDKITE_RETRY_COUNT", "0")) + if retry_count > 0: + # Already retried at least once and failed again, not a transient issue + return False + if runtime > 30 * 60: + # Take too long to run + return False + return True + def _is_transient_error(result_status: ResultStatus, runtime: int) -> bool: """ From 64a1c9e4b43c522892f1453a19bbefcae684f688 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Thu, 13 Apr 2023 13:58:27 -0700 Subject: [PATCH 075/104] Rebase Signed-off-by: Cuong Nguyen --- release/ray_release/result.py | 14 +++++++++----- release/run_release_test.sh | 2 ++ 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/release/ray_release/result.py b/release/ray_release/result.py index 67370aba6a86..33b67611e676 100644 --- a/release/ray_release/result.py +++ b/release/ray_release/result.py @@ -80,16 +80,20 @@ class ExitCode(enum.Enum): PREPARE_ERROR = 43 -def _is_transient_error(runtime: int) -> bool: +def _is_transient_error(result_status: ResultStatus, runtime: int) -> bool: """ Classify whether an infra-failure issue is a transient issue. This is based on the status of its previous retries, and its runtime. """ - retry_count = int(os.environ.get("BUILDKITE_RETRY_COUNT", "0")) - if retry_count > 0: - # Already retried at least once and failed again, not a transient issue + if result_status not in [ResultStatus.INFRA_ERROR, ResultStatus.INFRA_TIMEOUT]: + # Not even an infra failure + return False + retry_count = int(os.environ.get("BUILDKITE_RETRY_COUNT", 0)) + max_retry = int(os.environ.get("BUILDKITE_MAX_RETRIES", 1)) + if retry_count >= max_retry: + # Already reach retry limit return False - if runtime > 30 * 60: + if runtime > os.environ.get("BUILDKITE_TIME_LIMIT_FOR_RETRY", 0): # Take too long to run return False return True diff --git a/release/run_release_test.sh b/release/run_release_test.sh index 52b157a80c8f..95243e2b8826 100755 --- a/release/run_release_test.sh +++ b/release/run_release_test.sh @@ -133,11 +133,13 @@ while [ "$RETRY_NUM" -lt "$MAX_RETRIES" ]; do START=$(date +%s) set +e + START=`date +%s` trap _term SIGINT SIGTERM python "${RAY_TEST_SCRIPT}" "$@" & proc=$! wait "$proc" + END=`date +%s` EXIT_CODE=$? 
set -e From ef09f03fe03d67851706fba5c6c1c851db2a4082 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Thu, 13 Apr 2023 15:11:26 -0700 Subject: [PATCH 076/104] fix sh Signed-off-by: Cuong Nguyen --- release/ray_release/result.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/release/ray_release/result.py b/release/ray_release/result.py index 33b67611e676..e521183f8e9c 100644 --- a/release/ray_release/result.py +++ b/release/ray_release/result.py @@ -93,7 +93,7 @@ def _is_transient_error(result_status: ResultStatus, runtime: int) -> bool: if retry_count >= max_retry: # Already reach retry limit return False - if runtime > os.environ.get("BUILDKITE_TIME_LIMIT_FOR_RETRY", 0): + if runtime > int(os.environ.get("BUILDKITE_TIME_LIMIT_FOR_RETRY", 0)): # Take too long to run return False return True From d88b4781673dd1c18688fbe6f25c8a204bcc711b Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Thu, 13 Apr 2023 16:32:51 -0700 Subject: [PATCH 077/104] Fix sh again Signed-off-by: Cuong Nguyen --- release/run_release_test.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/release/run_release_test.sh b/release/run_release_test.sh index 95243e2b8826..52b157a80c8f 100755 --- a/release/run_release_test.sh +++ b/release/run_release_test.sh @@ -133,13 +133,11 @@ while [ "$RETRY_NUM" -lt "$MAX_RETRIES" ]; do START=$(date +%s) set +e - START=`date +%s` trap _term SIGINT SIGTERM python "${RAY_TEST_SCRIPT}" "$@" & proc=$! wait "$proc" - END=`date +%s` EXIT_CODE=$? set -e From e234c1dd267df26a95ae40793bd84a525ff1cb36 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Tue, 4 Apr 2023 10:59:29 -0700 Subject: [PATCH 078/104] Exit buildkite job using buildkite return code Signed-off-by: Cuong Nguyen --- release/ray_release/buildkite/step.py | 11 +++++++++++ release/ray_release/glue.py | 3 --- release/ray_release/result.py | 1 + release/ray_release/scripts/run_release_test.py | 4 ++-- 4 files changed, 14 insertions(+), 5 deletions(-) diff --git a/release/ray_release/buildkite/step.py b/release/ray_release/buildkite/step.py index a13bde1575d8..9fcae90beebd 100644 --- a/release/ray_release/buildkite/step.py +++ b/release/ray_release/buildkite/step.py @@ -15,6 +15,7 @@ from ray_release.env import DEFAULT_ENVIRONMENT, load_environment from ray_release.template import get_test_env_var from ray_release.util import python_version_str, DeferredEnvVar +from ray_release.result import BuildkiteExitCode DEFAULT_ARTIFACTS_DIR_HOST = "/tmp/ray_release_test_artifacts" @@ -121,6 +122,16 @@ def get_step( if test.get("run", {}).get("type") == "client": step["agents"]["queue"] = str(RELEASE_QUEUE_CLIENT) + # Auto-retry on transient infra error (according to result.BuildkiteExitCode) + step["retry"] = { + "automatic": [ + { + "exit_status": BuildkiteExitCode.TRANSIENT_INFRA_ERROR, + "limit": 2, + } + ] + } + # If a test is not stable, allow to soft fail stable = test.get("stable", True) if not stable: diff --git a/release/ray_release/glue.py b/release/ray_release/glue.py index 188b1bab7354..50494aeb5cf0 100644 --- a/release/ray_release/glue.py +++ b/release/ray_release/glue.py @@ -571,7 +571,4 @@ def run_release_test( if pipeline_exception: raise pipeline_exception - if pipeline_exception: - raise pipeline_exception - return result diff --git a/release/ray_release/result.py b/release/ray_release/result.py index e521183f8e9c..b0cb7148c000 100644 --- a/release/ray_release/result.py +++ b/release/ray_release/result.py @@ -31,6 +31,7 @@ class Result: stable: bool = True smoke_test: bool = False + buildkite_return_code: 
BuildkiteExitCode.SUCCESS buildkite_url: Optional[str] = None wheels_url: Optional[str] = None cluster_url: Optional[str] = None diff --git a/release/ray_release/scripts/run_release_test.py b/release/ray_release/scripts/run_release_test.py index 449dee26557d..c7de484b382a 100644 --- a/release/ray_release/scripts/run_release_test.py +++ b/release/ray_release/scripts/run_release_test.py @@ -166,9 +166,9 @@ def main( return_code = e.exit_code.value logger.info( f"Release test pipeline for test {test['name']} completed. " - f"Returning with exit code = {return_code}" + f"Returning with exit code = {result.return_code}" ) - sys.exit(return_code) + sys.exit(result.buildkite_return_code) if __name__ == "__main__": From 2b478c5741e70c217c3c8b74a32199ab89dc58b1 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Wed, 5 Apr 2023 10:36:42 -0700 Subject: [PATCH 079/104] Name consistency Signed-off-by: Cuong Nguyen --- release/ray_release/scripts/run_release_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/release/ray_release/scripts/run_release_test.py b/release/ray_release/scripts/run_release_test.py index c7de484b382a..d2d1c836e565 100644 --- a/release/ray_release/scripts/run_release_test.py +++ b/release/ray_release/scripts/run_release_test.py @@ -168,7 +168,7 @@ def main( f"Release test pipeline for test {test['name']} completed. " f"Returning with exit code = {result.return_code}" ) - sys.exit(result.buildkite_return_code) + sys.exit(result.buildkite_exit_code) if __name__ == "__main__": From ee54250850244804bc42267d79bcdd0b53d4bff5 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Thu, 13 Apr 2023 11:11:10 -0700 Subject: [PATCH 080/104] Move retry logic to sh file Signed-off-by: Cuong Nguyen --- release/ray_release/buildkite/step.py | 12 +----------- release/ray_release/scripts/run_release_test.py | 2 +- 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/release/ray_release/buildkite/step.py b/release/ray_release/buildkite/step.py index 9fcae90beebd..3078be809167 100644 --- a/release/ray_release/buildkite/step.py +++ b/release/ray_release/buildkite/step.py @@ -15,7 +15,7 @@ from ray_release.env import DEFAULT_ENVIRONMENT, load_environment from ray_release.template import get_test_env_var from ray_release.util import python_version_str, DeferredEnvVar -from ray_release.result import BuildkiteExitCode +from ray_release.result import ExitCode DEFAULT_ARTIFACTS_DIR_HOST = "/tmp/ray_release_test_artifacts" @@ -122,16 +122,6 @@ def get_step( if test.get("run", {}).get("type") == "client": step["agents"]["queue"] = str(RELEASE_QUEUE_CLIENT) - # Auto-retry on transient infra error (according to result.BuildkiteExitCode) - step["retry"] = { - "automatic": [ - { - "exit_status": BuildkiteExitCode.TRANSIENT_INFRA_ERROR, - "limit": 2, - } - ] - } - # If a test is not stable, allow to soft fail stable = test.get("stable", True) if not stable: diff --git a/release/ray_release/scripts/run_release_test.py b/release/ray_release/scripts/run_release_test.py index d2d1c836e565..52bfea17c6b9 100644 --- a/release/ray_release/scripts/run_release_test.py +++ b/release/ray_release/scripts/run_release_test.py @@ -168,7 +168,7 @@ def main( f"Release test pipeline for test {test['name']} completed. 
" f"Returning with exit code = {result.return_code}" ) - sys.exit(result.buildkite_exit_code) + sys.exit(result.return_code) if __name__ == "__main__": From 70954cca7e338344169a8acf7cea38a4c3ddb73f Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Thu, 13 Apr 2023 12:57:40 -0700 Subject: [PATCH 081/104] More refactoring Signed-off-by: Cuong Nguyen --- release/ray_release/scripts/run_release_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/release/ray_release/scripts/run_release_test.py b/release/ray_release/scripts/run_release_test.py index 52bfea17c6b9..449dee26557d 100644 --- a/release/ray_release/scripts/run_release_test.py +++ b/release/ray_release/scripts/run_release_test.py @@ -166,9 +166,9 @@ def main( return_code = e.exit_code.value logger.info( f"Release test pipeline for test {test['name']} completed. " - f"Returning with exit code = {result.return_code}" + f"Returning with exit code = {return_code}" ) - sys.exit(result.return_code) + sys.exit(return_code) if __name__ == "__main__": From c07d123208c74b51bf472d46ca1da84d57c7a7cc Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Thu, 13 Apr 2023 13:04:03 -0700 Subject: [PATCH 082/104] Undo more changes Signed-off-by: Cuong Nguyen --- release/ray_release/glue.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/release/ray_release/glue.py b/release/ray_release/glue.py index 50494aeb5cf0..188b1bab7354 100644 --- a/release/ray_release/glue.py +++ b/release/ray_release/glue.py @@ -571,4 +571,7 @@ def run_release_test( if pipeline_exception: raise pipeline_exception + if pipeline_exception: + raise pipeline_exception + return result From bdbfb318b4f78cec5ffb497d5f899f1b82ca552f Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Tue, 4 Apr 2023 10:59:29 -0700 Subject: [PATCH 083/104] Exit buildkite job using buildkite return code Signed-off-by: Cuong Nguyen --- release/ray_release/result.py | 1 + 1 file changed, 1 insertion(+) diff --git a/release/ray_release/result.py b/release/ray_release/result.py index b0cb7148c000..245322c962ca 100644 --- a/release/ray_release/result.py +++ b/release/ray_release/result.py @@ -1,5 +1,6 @@ import enum import os +import os from dataclasses import dataclass from typing import Optional, Dict, Tuple From d4904dc4caea67958c47e4fea4482046080e93af Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Sun, 9 Apr 2023 12:27:49 -0700 Subject: [PATCH 084/104] Log aggegration Signed-off-by: Cuong Nguyen --- release/ray_release/anyscale_util.py | 2 +- release/ray_release/reporter/db.py | 56 ++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 1 deletion(-) diff --git a/release/ray_release/anyscale_util.py b/release/ray_release/anyscale_util.py index 6552dae281da..6ef84cbea060 100644 --- a/release/ray_release/anyscale_util.py +++ b/release/ray_release/anyscale_util.py @@ -7,7 +7,7 @@ from anyscale.sdk.anyscale_client.sdk import AnyscaleSDK -LAST_LOGS_LENGTH = 10 +LAST_LOGS_LENGTH = 30 def find_cloud_by_name( diff --git a/release/ray_release/reporter/db.py b/release/ray_release/reporter/db.py index e9295140f921..e75231450971 100644 --- a/release/ray_release/reporter/db.py +++ b/release/ray_release/reporter/db.py @@ -1,6 +1,7 @@ import time import json import boto3 +from typing import Optional, List from botocore.config import Config from ray_release.reporter.reporter import Reporter @@ -13,6 +14,61 @@ class DBReporter(Reporter): def __init__(self): self.firehose = boto3.client("firehose", config=Config(region_name="us-west-2")) + def compute_stack_pattern(self, result: Result) 
-> Optional[str]: + stack_trace = self.compute_stack_trace(result) + return self.compute_unique_pattern(stack_trace) + + def compute_unique_pattern(stack_trace: List(str)) -> Optional[str]: + return None + + def compute_stack_trace(self, result: Result) -> List(str): + """ + Extract stack trace pattern from the logs. Stack trace pattern often matches + the following: + ERROR + Traceback (most recent call last): + File "...", line ..., in ... + ... + Exception: exception error + """ + error_stacktrace = [] + stacktrace = [] + logs = result.last_logs.split("\n") + i = 0 + while i < len(logs): + stack = [] + trace = error_stacktrace + if 'ERROR' in logs[i]: + stack.append(logs[i]) + next = i + 1 + if i+1 < len(logs) and logs[i+1].startswith('Traceback'): + stack.append(logs[i+1]) + next = i + 2 + elif logs[i].startswith('Traceback'): + stack.append(logs[i]) + trace = stacktrace + next = i + 1 + else: + i = i + 1 + continue + while next < len(logs): + if logs[next].startswith((' ', '\t')): + stack.append(logs[next]) + next = next + 1 + if next < len(logs): + stack.append(logs[next]) + if stack: + trace.append(stack) + i = next + 1 + + if not error_stacktrace: + return error_stacktrace[-1] + + if not stacktrace: + return stacktrace[-1] + + return [] + def report_result(self, test: Test, result: Result): logger.info("Persisting result to the databricks delta lake...") From 927b90d9b7b56b781dcd5fdc7c7b02a4f8ec2262 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Sun, 9 Apr 2023 13:34:57 -0700 Subject: [PATCH 085/104] Compute unique crash pattern and store to databrick Signed-off-by: Cuong Nguyen --- release/ray_release/reporter/db.py | 37 +++++++++---- release/ray_release/tests/test_db_reporter.py | 54 +++++++++++++++++++ 2 files changed, 81 insertions(+), 10 deletions(-) create mode 100644 release/ray_release/tests/test_db_reporter.py diff --git a/release/ray_release/reporter/db.py b/release/ray_release/reporter/db.py index e75231450971..5b96ce6591ca 100644 --- a/release/ray_release/reporter/db.py +++ b/release/ray_release/reporter/db.py @@ -1,4 +1,5 @@ import time +import re import json import boto3 from typing import Optional, List @@ -14,18 +15,32 @@ class DBReporter(Reporter): def __init__(self): self.firehose = boto3.client("firehose", config=Config(region_name="us-west-2")) - def compute_stack_pattern(self, result: Result) -> Optional[str]: - stack_trace = self.compute_stack_trace(result) - return self.compute_unique_pattern(stack_trace) + def compute_crash_pattern(self, logs: str) -> str: + stack_trace = self._compute_stack_trace(logs.splitlines()) + return self._compute_unique_pattern(stack_trace) - def compute_unique_pattern(stack_trace: List(str)) -> Optional[str]: - return None + def _compute_unique_pattern(self, stack_trace: List[str]) -> str: + """ + Compute unique pattern from stack trace, by remove factors such as date, time, + temp directory, line numbers, etc. This help to aggregate similar logs into + same bug patterns + """ + massaged_trace = [] + for line in stack_trace: + line = re.sub(r'\d', '', line.strip()) + if line == 'Traceback (most recent call last):': + continue + file_line = re.search(r'File "(.*)", (.*)', line) + if file_line: + line = f'{file_line.group(1).split("/")[-1]}{file_line.group(2)}' + massaged_trace.append(line) + return ''.join(massaged_trace) - def compute_stack_trace(self, result: Result) -> List(str): + def _compute_stack_trace(self, logs: List[str]) -> List[str]: """ Extract stack trace pattern from the logs. 
Stack trace pattern often matches the following: - ERROR + ERROR ... Traceback (most recent call last): File "...", line ..., in ... ... @@ -33,7 +48,6 @@ def compute_stack_trace(self, result: Result) -> List(str): """ error_stacktrace = [] stacktrace = [] - logs = result.last_logs.split("\n") i = 0 while i < len(logs): stack = [] @@ -55,16 +69,18 @@ def compute_stack_trace(self, result: Result) -> List(str): if logs[next].startswith((' ', '\t')): stack.append(logs[next]) next = next + 1 + else: + break if next < len(logs): stack.append(logs[next]) if stack: trace.append(stack) i = next + 1 - if not error_stacktrace: + if error_stacktrace: return error_stacktrace[-1] - if not stacktrace: + if stacktrace: return stacktrace[-1] return [] @@ -96,6 +112,7 @@ def report_result(self, test: Test, result: Result): "return_code": result.return_code, "smoke_test": result.smoke_test, "extra_tags": result.extra_tags or {}, + "crash_pattern": self.compute_crash_pattern(result.last_logs or "") } logger.debug(f"Result json: {json.dumps(result_json)}") diff --git a/release/ray_release/tests/test_db_reporter.py b/release/ray_release/tests/test_db_reporter.py new file mode 100644 index 000000000000..35012cb24b6f --- /dev/null +++ b/release/ray_release/tests/test_db_reporter.py @@ -0,0 +1,54 @@ +from ray_release.reporter.db import DBReporter + +def test_compute_stack_pattern(): + assert (DBReporter()).compute_crash_pattern( +""" +haha +Traceback (most recent call last): + File "/tmp/something", line 584 +Exception: yaya45 +hehe +""" + ) == 'somethingline Exception: yaya' + +def test_compute_unique_pattern(): + assert (DBReporter())._compute_unique_pattern( + [ + 'Traceback (most recent call last):', + ' File "/tmp/something", line 584', + 'Exception: yaya45', + ] + ) == 'somethingline Exception: yaya' + +def test_compute_stack_trace(): + trace = [ + 'Traceback (most recent call last):', + ' File "/tmp/something", line 584, in run_release_test', + ' raise pipeline_exception', + 'ray_release.exception.JobNoLogsError: Could not obtain logs for the job.', + ] + error_trace = [ + '[2023-01-01] ERROR: something is wrong' + 'Traceback (most recent call last):', + ' File "/tmp/something", line 584, in run_release_test', + ' raise pipeline_exception', + 'ray_release.exception.JobStartupTimeout: Cluster did not start.', + ] + error_trace_short = [ + '[2023-01-01] ERROR: something is wrong' + ' File "/tmp/something", line 584, in run_release_test', + ' raise pipeline_exception', + 'ray_release.exception.JobStartupTimeout: Cluster did not start.', + ] + assert (DBReporter())._compute_stack_trace( + ['haha'] + trace + ['hehe'] + ) == trace + assert (DBReporter())._compute_stack_trace( + ['haha'] + error_trace + ['hehe'] + ) == error_trace + assert (DBReporter())._compute_stack_trace( + ['haha'] + error_trace_short + ['hehe'] + ) == error_trace_short + assert (DBReporter())._compute_stack_trace( + ['haha'] + trace + ['w00t'] + error_trace + ['hehe'] + ) == error_trace \ No newline at end of file From f06b8e53c83ff44f2bc83be194771f849a320593 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Sun, 9 Apr 2023 13:38:31 -0700 Subject: [PATCH 086/104] Rebase Signed-off-by: Cuong Nguyen --- release/ray_release/reporter/db.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/release/ray_release/reporter/db.py b/release/ray_release/reporter/db.py index 5b96ce6591ca..70fe99a2aef2 100644 --- a/release/ray_release/reporter/db.py +++ b/release/ray_release/reporter/db.py @@ -10,6 +10,7 @@ from ray_release.config import 
Test from ray_release.logger import logger +CRASH_PATTERN_MAX_LENGTH = 4000 class DBReporter(Reporter): def __init__(self): @@ -17,7 +18,7 @@ def __init__(self): def compute_crash_pattern(self, logs: str) -> str: stack_trace = self._compute_stack_trace(logs.splitlines()) - return self._compute_unique_pattern(stack_trace) + return self._compute_unique_pattern(stack_trace)[:CRASH_PATTERN_MAX_LENGTH] def _compute_unique_pattern(self, stack_trace: List[str]) -> str: """ From c34753848b38e3c31315bc52f2a45f7c73212bb2 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Sun, 9 Apr 2023 16:22:22 -0700 Subject: [PATCH 087/104] Test Signed-off-by: Cuong Nguyen --- release/ray_release/glue.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/release/ray_release/glue.py b/release/ray_release/glue.py index 188b1bab7354..c89ba2c6e80a 100644 --- a/release/ray_release/glue.py +++ b/release/ray_release/glue.py @@ -38,6 +38,7 @@ from ray_release.file_manager.session_controller import SessionControllerFileManager from ray_release.logger import logger from ray_release.reporter.reporter import Reporter +from ray_release.reporter.db import DBReporter from ray_release.result import Result, handle_exception from ray_release.signal_handling import ( setup_signal_handling, @@ -560,6 +561,9 @@ def run_release_test( if not result.last_logs: result.last_logs = traceback.format_exc() + logger.info( + f'Crash pattern: {(DBReporter().compute_crash_pattern(result.last_logs))}') + buildkite_group(":memo: Reporting results", open=True) reporters = reporters or [] for reporter in reporters: From 2516036491f30797551606f4f51c525eb556420d Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Sun, 9 Apr 2023 19:51:42 -0700 Subject: [PATCH 088/104] Rebase Signed-off-by: Cuong Nguyen --- release/ray_release/glue.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/release/ray_release/glue.py b/release/ray_release/glue.py index c89ba2c6e80a..b039c74a8986 100644 --- a/release/ray_release/glue.py +++ b/release/ray_release/glue.py @@ -561,9 +561,6 @@ def run_release_test( if not result.last_logs: result.last_logs = traceback.format_exc() - logger.info( - f'Crash pattern: {(DBReporter().compute_crash_pattern(result.last_logs))}') - buildkite_group(":memo: Reporting results", open=True) reporters = reporters or [] for reporter in reporters: From a2a08ebb84c4288c5b6b89674046edba7b369d23 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Sun, 9 Apr 2023 20:53:00 -0700 Subject: [PATCH 089/104] Remove debugging info Signed-off-by: Cuong Nguyen --- release/ray_release/glue.py | 1 - 1 file changed, 1 deletion(-) diff --git a/release/ray_release/glue.py b/release/ray_release/glue.py index b039c74a8986..188b1bab7354 100644 --- a/release/ray_release/glue.py +++ b/release/ray_release/glue.py @@ -38,7 +38,6 @@ from ray_release.file_manager.session_controller import SessionControllerFileManager from ray_release.logger import logger from ray_release.reporter.reporter import Reporter -from ray_release.reporter.db import DBReporter from ray_release.result import Result, handle_exception from ray_release.signal_handling import ( setup_signal_handling, From ccd15307d615c46da3d948fe8e5deba08356b636 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Mon, 10 Apr 2023 10:04:56 -0700 Subject: [PATCH 090/104] Lint Signed-off-by: Cuong Nguyen --- release/ray_release/reporter/db.py | 25 +++++----- release/ray_release/tests/test_db_reporter.py | 49 ++++++++++--------- 2 files changed, 39 insertions(+), 35 deletions(-) diff --git a/release/ray_release/reporter/db.py 
b/release/ray_release/reporter/db.py index 70fe99a2aef2..67b78758bbaf 100644 --- a/release/ray_release/reporter/db.py +++ b/release/ray_release/reporter/db.py @@ -2,7 +2,7 @@ import re import json import boto3 -from typing import Optional, List +from typing import List from botocore.config import Config from ray_release.reporter.reporter import Reporter @@ -12,6 +12,7 @@ CRASH_PATTERN_MAX_LENGTH = 4000 + class DBReporter(Reporter): def __init__(self): self.firehose = boto3.client("firehose", config=Config(region_name="us-west-2")) @@ -23,23 +24,23 @@ def compute_crash_pattern(self, logs: str) -> str: def _compute_unique_pattern(self, stack_trace: List[str]) -> str: """ Compute unique pattern from stack trace, by remove factors such as date, time, - temp directory, line numbers, etc. This help to aggregate similar logs into + temp directory, line numbers, etc. This help to aggregate similar logs into same bug patterns """ massaged_trace = [] for line in stack_trace: - line = re.sub(r'\d', '', line.strip()) - if line == 'Traceback (most recent call last):': + line = re.sub(r"\d", "", line.strip()) + if line == "Traceback (most recent call last):": continue file_line = re.search(r'File "(.*)", (.*)', line) if file_line: line = f'{file_line.group(1).split("/")[-1]}{file_line.group(2)}' massaged_trace.append(line) - return ''.join(massaged_trace) + return "".join(massaged_trace) def _compute_stack_trace(self, logs: List[str]) -> List[str]: """ - Extract stack trace pattern from the logs. Stack trace pattern often matches + Extract stack trace pattern from the logs. Stack trace pattern often matches the following: ERROR ... Traceback (most recent call last): @@ -53,13 +54,13 @@ def _compute_stack_trace(self, logs: List[str]) -> List[str]: while i < len(logs): stack = [] trace = error_stacktrace - if 'ERROR' in logs[i]: + if "ERROR" in logs[i]: stack.append(logs[i]) next = i + 1 - if i+1 < len(logs) and logs[i+1].startswith('Traceback'): - stack.append(logs[i+1]) + if i + 1 < len(logs) and logs[i + 1].startswith("Traceback"): + stack.append(logs[i + 1]) next = i + 2 - elif logs[i].startswith('Traceback'): + elif logs[i].startswith("Traceback"): stack.append(logs[i]) trace = stacktrace next = i + 1 @@ -67,7 +68,7 @@ def _compute_stack_trace(self, logs: List[str]) -> List[str]: i = i + 1 continue while next < len(logs): - if logs[next].startswith((' ', '\t')): + if logs[next].startswith((" ", "\t")): stack.append(logs[next]) next = next + 1 else: @@ -113,7 +114,7 @@ def report_result(self, test: Test, result: Result): "return_code": result.return_code, "smoke_test": result.smoke_test, "extra_tags": result.extra_tags or {}, - "crash_pattern": self.compute_crash_pattern(result.last_logs or "") + "crash_pattern": self.compute_crash_pattern(result.last_logs or ""), } logger.debug(f"Result json: {json.dumps(result_json)}") diff --git a/release/ray_release/tests/test_db_reporter.py b/release/ray_release/tests/test_db_reporter.py index 35012cb24b6f..be7482ac5b88 100644 --- a/release/ray_release/tests/test_db_reporter.py +++ b/release/ray_release/tests/test_db_reporter.py @@ -1,54 +1,57 @@ from ray_release.reporter.db import DBReporter + def test_compute_stack_pattern(): - assert (DBReporter()).compute_crash_pattern( -""" + assert ( + (DBReporter()).compute_crash_pattern( + """ haha Traceback (most recent call last): File "/tmp/something", line 584 Exception: yaya45 hehe """ - ) == 'somethingline Exception: yaya' + ) + == "somethingline Exception: yaya" + ) + def test_compute_unique_pattern(): assert 
(DBReporter())._compute_unique_pattern( [ - 'Traceback (most recent call last):', + "Traceback (most recent call last):", ' File "/tmp/something", line 584', - 'Exception: yaya45', + "Exception: yaya45", ] - ) == 'somethingline Exception: yaya' + ) == "somethingline Exception: yaya" + def test_compute_stack_trace(): trace = [ - 'Traceback (most recent call last):', + "Traceback (most recent call last):", ' File "/tmp/something", line 584, in run_release_test', - ' raise pipeline_exception', - 'ray_release.exception.JobNoLogsError: Could not obtain logs for the job.', + " raise pipeline_exception", + "ray_release.exception.JobNoLogsError: Could not obtain logs for the job.", ] error_trace = [ - '[2023-01-01] ERROR: something is wrong' - 'Traceback (most recent call last):', + "[2023-01-01] ERROR: something is wrong" "Traceback (most recent call last):", ' File "/tmp/something", line 584, in run_release_test', - ' raise pipeline_exception', - 'ray_release.exception.JobStartupTimeout: Cluster did not start.', + " raise pipeline_exception", + "ray_release.exception.JobStartupTimeout: Cluster did not start.", ] error_trace_short = [ - '[2023-01-01] ERROR: something is wrong' + "[2023-01-01] ERROR: something is wrong" ' File "/tmp/something", line 584, in run_release_test', - ' raise pipeline_exception', - 'ray_release.exception.JobStartupTimeout: Cluster did not start.', + " raise pipeline_exception", + "ray_release.exception.JobStartupTimeout: Cluster did not start.", ] + assert (DBReporter())._compute_stack_trace(["haha"] + trace + ["hehe"]) == trace assert (DBReporter())._compute_stack_trace( - ['haha'] + trace + ['hehe'] - ) == trace - assert (DBReporter())._compute_stack_trace( - ['haha'] + error_trace + ['hehe'] + ["haha"] + error_trace + ["hehe"] ) == error_trace assert (DBReporter())._compute_stack_trace( - ['haha'] + error_trace_short + ['hehe'] + ["haha"] + error_trace_short + ["hehe"] ) == error_trace_short assert (DBReporter())._compute_stack_trace( - ['haha'] + trace + ['w00t'] + error_trace + ['hehe'] - ) == error_trace \ No newline at end of file + ["haha"] + trace + ["w00t"] + error_trace + ["hehe"] + ) == error_trace From dc18cc30d2546554ba5873d1f2b1ed3c35a1fbdf Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Mon, 10 Apr 2023 16:59:35 -0700 Subject: [PATCH 091/104] @aslonnie's comments Signed-off-by: Cuong Nguyen --- release/ray_release/reporter/db.py | 24 +++++++++++++++---- release/ray_release/tests/test_db_reporter.py | 4 ++-- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/release/ray_release/reporter/db.py b/release/ray_release/reporter/db.py index 67b78758bbaf..9b8a42e66b1f 100644 --- a/release/ray_release/reporter/db.py +++ b/release/ray_release/reporter/db.py @@ -19,13 +19,13 @@ def __init__(self): def compute_crash_pattern(self, logs: str) -> str: stack_trace = self._compute_stack_trace(logs.splitlines()) - return self._compute_unique_pattern(stack_trace)[:CRASH_PATTERN_MAX_LENGTH] + return self._compute_signature(stack_trace)[:CRASH_PATTERN_MAX_LENGTH] - def _compute_unique_pattern(self, stack_trace: List[str]) -> str: + def _compute_signature(self, stack_trace: List[str]) -> str: """ - Compute unique pattern from stack trace, by remove factors such as date, time, - temp directory, line numbers, etc. This help to aggregate similar logs into - same bug patterns + Compute signature pattern from stack trace, by remove factors such as date, + time, temp directory, line numbers, etc. 
This help to aggregate similar logs + into same bug patterns """ massaged_trace = [] for line in stack_trace: @@ -54,34 +54,48 @@ def _compute_stack_trace(self, logs: List[str]) -> List[str]: while i < len(logs): stack = [] trace = error_stacktrace + # Search for lines that are either + # ... ERROR ... + # or + # ... ERROR ... + # Traceback (most recent call last): if "ERROR" in logs[i]: stack.append(logs[i]) next = i + 1 if i + 1 < len(logs) and logs[i + 1].startswith("Traceback"): stack.append(logs[i + 1]) next = i + 2 + # Or if the line with ERROR does not exist, just search for the line with + # Traceback (most recent call last): elif logs[i].startswith("Traceback"): stack.append(logs[i]) trace = stacktrace next = i + 1 + # Or else, skip this line and continue else: i = i + 1 continue + # If the line that contains ERROR, Traceback, etc. is found, scan the logs + # until the line no longer has indentation. This is because stack trace + # is always indented, and stops when the line is no longer indented while next < len(logs): if logs[next].startswith((" ", "\t")): stack.append(logs[next]) next = next + 1 else: break + # Finished capturing the entire stack trace if next < len(logs): stack.append(logs[next]) if stack: trace.append(stack) i = next + 1 + # Favor stack trace that contains the ERROR keyword if error_stacktrace: return error_stacktrace[-1] + # Otherwise any stack trace is fine if stacktrace: return stacktrace[-1] diff --git a/release/ray_release/tests/test_db_reporter.py b/release/ray_release/tests/test_db_reporter.py index be7482ac5b88..92871dffd014 100644 --- a/release/ray_release/tests/test_db_reporter.py +++ b/release/ray_release/tests/test_db_reporter.py @@ -16,8 +16,8 @@ def test_compute_stack_pattern(): ) -def test_compute_unique_pattern(): - assert (DBReporter())._compute_unique_pattern( +def test_compute_signature(): + assert (DBReporter())._compute_signature( [ "Traceback (most recent call last):", ' File "/tmp/something", line 584', From 317705fd8bf8068ded218a10abdb882eda7e6504 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Sat, 15 Apr 2023 22:03:49 -0700 Subject: [PATCH 092/104] Rebase Signed-off-by: Cuong Nguyen --- release/ray_release/reporter/db.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/release/ray_release/reporter/db.py b/release/ray_release/reporter/db.py index 9b8a42e66b1f..88d817658a55 100644 --- a/release/ray_release/reporter/db.py +++ b/release/ray_release/reporter/db.py @@ -11,7 +11,7 @@ from ray_release.logger import logger CRASH_PATTERN_MAX_LENGTH = 4000 - +TRACEBACK_PATTERN = 'Traceback (most recent call last)' class DBReporter(Reporter): def __init__(self): @@ -62,12 +62,12 @@ def _compute_stack_trace(self, logs: List[str]) -> List[str]: if "ERROR" in logs[i]: stack.append(logs[i]) next = i + 1 - if i + 1 < len(logs) and logs[i + 1].startswith("Traceback"): + if i + 1 < len(logs) and TRACEBACK_PATTERN in logs[i + 1]: stack.append(logs[i + 1]) next = i + 2 # Or if the line with ERROR does not exist, just search for the line with # Traceback (most recent call last): - elif logs[i].startswith("Traceback"): + elif TRACEBACK_PATTERN in logs[i]: stack.append(logs[i]) trace = stacktrace next = i + 1 From bed4b1de92eab6ba477ada42469e43b00598a52c Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Mon, 17 Apr 2023 14:33:40 -0700 Subject: [PATCH 093/104] Rebase Signed-off-by: Cuong Nguyen --- release/ray_release/reporter/db.py | 7 ++++--- release/ray_release/result.py | 1 - 2 files changed, 4 insertions(+), 4 deletions(-) diff 
--git a/release/ray_release/reporter/db.py b/release/ray_release/reporter/db.py index 88d817658a55..2f54eee85564 100644 --- a/release/ray_release/reporter/db.py +++ b/release/ray_release/reporter/db.py @@ -11,7 +11,8 @@ from ray_release.logger import logger CRASH_PATTERN_MAX_LENGTH = 4000 -TRACEBACK_PATTERN = 'Traceback (most recent call last)' +TRACEBACK_PATTERN = "Traceback (most recent call last)" + class DBReporter(Reporter): def __init__(self): @@ -23,8 +24,8 @@ def compute_crash_pattern(self, logs: str) -> str: def _compute_signature(self, stack_trace: List[str]) -> str: """ - Compute signature pattern from stack trace, by remove factors such as date, - time, temp directory, line numbers, etc. This help to aggregate similar logs + Compute signature pattern from stack trace, by remove factors such as date, + time, temp directory, line numbers, etc. This help to aggregate similar logs into same bug patterns """ massaged_trace = [] diff --git a/release/ray_release/result.py b/release/ray_release/result.py index 245322c962ca..b0cb7148c000 100644 --- a/release/ray_release/result.py +++ b/release/ray_release/result.py @@ -1,6 +1,5 @@ import enum import os -import os from dataclasses import dataclass from typing import Optional, Dict, Tuple From 2e4d5b94c64ca048dad6e77019eaa2413704d2af Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Mon, 17 Apr 2023 15:34:18 -0700 Subject: [PATCH 094/104] Add comments for why we need to look across many ray logs for error patterns Signed-off-by: Cuong Nguyen --- release/ray_release/job_manager/anyscale_job_manager.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/release/ray_release/job_manager/anyscale_job_manager.py b/release/ray_release/job_manager/anyscale_job_manager.py index 1b605b412e84..5d6ae4d1bc7b 100644 --- a/release/ray_release/job_manager/anyscale_job_manager.py +++ b/release/ray_release/job_manager/anyscale_job_manager.py @@ -332,6 +332,9 @@ def _get_logs(): ) print("", flush=True) output = buf.getvalue().strip() + # Many of Ray components have their separated logs (e.g. dashboard, + # gcs_server, etc.), so the interesting errors are not always in the + # job logs. If the job has no logs, check other ray logs for error patterns. if "### Starting ###" not in output: output = self._get_ray_error_logs() assert output, "No logs fetched" From 8f95d5cbb6d6600d0baf1cac9421836c6655e3d8 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Tue, 18 Apr 2023 08:34:25 -0700 Subject: [PATCH 095/104] Rebase Signed-off-by: Cuong Nguyen --- release/ray_release/result.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/release/ray_release/result.py b/release/ray_release/result.py index b0cb7148c000..246697b67b6d 100644 --- a/release/ray_release/result.py +++ b/release/ray_release/result.py @@ -81,25 +81,6 @@ class ExitCode(enum.Enum): PREPARE_ERROR = 43 -def _is_transient_error(result_status: ResultStatus, runtime: int) -> bool: - """ - Classify whether an infra-failure issue is a transient issue. This is based on - the status of its previous retries, and its runtime. 
- """ - if result_status not in [ResultStatus.INFRA_ERROR, ResultStatus.INFRA_TIMEOUT]: - # Not even an infra failure - return False - retry_count = int(os.environ.get("BUILDKITE_RETRY_COUNT", 0)) - max_retry = int(os.environ.get("BUILDKITE_MAX_RETRIES", 1)) - if retry_count >= max_retry: - # Already reach retry limit - return False - if runtime > int(os.environ.get("BUILDKITE_TIME_LIMIT_FOR_RETRY", 0)): - # Take too long to run - return False - return True - - def _is_transient_error(result_status: ResultStatus, runtime: int) -> bool: """ Classify whether an infra-failure issue is a transient issue. This is based on From f5e2a5a4da2920b9f47d54410d325dace17bc9d0 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Tue, 18 Apr 2023 08:34:55 -0700 Subject: [PATCH 096/104] Rebase Signed-off-by: Cuong Nguyen --- release/ray_release/glue.py | 450 +++++++++++++++++++----------------- 1 file changed, 234 insertions(+), 216 deletions(-) diff --git a/release/ray_release/glue.py b/release/ray_release/glue.py index 188b1bab7354..46ef68f45ce5 100644 --- a/release/ray_release/glue.py +++ b/release/ray_release/glue.py @@ -6,9 +6,11 @@ from ray_release.alerts.handle import handle_result, require_result from ray_release.anyscale_util import get_cluster_name from ray_release.buildkite.output import buildkite_group, buildkite_open_last +from ray_release.cluster_manager.cluster_manager import ClusterManager from ray_release.cluster_manager.full import FullClusterManager from ray_release.cluster_manager.minimal import MinimalClusterManager from ray_release.command_runner.job_runner import JobRunner +from ray_release.command_runner.command_runner import CommandRunner from ray_release.command_runner.anyscale_job_runner import AnyscaleJobRunner from ray_release.command_runner.sdk_runner import SDKRunner from ray_release.config import ( @@ -90,43 +92,33 @@ def _get_extra_tags_from_env() -> dict: return {key.lower(): os.getenv(key, "") for key in env_vars} -def run_release_test( +def _load_test_configuration( test: Test, anyscale_project: str, result: Result, ray_wheels_url: str, - reporters: Optional[List[Reporter]] = None, smoke_test: bool = False, - cluster_id: Optional[str] = None, - cluster_env_id: Optional[str] = None, no_terminate: bool = False, -) -> Result: - buildkite_group(":spiral_note_pad: Loading test configuration") - +) -> Tuple[ClusterManager, CommandRunner, str]: validate_test(test) - logger.info(f"Test config: {test}") + # Populate result paramaters result.wheels_url = ray_wheels_url result.stable = test.get("stable", True) result.smoke_test = smoke_test - buildkite_url = os.getenv("BUILDKITE_BUILD_URL", "") buildkite_job_id = os.getenv("BUILDKITE_JOB_ID", "") - if buildkite_url: buildkite_url += "#" + buildkite_job_id - result.buildkite_url = buildkite_url result.buildkite_job_id = buildkite_job_id + # Setting up working directory working_dir = test["working_dir"] - - old_wd = os.getcwd() new_wd = os.path.join(RELEASE_PACKAGE_DIR, working_dir) os.chdir(new_wd) - start_time = time.monotonic() run_type = test["run"].get("type", DEFAULT_RUN_TYPE) # Workaround while Anyscale Jobs don't support leaving cluster alive @@ -161,7 +153,6 @@ def run_release_test( logger.info(f"Got command runner cls: {command_runner_cls}") logger.info(f"Got file manager cls: {file_manager_cls}") - # Extra tags to be set on resources on cloud provider's side extra_tags = _get_extra_tags_from_env() # We don't need other attributes as they can be derived from the name @@ -185,230 +176,267 @@ def run_release_test( except 
Exception as e: raise ReleaseTestSetupError(f"Error setting up release test: {e}") from e - pipeline_exception = None - # non critical for some tests. So separate it from the general one. - fetch_result_exception = None - try: - setup_signal_handling() - # Load configs - cluster_env = load_test_cluster_env(test, ray_wheels_url=ray_wheels_url) - cluster_compute = load_test_cluster_compute(test) - - if cluster_env_id: - try: - cluster_manager.cluster_env_id = cluster_env_id - cluster_manager.build_cluster_env() - cluster_manager.fetch_build_info() - logger.info( - "Using overridden cluster environment with ID " - f"{cluster_env_id} and build ID " - f"{cluster_manager.cluster_env_build_id}" - ) - except Exception as e: - raise ClusterEnvCreateError( - f"Could not get existing overridden cluster environment " - f"{cluster_env_id}: {e}" - ) from e - else: - cluster_manager.set_cluster_env(cluster_env) + return cluster_manager, command_runner, artifact_path - # Load some timeouts - build_timeout = int(test["run"].get("build_timeout", DEFAULT_BUILD_TIMEOUT)) - command_timeout = int(test["run"].get("timeout", DEFAULT_COMMAND_TIMEOUT)) - cluster_timeout = int( - test["run"].get("session_timeout", DEFAULT_CLUSTER_TIMEOUT) - ) - # Get prepare command timeout, if any - prepare_cmd = test["run"].get("prepare", None) - if prepare_cmd: - prepare_timeout = test["run"].get("prepare_timeout", command_timeout) - else: - prepare_timeout = 0 +def _setup_cluster_environment( + test: Test, + result: Result, + cluster_manager: ClusterManager, + ray_wheels_url: str, + cluster_env_id: Optional[str], +) -> Tuple[str, int, int, int, int]: + setup_signal_handling() + # Load configs + cluster_env = load_test_cluster_env(test, ray_wheels_url=ray_wheels_url) + cluster_compute = load_test_cluster_compute(test) + + if cluster_env_id: + try: + cluster_manager.cluster_env_id = cluster_env_id + cluster_manager.build_cluster_env() + cluster_manager.fetch_build_info() + logger.info( + "Using overridden cluster environment with ID " + f"{cluster_env_id} and build ID " + f"{cluster_manager.cluster_env_build_id}" + ) + except Exception as e: + raise ClusterEnvCreateError( + f"Could not get existing overridden cluster environment " + f"{cluster_env_id}: {e}" + ) from e + else: + cluster_manager.set_cluster_env(cluster_env) - # Base maximum uptime on the combined command and prepare timeouts - command_and_prepare_timeout = command_timeout + prepare_timeout + # Load some timeouts + build_timeout = int(test["run"].get("build_timeout", DEFAULT_BUILD_TIMEOUT)) + command_timeout = int(test["run"].get("timeout", DEFAULT_COMMAND_TIMEOUT)) + cluster_timeout = int(test["run"].get("session_timeout", DEFAULT_CLUSTER_TIMEOUT)) - # Use default timeout = 0 here if wait_for_nodes is empty. This is to make - # sure we don't inflate the maximum_uptime_minutes too much if we don't wait - # for nodes at all. - # The actual default will be otherwise loaded further down. 
- wait_timeout = int(test["run"].get("wait_for_nodes", {}).get("timeout", 0)) + # Get prepare command timeout, if any + prepare_cmd = test["run"].get("prepare", None) + if prepare_cmd: + prepare_timeout = test["run"].get("prepare_timeout", command_timeout) + else: + prepare_timeout = 0 - autosuspend_mins = test["cluster"].get("autosuspend_mins", None) - if autosuspend_mins: - cluster_manager.autosuspend_minutes = autosuspend_mins - autosuspend_base = autosuspend_mins - else: - cluster_manager.autosuspend_minutes = min( - DEFAULT_AUTOSUSPEND_MINS, - int(command_and_prepare_timeout / 60) + TIMEOUT_BUFFER_MINUTES, - ) - # Maximum uptime should be based on the command timeout, not the - # DEFAULT_AUTOSUSPEND_MINS - autosuspend_base = ( - int(command_and_prepare_timeout / 60) + TIMEOUT_BUFFER_MINUTES - ) + # Base maximum uptime on the combined command and prepare timeouts + command_and_prepare_timeout = command_timeout + prepare_timeout - maximum_uptime_minutes = test["cluster"].get("maximum_uptime_minutes", None) - if maximum_uptime_minutes: - cluster_manager.maximum_uptime_minutes = maximum_uptime_minutes - else: - cluster_manager.maximum_uptime_minutes = ( - autosuspend_base + wait_timeout + TIMEOUT_BUFFER_MINUTES - ) + # Use default timeout = 0 here if wait_for_nodes is empty. This is to make + # sure we don't inflate the maximum_uptime_minutes too much if we don't wait + # for nodes at all. + # The actual default will be otherwise loaded further down. + wait_timeout = int(test["run"].get("wait_for_nodes", {}).get("timeout", 0)) - # Set cluster compute here. Note that this may use timeouts provided - # above. - cluster_manager.set_cluster_compute( - cluster_compute, - extra_tags=extra_tags, + autosuspend_mins = test["cluster"].get("autosuspend_mins", None) + if autosuspend_mins: + cluster_manager.autosuspend_minutes = autosuspend_mins + autosuspend_base = autosuspend_mins + else: + cluster_manager.autosuspend_minutes = min( + DEFAULT_AUTOSUSPEND_MINS, + int(command_and_prepare_timeout / 60) + TIMEOUT_BUFFER_MINUTES, + ) + # Maximum uptime should be based on the command timeout, not the + # DEFAULT_AUTOSUSPEND_MINS + autosuspend_base = ( + int(command_and_prepare_timeout / 60) + TIMEOUT_BUFFER_MINUTES ) - buildkite_group(":nut_and_bolt: Setting up local environment") - driver_setup_script = test.get("driver_setup", None) - if driver_setup_script: - try: - run_bash_script(driver_setup_script) - except Exception as e: - raise LocalEnvSetupError(f"Driver setup script failed: {e}") from e + maximum_uptime_minutes = test["cluster"].get("maximum_uptime_minutes", None) + if maximum_uptime_minutes: + cluster_manager.maximum_uptime_minutes = maximum_uptime_minutes + else: + cluster_manager.maximum_uptime_minutes = ( + autosuspend_base + wait_timeout + TIMEOUT_BUFFER_MINUTES + ) - # Install local dependencies - command_runner.prepare_local_env(ray_wheels_url) + # Set cluster compute here. Note that this may use timeouts provided + # above. 
+ cluster_manager.set_cluster_compute( + cluster_compute, + extra_tags=result.extra_tags, + ) - # Re-install anyscale package as local dependencies might have changed - # from local env setup - reinstall_anyscale_dependencies() + return prepare_cmd, prepare_timeout, build_timeout, cluster_timeout, command_timeout - # Print installed pip packages - buildkite_group(":bulb: Local environment information") - pip_packages = get_pip_packages() - pip_package_string = "\n".join(pip_packages) - logger.info(f"Installed python packages:\n{pip_package_string}") - if isinstance(cluster_manager, FullClusterManager): - if not no_terminate: - register_handler( - lambda sig, frame: cluster_manager.terminate_cluster(wait=True) - ) - - # Start cluster - if cluster_id: - buildkite_group(":rocket: Using existing cluster") - # Re-use existing cluster ID for development - cluster_manager.cluster_id = cluster_id - cluster_manager.cluster_name = get_cluster_name(cluster_id) - else: - buildkite_group(":gear: Building cluster environment") - - if cluster_env_id: - cluster_manager.cluster_env_id = cluster_env_id +def _setup_local_environment( + test: Test, + command_runner: CommandRunner, + ray_wheels_url: str, +) -> None: + driver_setup_script = test.get("driver_setup", None) + if driver_setup_script: + try: + run_bash_script(driver_setup_script) + except Exception as e: + raise LocalEnvSetupError(f"Driver setup script failed: {e}") from e - cluster_manager.build_configs(timeout=build_timeout) + # Install local dependencies + command_runner.prepare_local_env(ray_wheels_url) - if isinstance(cluster_manager, FullClusterManager): - buildkite_group(":rocket: Starting up cluster") - cluster_manager.start_cluster(timeout=cluster_timeout) - elif isinstance(command_runner, AnyscaleJobRunner): - command_runner.job_manager.cluster_startup_timeout = cluster_timeout + # Re-install anyscale package as local dependencies might have changed + # from local env setup + reinstall_anyscale_dependencies() - result.cluster_url = cluster_manager.get_cluster_url() - result.cluster_id = cluster_manager.cluster_id - # Upload files - buildkite_group(":wrench: Preparing remote environment") - command_runner.prepare_remote_env() +def _local_environment_information( + result: Result, + cluster_manager: ClusterManager, + command_runner: CommandRunner, + build_timeout: int, + cluster_timeout: int, + no_terminate: bool, + cluster_id: Optional[str], + cluster_env_id: Optional[str], +) -> None: + pip_packages = get_pip_packages() + pip_package_string = "\n".join(pip_packages) + logger.info(f"Installed python packages:\n{pip_package_string}") + + if isinstance(cluster_manager, FullClusterManager): + if not no_terminate: + register_handler( + lambda sig, frame: cluster_manager.terminate_cluster(wait=True) + ) - wait_for_nodes = test["run"].get("wait_for_nodes", None) + # Start cluster + if cluster_id: + buildkite_group(":rocket: Using existing cluster") + # Re-use existing cluster ID for development + cluster_manager.cluster_id = cluster_id + cluster_manager.cluster_name = get_cluster_name(cluster_id) + else: + buildkite_group(":gear: Building cluster environment") - if wait_for_nodes: - buildkite_group(":stopwatch: Waiting for nodes to come up") - # Overwrite wait_timeout from above to account for better default - wait_timeout = int( - wait_for_nodes.get("timeout", DEFAULT_WAIT_FOR_NODES_TIMEOUT) - ) - num_nodes = test["run"]["wait_for_nodes"]["num_nodes"] - command_runner.wait_for_nodes(num_nodes, wait_timeout) + if cluster_env_id: + 
cluster_manager.cluster_env_id = cluster_env_id - if prepare_cmd: - try: - command_runner.run_prepare_command(prepare_cmd, timeout=prepare_timeout) - except CommandError as e: - raise PrepareCommandError(e) - except CommandTimeout as e: - raise PrepareCommandTimeout(e) + cluster_manager.build_configs(timeout=build_timeout) - buildkite_group(":runner: Running test script") - command = test["run"]["script"] - command_env = {} + if isinstance(cluster_manager, FullClusterManager): + buildkite_group(":rocket: Starting up cluster") + cluster_manager.start_cluster(timeout=cluster_timeout) + elif isinstance(command_runner, AnyscaleJobRunner): + command_runner.job_manager.cluster_startup_timeout = cluster_timeout - if smoke_test: - command = f"{command} --smoke-test" - command_env["IS_SMOKE_TEST"] = "1" + result.cluster_url = cluster_manager.get_cluster_url() + result.cluster_id = cluster_manager.cluster_id - is_long_running = test["run"].get("long_running", False) - start_time_unix = time.time() +def _prepare_remote_environment( + test: Test, + command_runner: CommandRunner, + prepare_cmd: bool, + prepare_timeout: int, +) -> None: + command_runner.prepare_remote_env() + + wait_for_nodes = test["run"].get("wait_for_nodes", None) + + if wait_for_nodes: + buildkite_group(":stopwatch: Waiting for nodes to come up") + # Overwrite wait_timeout from above to account for better default + wait_timeout = int( + wait_for_nodes.get("timeout", DEFAULT_WAIT_FOR_NODES_TIMEOUT) + ) + num_nodes = test["run"]["wait_for_nodes"]["num_nodes"] + command_runner.wait_for_nodes(num_nodes, wait_timeout) + if prepare_cmd: try: - command_runner.run_command( - command, - env=command_env, - timeout=command_timeout, - raise_on_timeout=not is_long_running, - ) - except ( - TestCommandError, - PrepareCommandError, - TestCommandTimeout, - PrepareCommandTimeout, - ) as e: - raise e + command_runner.run_prepare_command(prepare_cmd, timeout=prepare_timeout) except CommandError as e: - raise TestCommandError(e) + raise PrepareCommandError(e) except CommandTimeout as e: - if not is_long_running: - # Only raise error if command is not long running - raise TestCommandTimeout(e) + raise PrepareCommandTimeout(e) - buildkite_group(":floppy_disk: Fetching results") - try: - command_results = command_runner.fetch_results() - except Exception as e: - logger.exception(f"Could not fetch results for test command: {e}") - command_results = {} - fetch_result_exception = e - if artifact_path: - try: - command_runner.fetch_artifact() - except Exception as e: - logger.error("Could not fetch artifact for test command") - logger.exception(e) +def _running_test_script( + test: Test, + smoke_test: bool, + command_runner: CommandRunner, + command_timeout: int, +) -> None: + command = test["run"]["script"] + command_env = {} - # Postprocess result: - if "last_update" in command_results: - command_results["last_update_diff"] = time.time() - command_results.get( - "last_update", 0.0 - ) + if smoke_test: + command = f"{command} --smoke-test" + command_env["IS_SMOKE_TEST"] = "1" + + is_long_running = test["run"].get("long_running", False) + + try: + command_runner.run_command( + command, + env=command_env, + timeout=command_timeout, + raise_on_timeout=not is_long_running, + ) + except ( + TestCommandError, + PrepareCommandError, + TestCommandTimeout, + PrepareCommandTimeout, + ) as e: + raise e + except CommandError as e: + raise TestCommandError(e) + except CommandTimeout as e: + if not is_long_running: + # Only raise error if command is not long running + 
raise TestCommandTimeout(e) + + +def _fetching_results( + result: Result, + command_runner: CommandRunner, + artifact_path: Optional[str], + smoke_test: bool, + start_time_unix: int, +) -> Tuple[dict, Exception]: + fetch_result_exception = None + try: + command_results = command_runner.fetch_results() + except Exception as e: + logger.exception(f"Could not fetch results for test command: {e}") + command_results = {} + fetch_result_exception = e + if artifact_path: try: - # Logic duplicated in ray_release/command_runner/_anyscale_job_wrapper.py - # Timeout is the time the test took divided by 200 - # (~7 minutes for a 24h test) but no less than 30s - # and no more than 900s - metrics_timeout = max(30, min((time.time() - start_time_unix) / 200, 900)) - command_runner.save_metrics(start_time_unix, timeout=metrics_timeout) - metrics = command_runner.fetch_metrics() + command_runner.fetch_artifact() except Exception as e: - logger.exception(f"Could not fetch metrics for test command: {e}") - metrics = {} + logger.error("Could not fetch artifact for test command") + logger.exception(e) + + # Postprocess result: + if "last_update" in command_results: + command_results["last_update_diff"] = time.time() - command_results.get( + "last_update", 0.0 + ) + + try: + # Logic duplicated in ray_release/command_runner/_anyscale_job_wrapper.py + # Timeout is the time the test took divided by 200 + # (~7 minutes for a 24h test) but no less than 30s + # and no more than 900s + metrics_timeout = max(30, min((time.time() - start_time_unix) / 200, 900)) + command_runner.save_metrics(start_time_unix, timeout=metrics_timeout) + metrics = command_runner.fetch_metrics() + except Exception as e: + logger.exception(f"Could not fetch metrics for test command: {e}") + metrics = {} - if smoke_test: - command_results["smoke_test"] = True + if smoke_test: + command_results["smoke_test"] = True - result.results = command_results - result.status = "finished" + result.results = command_results + result.status = "finished" return metrics, fetch_result_exception @@ -516,10 +544,7 @@ def run_release_test( if not no_terminate and cluster_manager: buildkite_group(":earth_africa: Terminating cluster") - try: - cluster_manager.terminate_cluster(wait=False) - except Exception as e: - logger.exception(f"Could not terminate cluster: {e}") + cluster_manager.terminate_cluster(wait=False) if hasattr(command_runner, "cleanup"): command_runner.cleanup() @@ -561,15 +586,8 @@ def run_release_test( result.last_logs = traceback.format_exc() buildkite_group(":memo: Reporting results", open=True) - reporters = reporters or [] - for reporter in reporters: - try: - reporter.report_result(test, result) - except Exception as e: - logger.exception(f"Error reporting results via {type(reporter)}: {e}") - - if pipeline_exception: - raise pipeline_exception + for reporter in reporters or []: + reporter.report_result(test, result) if pipeline_exception: raise pipeline_exception From c11e4f601a46341aa9e57c1375a4c19b512b7226 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Tue, 18 Apr 2023 08:36:03 -0700 Subject: [PATCH 097/104] Rebase Signed-off-by: Cuong Nguyen --- release/ray_release/buildkite/step.py | 1 - release/ray_release/job_manager/anyscale_job_manager.py | 4 ++-- release/ray_release/result.py | 1 - 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/release/ray_release/buildkite/step.py b/release/ray_release/buildkite/step.py index 3078be809167..a13bde1575d8 100644 --- a/release/ray_release/buildkite/step.py +++ 
b/release/ray_release/buildkite/step.py @@ -15,7 +15,6 @@ from ray_release.env import DEFAULT_ENVIRONMENT, load_environment from ray_release.template import get_test_env_var from ray_release.util import python_version_str, DeferredEnvVar -from ray_release.result import ExitCode DEFAULT_ARTIFACTS_DIR_HOST = "/tmp/ray_release_test_artifacts" diff --git a/release/ray_release/job_manager/anyscale_job_manager.py b/release/ray_release/job_manager/anyscale_job_manager.py index 5d6ae4d1bc7b..0d02dbb8a784 100644 --- a/release/ray_release/job_manager/anyscale_job_manager.py +++ b/release/ray_release/job_manager/anyscale_job_manager.py @@ -332,8 +332,8 @@ def _get_logs(): ) print("", flush=True) output = buf.getvalue().strip() - # Many of Ray components have their separated logs (e.g. dashboard, - # gcs_server, etc.), so the interesting errors are not always in the + # Many of Ray components have their separated logs (e.g. dashboard, + # gcs_server, etc.), so the interesting errors are not always in the # job logs. If the job has no logs, check other ray logs for error patterns. if "### Starting ###" not in output: output = self._get_ray_error_logs() diff --git a/release/ray_release/result.py b/release/ray_release/result.py index 246697b67b6d..ed476cf0e734 100644 --- a/release/ray_release/result.py +++ b/release/ray_release/result.py @@ -31,7 +31,6 @@ class Result: stable: bool = True smoke_test: bool = False - buildkite_return_code: BuildkiteExitCode.SUCCESS buildkite_url: Optional[str] = None wheels_url: Optional[str] = None cluster_url: Optional[str] = None From e7076ea9adac77f81d1db2b88804fe3146415738 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Tue, 18 Apr 2023 15:09:11 -0700 Subject: [PATCH 098/104] @aslonnie's comments Signed-off-by: Cuong Nguyen --- release/ray_release/reporter/db.py | 6 ++++-- release/ray_release/tests/test_db_reporter.py | 19 ++++++++----------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/release/ray_release/reporter/db.py b/release/ray_release/reporter/db.py index 2f54eee85564..f2d395be8f76 100644 --- a/release/ray_release/reporter/db.py +++ b/release/ray_release/reporter/db.py @@ -10,7 +10,6 @@ from ray_release.config import Test from ray_release.logger import logger -CRASH_PATTERN_MAX_LENGTH = 4000 TRACEBACK_PATTERN = "Traceback (most recent call last)" @@ -20,7 +19,7 @@ def __init__(self): def compute_crash_pattern(self, logs: str) -> str: stack_trace = self._compute_stack_trace(logs.splitlines()) - return self._compute_signature(stack_trace)[:CRASH_PATTERN_MAX_LENGTH] + return self._compute_signature(stack_trace)[:4000] # limit of databrick field def _compute_signature(self, stack_trace: List[str]) -> str: """ @@ -35,6 +34,9 @@ def _compute_signature(self, stack_trace: List[str]) -> str: continue file_line = re.search(r'File "(.*)", (.*)', line) if file_line: + # append the file's base name and caller information; the result string + # is not something meaningful to human, we just need something that + # uniquely represent the stack trace line = f'{file_line.group(1).split("/")[-1]}{file_line.group(2)}' massaged_trace.append(line) return "".join(massaged_trace) diff --git a/release/ray_release/tests/test_db_reporter.py b/release/ray_release/tests/test_db_reporter.py index 92871dffd014..0f45011f79b4 100644 --- a/release/ray_release/tests/test_db_reporter.py +++ b/release/ray_release/tests/test_db_reporter.py @@ -2,18 +2,15 @@ def test_compute_stack_pattern(): - assert ( - (DBReporter()).compute_crash_pattern( - """ -haha -Traceback (most recent 
call last): - File "/tmp/something", line 584 -Exception: yaya45 -hehe -""" + assert (DBReporter()).compute_crash_pattern( + "\n".join( + "haha", + "Traceback (most recent call last):", + ' File "/tmp/something", line 584', + "Exception: yaya45", + "hehe", ) - == "somethingline Exception: yaya" - ) + ) == "somethingline Exception: yaya" def test_compute_signature(): From f5043cc59498b19af2f3c1448f581168bda7d213 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Tue, 18 Apr 2023 15:11:34 -0700 Subject: [PATCH 099/104] Rebase Signed-off-by: Cuong Nguyen --- release/ray_release/tests/test_db_reporter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/release/ray_release/tests/test_db_reporter.py b/release/ray_release/tests/test_db_reporter.py index 0f45011f79b4..9707f10e1122 100644 --- a/release/ray_release/tests/test_db_reporter.py +++ b/release/ray_release/tests/test_db_reporter.py @@ -3,13 +3,13 @@ def test_compute_stack_pattern(): assert (DBReporter()).compute_crash_pattern( - "\n".join( + "\n".join([ "haha", "Traceback (most recent call last):", ' File "/tmp/something", line 584', "Exception: yaya45", "hehe", - ) + ]) ) == "somethingline Exception: yaya" From 179d48ba738f2aab68695e14016d083d75bb4682 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Wed, 19 Apr 2023 10:52:02 -0700 Subject: [PATCH 100/104] Fix lints Signed-off-by: Cuong Nguyen --- release/ray_release/tests/test_db_reporter.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/release/ray_release/tests/test_db_reporter.py b/release/ray_release/tests/test_db_reporter.py index 9707f10e1122..9e0254864bc7 100644 --- a/release/ray_release/tests/test_db_reporter.py +++ b/release/ray_release/tests/test_db_reporter.py @@ -3,13 +3,15 @@ def test_compute_stack_pattern(): assert (DBReporter()).compute_crash_pattern( - "\n".join([ - "haha", - "Traceback (most recent call last):", - ' File "/tmp/something", line 584', - "Exception: yaya45", - "hehe", - ]) + "\n".join( + [ + "haha", + "Traceback (most recent call last):", + ' File "/tmp/something", line 584', + "Exception: yaya45", + "hehe", + ] + ) ) == "somethingline Exception: yaya" From 485a19951f5d4610c44f68919522acd5afc28f72 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Wed, 19 Apr 2023 10:54:00 -0700 Subject: [PATCH 101/104] Simply check that output is none Signed-off-by: Cuong Nguyen --- release/ray_release/job_manager/anyscale_job_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/release/ray_release/job_manager/anyscale_job_manager.py b/release/ray_release/job_manager/anyscale_job_manager.py index 0d02dbb8a784..5b831e4e7d3b 100644 --- a/release/ray_release/job_manager/anyscale_job_manager.py +++ b/release/ray_release/job_manager/anyscale_job_manager.py @@ -335,7 +335,7 @@ def _get_logs(): # Many of Ray components have their separated logs (e.g. dashboard, # gcs_server, etc.), so the interesting errors are not always in the # job logs. If the job has no logs, check other ray logs for error patterns. 
- if "### Starting ###" not in output: + if not output: output = self._get_ray_error_logs() assert output, "No logs fetched" return "\n".join(output.splitlines()[-LAST_LOGS_LENGTH * 3 :]) From 1c149922cc4f7fc48042b70c91f48da348792c24 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Fri, 21 Apr 2023 09:09:58 -0700 Subject: [PATCH 102/104] @krfricke's comments Signed-off-by: Cuong Nguyen --- release/ray_release/reporter/db.py | 96 +------------------ release/ray_release/tests/test_db_reporter.py | 56 ----------- .../ray_release/tests/test_log_aggregator.py | 68 +++++++++++++ 3 files changed, 72 insertions(+), 148 deletions(-) delete mode 100644 release/ray_release/tests/test_db_reporter.py create mode 100644 release/ray_release/tests/test_log_aggregator.py diff --git a/release/ray_release/reporter/db.py b/release/ray_release/reporter/db.py index f2d395be8f76..ec816739a306 100644 --- a/release/ray_release/reporter/db.py +++ b/release/ray_release/reporter/db.py @@ -1,109 +1,19 @@ import time -import re import json import boto3 -from typing import List from botocore.config import Config from ray_release.reporter.reporter import Reporter from ray_release.result import Result from ray_release.config import Test from ray_release.logger import logger - -TRACEBACK_PATTERN = "Traceback (most recent call last)" +from ray_release.log_aggregator import LogAggregator class DBReporter(Reporter): def __init__(self): self.firehose = boto3.client("firehose", config=Config(region_name="us-west-2")) - def compute_crash_pattern(self, logs: str) -> str: - stack_trace = self._compute_stack_trace(logs.splitlines()) - return self._compute_signature(stack_trace)[:4000] # limit of databrick field - - def _compute_signature(self, stack_trace: List[str]) -> str: - """ - Compute signature pattern from stack trace, by remove factors such as date, - time, temp directory, line numbers, etc. This help to aggregate similar logs - into same bug patterns - """ - massaged_trace = [] - for line in stack_trace: - line = re.sub(r"\d", "", line.strip()) - if line == "Traceback (most recent call last):": - continue - file_line = re.search(r'File "(.*)", (.*)', line) - if file_line: - # append the file's base name and caller information; the result string - # is not something meaningful to human, we just need something that - # uniquely represent the stack trace - line = f'{file_line.group(1).split("/")[-1]}{file_line.group(2)}' - massaged_trace.append(line) - return "".join(massaged_trace) - - def _compute_stack_trace(self, logs: List[str]) -> List[str]: - """ - Extract stack trace pattern from the logs. Stack trace pattern often matches - the following: - ERROR ... - Traceback (most recent call last): - File "...", line ..., in ... - ... - Exception: exception error - """ - error_stacktrace = [] - stacktrace = [] - i = 0 - while i < len(logs): - stack = [] - trace = error_stacktrace - # Search for lines that are either - # ... ERROR ... - # or - # ... ERROR ... - # Traceback (most recent call last): - if "ERROR" in logs[i]: - stack.append(logs[i]) - next = i + 1 - if i + 1 < len(logs) and TRACEBACK_PATTERN in logs[i + 1]: - stack.append(logs[i + 1]) - next = i + 2 - # Or if the line with ERROR does not exist, just search for the line with - # Traceback (most recent call last): - elif TRACEBACK_PATTERN in logs[i]: - stack.append(logs[i]) - trace = stacktrace - next = i + 1 - # Or else, skip this line and continue - else: - i = i + 1 - continue - # If the line that contains ERROR, Traceback, etc. 
is found, scan the logs - # until the line no longer has indentation. This is because stack trace - # is always indented, and stops when the line is no longer indented - while next < len(logs): - if logs[next].startswith((" ", "\t")): - stack.append(logs[next]) - next = next + 1 - else: - break - # Finished capturing the entire stack trace - if next < len(logs): - stack.append(logs[next]) - if stack: - trace.append(stack) - i = next + 1 - - # Favor stack trace that contains the ERROR keyword - if error_stacktrace: - return error_stacktrace[-1] - - # Otherwise any stack trace is fine - if stacktrace: - return stacktrace[-1] - - return [] - def report_result(self, test: Test, result: Result): logger.info("Persisting result to the databricks delta lake...") @@ -131,7 +41,9 @@ def report_result(self, test: Test, result: Result): "return_code": result.return_code, "smoke_test": result.smoke_test, "extra_tags": result.extra_tags or {}, - "crash_pattern": self.compute_crash_pattern(result.last_logs or ""), + "crash_pattern": LogAggregator( + result.last_logs or "" + ).compute_crash_pattern(), } logger.debug(f"Result json: {json.dumps(result_json)}") diff --git a/release/ray_release/tests/test_db_reporter.py b/release/ray_release/tests/test_db_reporter.py deleted file mode 100644 index 9e0254864bc7..000000000000 --- a/release/ray_release/tests/test_db_reporter.py +++ /dev/null @@ -1,56 +0,0 @@ -from ray_release.reporter.db import DBReporter - - -def test_compute_stack_pattern(): - assert (DBReporter()).compute_crash_pattern( - "\n".join( - [ - "haha", - "Traceback (most recent call last):", - ' File "/tmp/something", line 584', - "Exception: yaya45", - "hehe", - ] - ) - ) == "somethingline Exception: yaya" - - -def test_compute_signature(): - assert (DBReporter())._compute_signature( - [ - "Traceback (most recent call last):", - ' File "/tmp/something", line 584', - "Exception: yaya45", - ] - ) == "somethingline Exception: yaya" - - -def test_compute_stack_trace(): - trace = [ - "Traceback (most recent call last):", - ' File "/tmp/something", line 584, in run_release_test', - " raise pipeline_exception", - "ray_release.exception.JobNoLogsError: Could not obtain logs for the job.", - ] - error_trace = [ - "[2023-01-01] ERROR: something is wrong" "Traceback (most recent call last):", - ' File "/tmp/something", line 584, in run_release_test', - " raise pipeline_exception", - "ray_release.exception.JobStartupTimeout: Cluster did not start.", - ] - error_trace_short = [ - "[2023-01-01] ERROR: something is wrong" - ' File "/tmp/something", line 584, in run_release_test', - " raise pipeline_exception", - "ray_release.exception.JobStartupTimeout: Cluster did not start.", - ] - assert (DBReporter())._compute_stack_trace(["haha"] + trace + ["hehe"]) == trace - assert (DBReporter())._compute_stack_trace( - ["haha"] + error_trace + ["hehe"] - ) == error_trace - assert (DBReporter())._compute_stack_trace( - ["haha"] + error_trace_short + ["hehe"] - ) == error_trace_short - assert (DBReporter())._compute_stack_trace( - ["haha"] + trace + ["w00t"] + error_trace + ["hehe"] - ) == error_trace diff --git a/release/ray_release/tests/test_log_aggregator.py b/release/ray_release/tests/test_log_aggregator.py new file mode 100644 index 000000000000..6cc92e74f3b9 --- /dev/null +++ b/release/ray_release/tests/test_log_aggregator.py @@ -0,0 +1,68 @@ +from ray_release.log_aggregator import LogAggregator + + +def test_compute_stack_pattern(): + assert ( + LogAggregator( + "\n".join( + [ + "haha", + "Traceback (most recent 
call last):", + ' File "/tmp/something", line 584', + "Exception: yaya45", + "hehe", + ] + ) + ).compute_crash_pattern() + == "somethingline Exception: yaya" + ) + + +def test_compute_signature(): + assert ( + LogAggregator._compute_signature( + [ + "Traceback (most recent call last):", + ' File "/tmp/something", line 584', + "Exception: yaya45", + ] + ) + == "somethingline Exception: yaya" + ) + + +def test_compute_stack_trace(): + trace = [ + "Traceback (most recent call last):", + ' File "/tmp/something", line 584, in run_release_test', + " raise pipeline_exception", + "ray_release.exception.JobNoLogsError: Could not obtain logs for the job.", + ] + error_trace = [ + "[2023-01-01] ERROR: something is wrong", + "Traceback (most recent call last):", + ' File "/tmp/something", line 584, in run_release_test', + " raise pipeline_exception", + "ray_release.exception.JobStartupTimeout: Cluster did not start.", + ] + error_trace_short = [ + "[2023-01-01] ERROR: something is wrong" + ' File "/tmp/something", line 584, in run_release_test', + " raise pipeline_exception", + "ray_release.exception.JobStartupTimeout: Cluster did not start.", + ] + assert LogAggregator._compute_stack_trace(["haha"] + trace + ["hehe"]) == trace + assert ( + LogAggregator._compute_stack_trace(["haha"] + error_trace + ["hehe"]) + == error_trace + ) + assert ( + LogAggregator._compute_stack_trace(["haha"] + error_trace_short + ["hehe"]) + == error_trace_short + ) + assert ( + LogAggregator._compute_stack_trace( + ["haha"] + trace + ["w00t"] + error_trace + ["hehe"] + ) + == error_trace + ) From 25bda890e1dccdcd08c3dea3bfd579d294b35080 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Fri, 21 Apr 2023 09:11:43 -0700 Subject: [PATCH 103/104] Add new files Signed-off-by: Cuong Nguyen --- release/ray_release/log_aggregator.py | 99 +++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 release/ray_release/log_aggregator.py diff --git a/release/ray_release/log_aggregator.py b/release/ray_release/log_aggregator.py new file mode 100644 index 000000000000..455c133e73ae --- /dev/null +++ b/release/ray_release/log_aggregator.py @@ -0,0 +1,99 @@ +import re +from typing import List + +TRACEBACK_PATTERN = "Traceback (most recent call last)" + +class LogAggregator(): + def __init__(self, log: str): + self.log = log + + def compute_crash_pattern(self) -> str: + stack_trace = LogAggregator._compute_stack_trace(self.log.splitlines()) + # truncate short enough to store in databases, but long enough to keep the + # pattern unique + return LogAggregator._compute_signature(stack_trace)[:4000] + + @staticmethod + def _compute_signature(stack_trace: List[str]) -> str: + """ + Compute signature pattern from stack trace, by remove factors such as date, + time, temp directory, line numbers, etc. This help to aggregate similar logs + into same bug patterns + """ + massaged_trace = [] + for line in stack_trace: + line = re.sub(r"\d", "", line.strip()) + if line == "Traceback (most recent call last):": + continue + file_line = re.search(r'File "(.*)", (.*)', line) + if file_line: + # append the file's base name and caller information; the result string + # is not something meaningful to human, we just need something that + # uniquely represent the stack trace + line = f'{file_line.group(1).split("/")[-1]}{file_line.group(2)}' + massaged_trace.append(line) + return "".join(massaged_trace) + + @staticmethod + def _compute_stack_trace(logs: List[str]) -> List[str]: + """ + Extract stack trace pattern from the logs. 
Stack trace pattern often matches + the following: + ERROR ... + Traceback (most recent call last): + File "...", line ..., in ... + ... + Exception: exception error + """ + error_stacktrace = [] + stacktrace = [] + i = 0 + while i < len(logs): + stack = [] + trace = error_stacktrace + # Search for lines that are either + # ... ERROR ... + # or + # ... ERROR ... + # Traceback (most recent call last): + if "ERROR" in logs[i]: + stack.append(logs[i]) + next = i + 1 + if i + 1 < len(logs) and TRACEBACK_PATTERN in logs[i + 1]: + stack.append(logs[i + 1]) + next = i + 2 + # Or if the line with ERROR does not exist, just search for the line with + # Traceback (most recent call last): + elif TRACEBACK_PATTERN in logs[i]: + stack.append(logs[i]) + trace = stacktrace + next = i + 1 + # Or else, skip this line and continue + else: + i = i + 1 + continue + # If the line that contains ERROR, Traceback, etc. is found, scan the logs + # until the line no longer has indentation. This is because stack trace + # is always indented, and stops when the line is no longer indented + while next < len(logs): + if logs[next].startswith((" ", "\t")): + stack.append(logs[next]) + next = next + 1 + else: + break + # Finished capturing the entire stack trace + if next < len(logs): + stack.append(logs[next]) + if stack: + trace.append(stack) + i = next + 1 + + # Favor stack trace that contains the ERROR keyword + if error_stacktrace: + return error_stacktrace[-1] + + # Otherwise any stack trace is fine + if stacktrace: + return stacktrace[-1] + + return [] \ No newline at end of file From bdc8246098de914b1bf97d37c2874519b8752fec Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Fri, 21 Apr 2023 09:32:24 -0700 Subject: [PATCH 104/104] Fix lints Signed-off-by: Cuong Nguyen --- release/ray_release/log_aggregator.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/release/ray_release/log_aggregator.py b/release/ray_release/log_aggregator.py index 455c133e73ae..4617b63bfb8b 100644 --- a/release/ray_release/log_aggregator.py +++ b/release/ray_release/log_aggregator.py @@ -3,10 +3,11 @@ TRACEBACK_PATTERN = "Traceback (most recent call last)" -class LogAggregator(): + +class LogAggregator: def __init__(self, log: str): self.log = log - + def compute_crash_pattern(self) -> str: stack_trace = LogAggregator._compute_stack_trace(self.log.splitlines()) # truncate short enough to store in databases, but long enough to keep the @@ -96,4 +97,4 @@ def _compute_stack_trace(logs: List[str]) -> List[str]: if stacktrace: return stacktrace[-1] - return [] \ No newline at end of file + return []
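
Usage note (illustrative only, not part of the patch series above): a minimal sketch of how the LogAggregator introduced in PATCH 103/104 can be exercised, assuming it is imported from ray_release.log_aggregator as in the new test_log_aggregator.py. The sample log text and the small driver script below are made up for demonstration and are not taken from the patches.

    from ray_release.log_aggregator import LogAggregator

    # A fabricated release-test log containing a single Python traceback.
    sample_log = "\n".join(
        [
            "preparing cluster environment",
            "Traceback (most recent call last):",
            '  File "/tmp/release_test.py", line 42, in main',
            "    raise RuntimeError('cluster did not start')",
            "RuntimeError: cluster did not start",
        ]
    )

    # compute_crash_pattern() extracts the last traceback (favoring ones that
    # follow an ERROR line), strips digits and directory prefixes, and truncates
    # the result, so similar failures with different timestamps, line numbers,
    # or temp paths collapse into the same pattern string for aggregation.
    pattern = LogAggregator(sample_log).compute_crash_pattern()
    print(pattern)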