From e20dee177573393841a90dd699fbf6d431c57d61 Mon Sep 17 00:00:00 2001 From: Lonnie Liu Date: Fri, 26 Jan 2024 01:05:41 +0000 Subject: [PATCH] [core] fix out of disk test on release branch Signed-off-by: Lonnie Liu --- .buildkite/core.rayci.yml | 23 ++++++++++++++++++++-- ci/ray_ci/container.py | 11 +++++++++++ ci/ray_ci/test_container.py | 6 ++++++ ci/ray_ci/tester.py | 9 +++++++++ ci/ray_ci/tester_container.py | 2 ++ python/ray/tests/BUILD | 10 +++++++++- python/ray/tests/test_out_of_disk_space.py | 4 ++++ 7 files changed, 62 insertions(+), 3 deletions(-) diff --git a/.buildkite/core.rayci.yml b/.buildkite/core.rayci.yml index d87ce7c9b266..27a0d10fcd27 100644 --- a/.buildkite/core.rayci.yml +++ b/.buildkite/core.rayci.yml @@ -29,7 +29,7 @@ steps: commands: - bazel run //ci/ray_ci:test_in_docker -- //python/ray/tests/... //python/ray/dag/... python/ray/autoscaler/v2/... core --workers "$${BUILDKITE_PARALLEL_JOB_COUNT}" --worker-id "$${BUILDKITE_PARALLEL_JOB}" --parallelism-per-worker 3 - --except-tags debug_tests,asan_tests,post_wheel_build,xcommit,container,manual + --except-tags debug_tests,asan_tests,post_wheel_build,xcommit,tmpfs,container,manual - label: ":ray: core: redis tests" tags: python @@ -39,7 +39,26 @@ steps: - bazel run //ci/ray_ci:test_in_docker -- //python/ray/tests/... //python/ray/dag/... python/ray/autoscaler/v2/... core --workers "$${BUILDKITE_PARALLEL_JOB_COUNT}" --worker-id "$${BUILDKITE_PARALLEL_JOB}" --parallelism-per-worker 3 --test-env=TEST_EXTERNAL_REDIS=1 - --except-tags debug_tests,asan_tests,post_wheel_build,xcommit,container,manual + --except-tags debug_tests,asan_tests,post_wheel_build,xcommit,tmpfs,container,manual + + - label: ":ray: core: out of disk tests" + tags: + - python + - oss + instance_type: small + commands: + - bazel run //ci/ray_ci:test_in_docker -- //python/ray/tests/... core + --only-tags=tmpfs --tmp-filesystem=tmpfs + + - label: ":ray: core: out of disk redis tests" + tags: + - python + - oss + instance_type: small + commands: + - bazel run //ci/ray_ci:test_in_docker -- //python/ray/tests/... core + --test-env=TEST_EXTERNAL_REDIS=1 + --only-tags=tmpfs --tmp-filesystem=tmpfs - label: ":ray: core: workflow tests" tags: diff --git a/ci/ray_ci/container.py b/ci/ray_ci/container.py index f607cd877c02..21535c854a42 100644 --- a/ci/ray_ci/container.py +++ b/ci/ray_ci/container.py @@ -38,12 +38,18 @@ def __init__( docker_tag: str, volumes: Optional[List[str]] = None, envs: Optional[List[str]] = None, + tmp_filesystem: Optional[str] = None, ) -> None: self.docker_tag = docker_tag self.volumes = volumes or [] self.envs = envs or [] self.envs += _DOCKER_ENV + if tmp_filesystem is not None: + if tmp_filesystem != "tmpfs": + raise ValueError("Only tmpfs is supported for tmp filesystem") + self.tmp_filesystem = tmp_filesystem + def run_script_with_output(self, script: List[str]) -> bytes: """ Run a script in container and returns output @@ -108,6 +114,11 @@ def _get_run_command( command += ["--cap-add", cap] if gpu_ids: command += ["--gpus", f'"device={",".join(map(str, gpu_ids))}"'] + if self.tmp_filesystem: + command += [ + "--mount", + f"type={self.tmp_filesystem},destination=/tmp", + ] command += [ "--workdir", "/rayci", diff --git a/ci/ray_ci/test_container.py b/ci/ray_ci/test_container.py index 0e2980aca53a..93c020d07cfa 100644 --- a/ci/ray_ci/test_container.py +++ b/ci/ray_ci/test_container.py @@ -18,5 +18,11 @@ def test_get_run_command() -> None: assert "/bin/bash -iecuo pipefail -- hi\nhello" in command +def test_get_run_command_tmpfs() -> None: + container = Container("test", tmp_filesystem="tmpfs") + command = " ".join(container._get_run_command(["hi", "hello"])) + assert "--mount type=tmpfs,destination=/tmp" in command + + if __name__ == "__main__": sys.exit(pytest.main(["-v", __file__])) diff --git a/ci/ray_ci/tester.py b/ci/ray_ci/tester.py index 6b1ac56f5943..3831d17feeea 100644 --- a/ci/ray_ci/tester.py +++ b/ci/ray_ci/tester.py @@ -134,6 +134,11 @@ ), default="optimized", ) +@click.option( + "--tmp-filesystem", + type=str, + help=("Filesystem to use for /tmp"), +) def main( targets: List[str], team: str, @@ -150,6 +155,7 @@ def main( test_arg: Optional[str], build_name: Optional[str], build_type: Optional[str], + tmp_filesystem: Optional[str], ) -> None: if not bazel_workspace_dir: raise Exception("Please use `bazelisk run //ci/ray_ci`") @@ -166,6 +172,7 @@ def main( worker_id, parallelism_per_worker, gpus, + tmp_filesystem=tmp_filesystem, test_env=list(test_env), build_name=build_name, build_type=build_type, @@ -198,6 +205,7 @@ def _get_container( worker_id: int, parallelism_per_worker: int, gpus: int, + tmp_filesystem: Optional[str] = None, test_env: Optional[List[str]] = None, build_name: Optional[str] = None, build_type: Optional[str] = None, @@ -215,6 +223,7 @@ def _get_container( gpus=gpus, skip_ray_installation=skip_ray_installation, build_type=build_type, + tmp_filesystem=tmp_filesystem, ) diff --git a/ci/ray_ci/tester_container.py b/ci/ray_ci/tester_container.py index 93adb0848066..49e0ec6feffe 100644 --- a/ci/ray_ci/tester_container.py +++ b/ci/ray_ci/tester_container.py @@ -21,6 +21,7 @@ def __init__( shard_ids: Optional[List[int]] = None, skip_ray_installation: bool = False, build_type: Optional[str] = None, + tmp_filesystem: Optional[str] = None, ) -> None: """ :param docker_tag: Name of the wanda build to be used as test container. @@ -36,6 +37,7 @@ def __init__( f"{os.environ.get('RAYCI_CHECKOUT_DIR')}:/ray-mount", "/var/run/docker.sock:/var/run/docker.sock", ], + tmp_filesystem=tmp_filesystem, ) self.shard_count = shard_count self.shard_ids = shard_ids or [] diff --git a/python/ray/tests/BUILD b/python/ray/tests/BUILD index d410c0e204c4..6ad0f8e6e50a 100644 --- a/python/ray/tests/BUILD +++ b/python/ray/tests/BUILD @@ -402,8 +402,16 @@ py_test( py_test_module_list( files = [ - "test_tqdm.py", "test_out_of_disk_space.py", + ], + size = "large", + tags = ["exclusive", "tmpfs", "team:core"], + deps = ["//:ray_lib", ":conftest"], +) + +py_test_module_list( + files = [ + "test_tqdm.py", "test_failure_4.py", "test_iter.py", "test_object_spilling.py", diff --git a/python/ray/tests/test_out_of_disk_space.py b/python/ray/tests/test_out_of_disk_space.py index b92d4c80fceb..cadfde4ebd79 100644 --- a/python/ray/tests/test_out_of_disk_space.py +++ b/python/ray/tests/test_out_of_disk_space.py @@ -246,6 +246,10 @@ def foo(): except ray.exceptions.RayTaskError as e: assert isinstance(e.cause, ray.exceptions.OutOfDiskError) + # Give it some time for events to appear. + # TODO(core-team): provide some way to wait for events to be flushed. + time.sleep(2) + events = list_cluster_events() print(events) # There could be more than 1 event depending on the test timing.