Skip to content

Commit

Permalink
[core] fix out of disk test on release branch
Browse files Browse the repository at this point in the history
Signed-off-by: Lonnie Liu <[email protected]>
  • Loading branch information
aslonnie committed Jan 26, 2024
1 parent cc2646f commit e20dee1
Show file tree
Hide file tree
Showing 7 changed files with 62 additions and 3 deletions.
23 changes: 21 additions & 2 deletions .buildkite/core.rayci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ steps:
commands:
- bazel run //ci/ray_ci:test_in_docker -- //python/ray/tests/... //python/ray/dag/... python/ray/autoscaler/v2/... core
--workers "$${BUILDKITE_PARALLEL_JOB_COUNT}" --worker-id "$${BUILDKITE_PARALLEL_JOB}" --parallelism-per-worker 3
--except-tags debug_tests,asan_tests,post_wheel_build,xcommit,container,manual
--except-tags debug_tests,asan_tests,post_wheel_build,xcommit,tmpfs,container,manual

- label: ":ray: core: redis tests"
tags: python
Expand All @@ -39,7 +39,26 @@ steps:
- bazel run //ci/ray_ci:test_in_docker -- //python/ray/tests/... //python/ray/dag/... python/ray/autoscaler/v2/... core
--workers "$${BUILDKITE_PARALLEL_JOB_COUNT}" --worker-id "$${BUILDKITE_PARALLEL_JOB}" --parallelism-per-worker 3
--test-env=TEST_EXTERNAL_REDIS=1
--except-tags debug_tests,asan_tests,post_wheel_build,xcommit,container,manual
--except-tags debug_tests,asan_tests,post_wheel_build,xcommit,tmpfs,container,manual

# Dedicated step for the out-of-disk tests: selects only the core tests tagged
# `tmpfs` (--only-tags) and asks the test container to mount /tmp as tmpfs
# (--tmp-filesystem), instead of running them in the general core test steps.
- label: ":ray: core: out of disk tests"
tags:
- python
- oss
instance_type: small
commands:
- bazel run //ci/ray_ci:test_in_docker -- //python/ray/tests/... core
--only-tags=tmpfs --tmp-filesystem=tmpfs

# Out-of-disk tests with TEST_EXTERNAL_REDIS=1 passed into the test
# environment: selects only the core tests tagged `tmpfs` (--only-tags) and
# asks the test container to mount /tmp as tmpfs (--tmp-filesystem).
- label: ":ray: core: out of disk redis tests"
tags:
- python
- oss
instance_type: small
commands:
- bazel run //ci/ray_ci:test_in_docker -- //python/ray/tests/... core
--test-env=TEST_EXTERNAL_REDIS=1
--only-tags=tmpfs --tmp-filesystem=tmpfs

- label: ":ray: core: workflow tests"
tags:
Expand Down
11 changes: 11 additions & 0 deletions ci/ray_ci/container.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,18 @@ def __init__(
docker_tag: str,
volumes: Optional[List[str]] = None,
envs: Optional[List[str]] = None,
tmp_filesystem: Optional[str] = None,
) -> None:
self.docker_tag = docker_tag
self.volumes = volumes or []
self.envs = envs or []
self.envs += _DOCKER_ENV

if tmp_filesystem is not None:
if tmp_filesystem != "tmpfs":
raise ValueError("Only tmpfs is supported for tmp filesystem")
self.tmp_filesystem = tmp_filesystem

def run_script_with_output(self, script: List[str]) -> bytes:
"""
Run a script in the container and return its output
Expand Down Expand Up @@ -108,6 +114,11 @@ def _get_run_command(
command += ["--cap-add", cap]
if gpu_ids:
command += ["--gpus", f'"device={",".join(map(str, gpu_ids))}"']
if self.tmp_filesystem:
command += [
"--mount",
f"type={self.tmp_filesystem},destination=/tmp",
]
command += [
"--workdir",
"/rayci",
Expand Down
6 changes: 6 additions & 0 deletions ci/ray_ci/test_container.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,11 @@ def test_get_run_command() -> None:
assert "/bin/bash -iecuo pipefail -- hi\nhello" in command


def test_get_run_command_tmpfs() -> None:
    """A container created with a tmpfs tmp filesystem mounts tmpfs at /tmp."""
    tmpfs_container = Container("test", tmp_filesystem="tmpfs")
    run_args = tmpfs_container._get_run_command(["hi", "hello"])
    # Join the argv so the flag and its value can be matched as one substring.
    joined_args = " ".join(run_args)
    assert "--mount type=tmpfs,destination=/tmp" in joined_args


if __name__ == "__main__":
sys.exit(pytest.main(["-v", __file__]))
9 changes: 9 additions & 0 deletions ci/ray_ci/tester.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,11 @@
),
default="optimized",
)
@click.option(
"--tmp-filesystem",
type=str,
help=("Filesystem to use for /tmp"),
)
def main(
targets: List[str],
team: str,
Expand All @@ -150,6 +155,7 @@ def main(
test_arg: Optional[str],
build_name: Optional[str],
build_type: Optional[str],
tmp_filesystem: Optional[str],
) -> None:
if not bazel_workspace_dir:
raise Exception("Please use `bazelisk run //ci/ray_ci`")
Expand All @@ -166,6 +172,7 @@ def main(
worker_id,
parallelism_per_worker,
gpus,
tmp_filesystem=tmp_filesystem,
test_env=list(test_env),
build_name=build_name,
build_type=build_type,
Expand Down Expand Up @@ -198,6 +205,7 @@ def _get_container(
worker_id: int,
parallelism_per_worker: int,
gpus: int,
tmp_filesystem: Optional[str] = None,
test_env: Optional[List[str]] = None,
build_name: Optional[str] = None,
build_type: Optional[str] = None,
Expand All @@ -215,6 +223,7 @@ def _get_container(
gpus=gpus,
skip_ray_installation=skip_ray_installation,
build_type=build_type,
tmp_filesystem=tmp_filesystem,
)


Expand Down
2 changes: 2 additions & 0 deletions ci/ray_ci/tester_container.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ def __init__(
shard_ids: Optional[List[int]] = None,
skip_ray_installation: bool = False,
build_type: Optional[str] = None,
tmp_filesystem: Optional[str] = None,
) -> None:
"""
:param docker_tag: Name of the wanda build to be used as test container.
Expand All @@ -36,6 +37,7 @@ def __init__(
f"{os.environ.get('RAYCI_CHECKOUT_DIR')}:/ray-mount",
"/var/run/docker.sock:/var/run/docker.sock",
],
tmp_filesystem=tmp_filesystem,
)
self.shard_count = shard_count
self.shard_ids = shard_ids or []
Expand Down
10 changes: 9 additions & 1 deletion python/ray/tests/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -402,8 +402,16 @@ py_test(

py_test_module_list(
files = [
"test_tqdm.py",
"test_out_of_disk_space.py",
],
size = "large",
tags = ["exclusive", "tmpfs", "team:core"],
deps = ["//:ray_lib", ":conftest"],
)

py_test_module_list(
files = [
"test_tqdm.py",
"test_failure_4.py",
"test_iter.py",
"test_object_spilling.py",
Expand Down
4 changes: 4 additions & 0 deletions python/ray/tests/test_out_of_disk_space.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,10 @@ def foo():
except ray.exceptions.RayTaskError as e:
assert isinstance(e.cause, ray.exceptions.OutOfDiskError)

# Give it some time for events to appear.
# TODO(core-team): provide some way to wait for events to be flushed.
time.sleep(2)

events = list_cluster_events()
print(events)
# There could be more than 1 event depending on the test timing.
Expand Down

0 comments on commit e20dee1

Please sign in to comment.