From 91bb17aedbce50c8149de491fcf1ef5a67a30585 Mon Sep 17 00:00:00 2001 From: Clarence Ng Date: Thu, 9 Feb 2023 04:58:45 -0800 Subject: [PATCH 1/2] dask Signed-off-by: Clarence Ng --- release/nightly_tests/dask_on_ray/dask_on_ray_app_config.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/release/nightly_tests/dask_on_ray/dask_on_ray_app_config.yaml b/release/nightly_tests/dask_on_ray/dask_on_ray_app_config.yaml index f4098b019262..92f1dbe7e6a5 100644 --- a/release/nightly_tests/dask_on_ray/dask_on_ray_app_config.yaml +++ b/release/nightly_tests/dask_on_ray/dask_on_ray_app_config.yaml @@ -1,4 +1,7 @@ base_image: {{ env["RAY_IMAGE_NIGHTLY_CPU"] | default("anyscale/ray:nightly-py37") }} +# We use retriable_lifo as the workload can crash due to +# +env_vars: {"RAY_worker_killing_policy": "retriable_lifo"} debian_packages: [] python: From cbc9f7c679f23f64465ad73ca98b5c381ac41ee1 Mon Sep 17 00:00:00 2001 From: Clarence Ng Date: Thu, 9 Feb 2023 09:07:53 -0800 Subject: [PATCH 2/2] [core][oom] use retriable lifo policy for dask x3 nightly test Signed-off-by: Clarence Ng --- .../nightly_tests/dask_on_ray/dask_on_ray_app_config.yaml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/release/nightly_tests/dask_on_ray/dask_on_ray_app_config.yaml b/release/nightly_tests/dask_on_ray/dask_on_ray_app_config.yaml index 92f1dbe7e6a5..5f0871723e3a 100644 --- a/release/nightly_tests/dask_on_ray/dask_on_ray_app_config.yaml +++ b/release/nightly_tests/dask_on_ray/dask_on_ray_app_config.yaml @@ -1,6 +1,8 @@ base_image: {{ env["RAY_IMAGE_NIGHTLY_CPU"] | default("anyscale/ray:nightly-py37") }} -# We use retriable_lifo as the workload can crash due to -# +# We use retriable_lifo as the workload can crash due to multiple tasks from different +# callers running on the same node. We also observed a raylet memory leak that would +# trigger the group-by-policy to fail the workload.
+# https://github.com/ray-project/ray/issues/32195 env_vars: {"RAY_worker_killing_policy": "retriable_lifo"} debian_packages: []