diff --git a/release/nightly_tests/dask_on_ray/dask_on_ray_app_config.yaml b/release/nightly_tests/dask_on_ray/dask_on_ray_app_config.yaml index f4098b019262..5f0871723e3a 100644 --- a/release/nightly_tests/dask_on_ray/dask_on_ray_app_config.yaml +++ b/release/nightly_tests/dask_on_ray/dask_on_ray_app_config.yaml @@ -1,4 +1,9 @@ base_image: {{ env["RAY_IMAGE_NIGHTLY_CPU"] | default("anyscale/ray:nightly-py37") }} +# We use retriable_lifo because the workload can crash when multiple tasks from +# different callers run on the same node. We also observed a raylet memory leak +# that would trigger the group-by-policy to fail the workload. +# https://github.com/ray-project/ray/issues/32195 +env_vars: {"RAY_worker_killing_policy": "retriable_lifo"} debian_packages: [] python: