From 563e20655dc05ec6f98ca8274585f8647ebe2f65 Mon Sep 17 00:00:00 2001 From: Lonnie Liu <95255098+aslonnie@users.noreply.github.com> Date: Wed, 21 Aug 2024 10:19:46 -0700 Subject: [PATCH] [Data] [Release Test] Add AWS ACCESS_DENIED as retryable exception for multi-node Data+Train benchmarks cherrypick #47232 Signed-off-by: Lonnie Liu --- .../nightly_tests/dataset/multi_node_train_benchmark.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/release/nightly_tests/dataset/multi_node_train_benchmark.py b/release/nightly_tests/dataset/multi_node_train_benchmark.py index 679261931ec6..69828258fdea 100644 --- a/release/nightly_tests/dataset/multi_node_train_benchmark.py +++ b/release/nightly_tests/dataset/multi_node_train_benchmark.py @@ -571,8 +571,13 @@ def __iter__(self): def benchmark_code( args, ): + ctx = ray.data.DataContext.get_current() + # This release test runs into ACCESS_DENIED errors fairly often. + # We add ACCESS_DENIED as a retryable exception type to avoid flakiness. + # See for more details: https://github.com/ray-project/ray/issues/47230 + ctx.retried_io_errors.append("AWS Error ACCESS_DENIED") + if args.target_max_block_size_mb is not None: - ctx = ray.data.DataContext.get_current() ctx.target_max_block_size = args.target_max_block_size_mb * 1024 * 1024 cache_input_ds = args.cache_input_ds