diff --git a/release/long_running_distributed_tests/compute_tpl.yaml b/release/long_running_distributed_tests/compute_tpl.yaml
index 4ba669e4763a..359e0350a3e7 100644
--- a/release/long_running_distributed_tests/compute_tpl.yaml
+++ b/release/long_running_distributed_tests/compute_tpl.yaml
@@ -26,4 +26,4 @@ aws:
     BlockDeviceMappings:
         - DeviceName: /dev/sda1
           Ebs:
-            VolumeSize: 202
\ No newline at end of file
+            VolumeSize: 400
\ No newline at end of file
diff --git a/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py b/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py
index e06d7ff5142f..1df7054c6fa0 100644
--- a/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py
+++ b/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py
@@ -5,7 +5,7 @@
 import ray
 from ray import tune
-from ray.air.config import RunConfig, ScalingConfig
+from ray.air.config import RunConfig, ScalingConfig, FailureConfig
 from ray.train.examples.tune_cifar_torch_pbt_example import train_func
 from ray.train.torch import TorchConfig, TorchTrainer
 from ray.tune.schedulers import PopulationBasedTraining
@@ -69,6 +69,7 @@
         ),
         run_config=RunConfig(
             stop={"training_iteration": 1} if args.smoke_test else None,
+            failure_config=FailureConfig(max_failures=-1),
             callbacks=[FailureInjectorCallback(time_between_checks=90), ProgressCallback()],
         ),
     )
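
Note: for readers unfamiliar with FailureConfig, below is a minimal, self-contained
sketch of what the second hunk enables. It reuses the AIR-era import path shown in
the diff; train_loop is a hypothetical stand-in for the release test's train_func,
and the scaling values are illustrative, not taken from this PR.

import ray
from ray.air.config import FailureConfig, RunConfig, ScalingConfig
from ray.train.torch import TorchTrainer


def train_loop():
    # Placeholder per-worker training loop; the release test uses
    # ray.train.examples.tune_cifar_torch_pbt_example.train_func here.
    pass


ray.init()
trainer = TorchTrainer(
    train_loop_per_worker=train_loop,
    scaling_config=ScalingConfig(num_workers=2, use_gpu=False),
    run_config=RunConfig(
        # max_failures=-1 is Ray's "retry indefinitely" sentinel (the default
        # is 0, i.e. no retries). With FailureInjectorCallback deliberately
        # killing nodes every 90 seconds, unbounded retries keep the
        # long-running test alive across injected failures.
        failure_config=FailureConfig(max_failures=-1),
    ),
)
trainer.fit()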