[tune] Increase volume size for long running pbt failure (#27163)

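The long-running PBT failure release test currently fails at cluster startup with the following error: RuntimeError: botocore.exceptions.ClientError: An error occurred (InvalidBlockDeviceMapping) when calling the RunInstances operation: Volume of size 202GB is smaller than snapshot 'snap-02c4e6a0ad06cf3d6', expect size >= 400GB. This change raises `VolumeSize` in `compute_tpl.yaml` from 202 to 400 so the volume is at least as large as the snapshot, and also sets `FailureConfig(max_failures=-1)` in the workload's `RunConfig`.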
ray-project · Jul 29, 2022 · ee05fc9 · ee05fc9
1 parent c1ac2bb
commit ee05fc9
Show file tree

Hide file tree

Showing 2 changed files with 3 additions and 2 deletions.
diff --git a/release/long_running_distributed_tests/compute_tpl.yaml b/release/long_running_distributed_tests/compute_tpl.yaml
@@ -26,4 +26,4 @@ aws:
   BlockDeviceMappings:
     - DeviceName: /dev/sda1
       Ebs:
-        VolumeSize: 202
+        VolumeSize: 400
diff --git a/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py b/release/long_running_distributed_tests/workloads/pytorch_pbt_failure.py
@@ -5,7 +5,7 @@
 
 import ray
 from ray import tune
-from ray.air.config import RunConfig, ScalingConfig
+from ray.air.config import RunConfig, ScalingConfig, FailureConfig
 from ray.train.examples.tune_cifar_torch_pbt_example import train_func
 from ray.train.torch import TorchConfig, TorchTrainer
 from ray.tune.schedulers import PopulationBasedTraining
@@ -69,6 +69,7 @@
     ),
     run_config=RunConfig(
         stop={"training_iteration": 1} if args.smoke_test else None,
+        failure_config=FailureConfig(max_failures=-1),
         callbacks=[FailureInjectorCallback(time_between_checks=90), ProgressCallback()],
     ),
 )