[train] Fix ScalingConfig(accelerator_type) to request a small fraction of the accelerator label (ray-project#44225)

Make Ray Train's accelerator type resource request match Ray Core by setting it to a fractional value (0.001). This is needed to fix autoscaling behavior so that it requests the correct number of GPUs.

Signed-off-by: Justin Yu <[email protected]>
stephanie-wang · Mar 27, 2024 · 5ada927 · 5ada927
1 parent f84a7fa
commit 5ada927
Show file tree

Hide file tree

Showing 3 changed files with 18 additions and 16 deletions.
diff --git a/python/ray/air/config.py b/python/ray/air/config.py
@@ -206,7 +206,7 @@ def _resources_per_worker_not_none(self):
 
         if self.accelerator_type:
             accelerator = f"{RESOURCE_CONSTRAINT_PREFIX}{self.accelerator_type}"
-            resources_per_worker.setdefault(accelerator, 1)
+            resources_per_worker.setdefault(accelerator, 0.001)
         return resources_per_worker
 
     @property

diff --git a/python/ray/air/tests/test_api.py b/python/ray/air/tests/test_api.py
@@ -149,14 +149,14 @@ def test_scaling_config_accelerator_type():
     }
     assert scaling_config._resources_per_worker_not_none == {
         "GPU": 1,
-        "accelerator_type:A100": 1,
+        "accelerator_type:A100": 0.001,
     }
     assert scaling_config.additional_resources_per_worker == {
-        "accelerator_type:A100": 1
+        "accelerator_type:A100": 0.001
     }
     assert scaling_config.as_placement_group_factory().bundles == [
-        {"GPU": 1, "accelerator_type:A100": 1, "CPU": 1},
-        {"GPU": 1, "accelerator_type:A100": 1},
+        {"GPU": 1, "accelerator_type:A100": 0.001, "CPU": 1},
+        {"GPU": 1, "accelerator_type:A100": 0.001},
     ]
 
     # With resources_per_worker
@@ -172,15 +172,15 @@ def test_scaling_config_accelerator_type():
     assert scaling_config._resources_per_worker_not_none == {
         "GPU": 1,
         "custom_resource": 1,
-        "accelerator_type:A100": 1,
+        "accelerator_type:A100": 0.001,
     }
     assert scaling_config.additional_resources_per_worker == {
         "custom_resource": 1,
-        "accelerator_type:A100": 1,
+        "accelerator_type:A100": 0.001,
     }
     assert scaling_config.as_placement_group_factory().bundles == [
-        {"GPU": 1, "custom_resource": 1, "accelerator_type:A100": 1, "CPU": 1},
-        {"GPU": 1, "custom_resource": 1, "accelerator_type:A100": 1},
+        {"GPU": 1, "custom_resource": 1, "accelerator_type:A100": 0.001, "CPU": 1},
+        {"GPU": 1, "custom_resource": 1, "accelerator_type:A100": 0.001},
     ]
 
     # With trainer_resources
@@ -195,14 +195,14 @@ def test_scaling_config_accelerator_type():
     }
     assert scaling_config._resources_per_worker_not_none == {
         "GPU": 1,
-        "accelerator_type:A100": 1,
+        "accelerator_type:A100": 0.001,
     }
     assert scaling_config.additional_resources_per_worker == {
-        "accelerator_type:A100": 1
+        "accelerator_type:A100": 0.001
     }
     assert scaling_config.as_placement_group_factory().bundles == [
-        {"GPU": 1, "accelerator_type:A100": 1, "memory": 10 * 1024**3},
-        {"GPU": 1, "accelerator_type:A100": 1},
+        {"GPU": 1, "accelerator_type:A100": 0.001, "memory": 10 * 1024**3},
+        {"GPU": 1, "accelerator_type:A100": 0.001},
     ]
 
 

diff --git a/python/ray/train/tests/test_data_parallel_trainer.py b/python/ray/train/tests/test_data_parallel_trainer.py
@@ -51,9 +51,11 @@ def ray_start_heterogenous_cluster():
             cluster.add_node(
                 num_cpus=4,
                 num_gpus=4,
-                resources={f"{RESOURCE_CONSTRAINT_PREFIX}{accelerator_type}": 4}
-                if accelerator_type
-                else {},
+                resources=(
+                    {f"{RESOURCE_CONSTRAINT_PREFIX}{accelerator_type}": 1.0}
+                    if accelerator_type
+                    else {}
+                ),
             )
 
     ray.init(address=cluster.address)