[Train] Make prepare_model always use the correct device (#29104)
Signed-off-by: Amog Kamsetty [email protected]

Previously, prepare_model used the local rank as the device index even though the local rank may not match the actual device index. This mismatch can happen, for example, when CUDA_VISIBLE_DEVICES is set, which Ray Train does by default.

We should always use train.torch.get_device() as the device when wrapping the model in DDP.

Closes #28996
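
For illustration only (not part of this commit), here is a minimal hypothetical sketch of how the local rank can diverge from the visible device index once CUDA_VISIBLE_DEVICES is set; the GPU ID and rank below are made up:

    # Hypothetical: this worker has local rank 1, and the launcher has pinned it
    # to physical GPU 3 by setting CUDA_VISIBLE_DEVICES before the process starts.
    import os
    import torch

    os.environ["CUDA_VISIBLE_DEVICES"] = "3"  # must be set before CUDA is initialized

    # Only one GPU is visible to this process, so the only valid device index is 0.
    # Indexing by the local rank (1) would point at a device that does not exist
    # from this process's point of view.
    assert torch.cuda.device_count() == 1
    correct_device = torch.device("cuda:0")  # what train.torch.get_device() resolves to here
    wrong_device = torch.device("cuda:1")    # what indexing by the local rank would produce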
amogkam authored Oct 6, 2022
1 parent 1e616ef commit 2217f0c
Showing 2 changed files with 28 additions and 2 deletions.
26 changes: 26 additions & 0 deletions python/ray/train/tests/test_gpu.py
@@ -177,6 +177,32 @@ def train_fn():
trainer.shutdown()


def test_torch_prepare_model_uses_device(ray_start_4_cpus_2_gpus):
    """Tests that `prepare_model` uses the device from `train.torch.get_device()`
    even if it does not match the local rank."""
    # The test below should pass without errors.

    @patch.object(
        ray.train.torch.train_loop_utils._TorchAccelerator,
        "get_device",
        lambda self: torch.device(f"cuda:{1 - train.local_rank()}"),
    )
    def train_func():
        # These assert statements must hold for prepare_model to wrap with DDP.
        assert torch.cuda.is_available()
        assert train.world_size() > 1
        model = torch.nn.Linear(1, 1)
        data = torch.ones(1)
        data = data.to(train.torch.get_device())
        model = train.torch.prepare_model(model)
        model(data)

    trainer = TorchTrainer(
        train_func, scaling_config=ScalingConfig(num_workers=2, use_gpu=True)
    )
    trainer.fit()


# TODO: Refactor as a backend test.
@pytest.mark.parametrize(
    "dataset", (LinearDataset, LinearDatasetDict, NonTensorDataset)
4 changes: 2 additions & 2 deletions python/ray/train/torch/train_loop_utils.py
@@ -401,8 +401,8 @@ def model_get_state(self):
        DataParallel = DistributedDataParallel
        if torch.cuda.is_available():
            parallel_strategy_kwargs = {
-               "device_ids": [rank],
-               "output_device": rank,
+               "device_ids": [device],
+               "output_device": device,
                **parallel_strategy_kwargs,
            }
        else:
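
To make the effect of the change concrete, the following is a minimal hypothetical sketch (not the actual prepare_model implementation) of wrapping a model in DDP keyed on the resolved device rather than the rank; DistributedDataParallel accepts torch.device objects for device_ids and output_device:

    import torch
    from torch.nn.parallel import DistributedDataParallel

    def wrap_model(model: torch.nn.Module, device: torch.device) -> torch.nn.Module:
        # Move the model to the worker's actual device (e.g. cuda:0 when
        # CUDA_VISIBLE_DEVICES="3"), then pass that same device to DDP rather
        # than indexing by the worker's local rank.
        model = model.to(device)
        if torch.cuda.is_available():
            return DistributedDataParallel(
                model, device_ids=[device], output_device=device
            )
        return DistributedDataParallel(model)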
