[Train] Decouple device-related modules and add Huawei NPU support to Ray Train #44086

Merged · 38 commits · Sep 3, 2024
The diff below shows changes from 13 of the 38 commits.

Commits
- ebcccca: Introduce TorchDeviceManager to ray TrainSession and support NPU in R… (liuxsh9, Jun 7, 2024)
- e0b8117: Add higher abstract class to decouple device manager with torch. (liuxsh9, Jun 7, 2024)
- de34e81: fix (liuxsh9, Jun 7, 2024)
- e3ebd13: Merge branch 'master' into train-support-npu (liuxsh9, Jun 29, 2024)
- c80ec32: fix get_current_stream() (liuxsh9, Jun 29, 2024)
- ddee918: refine code (liuxsh9, Jul 3, 2024)
- d0a4e73: fix lint (liuxsh9, Jul 4, 2024)
- 6583bd4: Merge branch 'master' into train-support-npu (liuxsh9, Jul 4, 2024)
- a4839d2: Enable share npu visible devices for local process in ddp. (liuxsh9, Jul 17, 2024)
- 798304a: Merge branch 'master' into train-support-npu (liuxsh9, Jul 17, 2024)
- 82eecc7: fix lint (liuxsh9, Jul 17, 2024)
- c88e29e: change the order of init device mananger and set env to enable huggin… (liuxsh9, Jul 17, 2024)
- 9b1ada5: Merge branch 'master' into train-support-npu (liuxsh9, Jul 18, 2024)
- 13e4914: Refactor code based on the comment and feedback. (liuxsh9, Jul 27, 2024)
- 4665b22: Add unit tests for torch device mananger and npu accelerator ids sharing (liuxsh9, Jul 27, 2024)
- d51d5d0: Merge branch 'master' into train-support-npu (liuxsh9, Jul 27, 2024)
- 2f7b9c6: add gpu-only tags for unit tests (liuxsh9, Jul 27, 2024)
- 81ba9a4: remove resources_per_worker field in backend (liuxsh9, Jul 27, 2024)
- 79f7d43: Edit error message (liuxsh9, Jul 27, 2024)
- 15e99c6: refine code (liuxsh9, Jul 29, 2024)
- 16927f2: Trigger a runtime error when npu is allocated but torch npu is not av… (liuxsh9, Jul 29, 2024)
- 0e81c8e: revert hpu get device logic. (liuxsh9, Jul 29, 2024)
- dee4745: Refine the code based on the feedback from review. (liuxsh9, Aug 7, 2024)
- e6a2f4a: Introduce `CPUTorchDeviceManager` and change the value of DEFAULT_TO… (liuxsh9, Aug 7, 2024)
- 9c5a296: delete fall back logic in CUDATorchDevicaManager (liuxsh9, Aug 7, 2024)
- 68bc4e1: Merge branch 'master' into train-support-npu (liuxsh9, Aug 7, 2024)
- 16094f1: fix (liuxsh9, Aug 7, 2024)
- 82f27b1: refine code (liuxsh9, Aug 8, 2024)
- 7973ddf: Refine code based on the review feedback. (liuxsh9, Aug 15, 2024)
- b83d1ff: fix (liuxsh9, Aug 15, 2024)
- 199e572: Merge branch 'master' into train-support-npu (liuxsh9, Aug 15, 2024)
- 1c0d115: Fix device manager logic in local environment. (liuxsh9, Aug 16, 2024)
- 2c4c5cc: fix (liuxsh9, Aug 16, 2024)
- d8a0900: fix typo and remove unnecessary operation for npu. (liuxsh9, Aug 16, 2024)
- 93a9bcf: move implementation into subclass (liuxsh9, Aug 19, 2024)
- 761ac5c: Update code based on the review feedback. (liuxsh9, Aug 23, 2024)
- 657b1f9: Merge branch 'master' into train-support-npu (matthewdeng, Aug 27, 2024)
- a5549cc: fix lint (liuxsh9, Aug 28, 2024)
2 changes: 2 additions & 0 deletions python/ray/_private/ray_constants.py
@@ -408,10 +408,12 @@ def env_set_by_user(key):
CUDA_VISIBLE_DEVICES_ENV_VAR = "CUDA_VISIBLE_DEVICES"
NEURON_RT_VISIBLE_CORES_ENV_VAR = "NEURON_RT_VISIBLE_CORES"
TPU_VISIBLE_CHIPS_ENV_VAR = "TPU_VISIBLE_CHIPS"
NPU_RT_VISIBLE_DEVICES_ENV_VAR = "ASCEND_RT_VISIBLE_DEVICES"

NEURON_CORES = "neuron_cores"
GPU = "GPU"
TPU = "TPU"
NPU = "NPU"


RAY_WORKER_NICENESS = "RAY_worker_niceness"
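
For orientation, the new NPU entries mirror the existing per-accelerator pairs: a resource name plus the environment variable Ray uses to scope visible devices. A small illustrative lookup, assuming only that the constants above exist (the mapping dict itself is hypothetical, not part of ray_constants):

```python
from ray._private import ray_constants

# Hypothetical helper mapping for illustration; not defined in ray_constants itself.
VISIBLE_DEVICES_ENV_VAR_BY_RESOURCE = {
    ray_constants.GPU: ray_constants.CUDA_VISIBLE_DEVICES_ENV_VAR,
    ray_constants.NPU: ray_constants.NPU_RT_VISIBLE_DEVICES_ENV_VAR,
}

print(VISIBLE_DEVICES_ENV_VAR_BY_RESOURCE[ray_constants.NPU])  # ASCEND_RT_VISIBLE_DEVICES
```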
5 changes: 5 additions & 0 deletions python/ray/air/_internal/device_manager/__init__.py
Contributor: Can we remove this high-level DeviceManager abstraction and just keep the TorchDeviceManager? No need for the extra abstraction for now.

Contributor: Also, move this entire folder to ray/train/torch/_internal instead.

Contributor Author (replying to the first comment): OK, just introduce the TorchDeviceManager for now.

Contributor Author (replying to the second comment): We are trying to extend the DeviceManager in ray.air to support more third-party devices, and the plan is to use it not only for Ray Train but also for RLlib and others. So it seems more reasonable to maintain it within air. What do you think?

Contributor: Let's just keep it in Train for now.

Contributor Author (liuxsh9, Aug 8, 2024): Currently, Train relies on get_devices in AIR, so it's natural for us to implement the DeviceManager in AIR to return the correct device to Train. If we move the DeviceManager to Train, it would create a weird dependency where Train calls AIR's get_devices, which in turn calls back into Train's DeviceManager. Would you mind elaborating on your thoughts about this part?

Contributor: @matthewdeng WDYT?

Contributor: Let's just put it in ray.air for now. We can restructure the package in the future if needed.
@@ -0,0 +1,5 @@
from ray.air._internal.device_manager.device_manager import DeviceManager

__all__ = [
"DeviceManager",
]
21 changes: 21 additions & 0 deletions python/ray/air/_internal/device_manager/device_manager.py
@@ -0,0 +1,21 @@
from abc import ABC, abstractmethod


class DeviceManager(ABC):
"""This class contains the function needed for supporting
an acclerator family in Ray AI Library.
"""

@staticmethod
@abstractmethod
def get_accelerator_name() -> str:
"""Gets the corresponding accelerator type, e.g. GPU, NPU."""
...

@staticmethod
@abstractmethod
def get_device_type() -> str:
"""Gets the device type in deeplearning framwork,
e.g. cuda, hpu, npu in torch.
"""
...
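
To make the split between the two names concrete, here is a minimal, hypothetical sketch (not part of the diff) of how a caller might use them: `get_accelerator_name()` returns the Ray resource name used for scheduling and accelerator-ID lookups, while `get_device_type()` returns the framework-level device string.

```python
import torch

from ray.air._internal.device_manager import DeviceManager


def build_torch_device(manager: DeviceManager, index: int = 0) -> torch.device:
    # e.g. get_device_type() == "npu" and index == 1 -> torch.device("npu:1"),
    # while get_accelerator_name() == "NPU" would be used to query Ray for assigned IDs.
    return torch.device(f"{manager.get_device_type()}:{index}")
```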
42 changes: 42 additions & 0 deletions python/ray/air/_internal/device_manager/hpu.py
@@ -0,0 +1,42 @@
from typing import List, Union

import torch

from ray._private.accelerators.hpu import HPU_PACKAGE_AVAILABLE
from ray.air._internal.device_manager.torch_device_manager import TorchDeviceManager

if HPU_PACKAGE_AVAILABLE:
import habana_frameworks.torch.hpu as torch_hpu


class HPUTorchDeviceManager(TorchDeviceManager):
Member: Also invite @harborn @kira-lin to review the HPU device manager.

"""HPU device manager"""

@staticmethod
def get_accelerator_name() -> str:
return "HPU"

@staticmethod
def get_device_type() -> str:
return "hpu"

def is_device_available(self) -> bool:
if not HPU_PACKAGE_AVAILABLE:
return False

return torch_hpu.is_available()

def get_devices(self) -> List[torch.device]:
if HPU_PACKAGE_AVAILABLE and torch_hpu.is_available():
devices = [torch.device("hpu")]
else:
devices = [torch.device("cpu")]

return devices

def set_device(self, device: Union[torch.device, int, str, None]):
torch_hpu.set_device(device)

def is_support_stream(self) -> bool:
"""Validate if the device type support create a stream"""
return False
103 changes: 103 additions & 0 deletions python/ray/air/_internal/device_manager/npu.py
@@ -0,0 +1,103 @@
import os
from functools import lru_cache
from importlib.util import find_spec
from typing import List, Union

import torch

import ray
from ray._private.accelerators.npu import ASCEND_RT_VISIBLE_DEVICES_ENV_VAR
from ray.air._internal.device_manager.torch_device_manager import TorchDeviceManager


@lru_cache()
Contributor: How does this play with multi-node settings? Does this lru_cache var get shipped over to other nodes? For example, a CPU head node would possibly have this as false, but we don't want to keep that around on the worker nodes.

Contributor Author: Thank you for the reminder; the cached var won't be shipped over to other nodes. But even so, it has been removed to avoid unnecessary code complexity.

def is_package_present(package_name: str) -> bool:
try:
return find_spec(package_name) is not None
except ModuleNotFoundError:
return False


NPU_TORCH_PACKAGE_AVAILABLE = is_package_present("torch_npu")


if NPU_TORCH_PACKAGE_AVAILABLE:
import torch_npu # noqa: F401


class NPUTorchDeviceManager(TorchDeviceManager):
"""Ascend NPU device manager"""

@staticmethod
def get_accelerator_name() -> str:
return "NPU"

@staticmethod
def get_device_type() -> str:
return "npu"

def is_device_available(self) -> bool:
if not NPU_TORCH_PACKAGE_AVAILABLE:
return False

return torch.npu.is_available()

def get_devices(self) -> List[torch.device]:
"""Gets the correct torch device list configured for this process.

Returns a list of torch NPU devices allocated for the current worker.
If no NPUs are assigned, then it returns a list with a single CPU device.
"""
if NPU_TORCH_PACKAGE_AVAILABLE and torch.npu.is_available():
npu_ids = [
str(id) for id in ray.get_runtime_context().get_accelerator_ids()["NPU"]
Contributor: Replace all occurrences of this with the NPU constant from ray_constants.

Contributor Author: Done.

]

device_ids = []

if len(npu_ids) > 0:
npu_visible_str = os.environ.get(ASCEND_RT_VISIBLE_DEVICES_ENV_VAR, "")
if npu_visible_str and npu_visible_str != "NoDevFiles":
npu_visible_list = npu_visible_str.split(",")
else:
npu_visible_list = []

for npu_id in npu_ids:
try:
device_ids.append(npu_visible_list.index(npu_id))
except ValueError:
raise RuntimeError(
"ASCEND_RT_VISIBLE_DEVICES set incorrectly. "
f"Got {npu_visible_str}, expected to include {npu_id}. "
"Did you override the `ASCEND_RT_VISIBLE_DEVICES` "
"environment variable?"
)
else:
# If called on the driver or outside of Ray Train, return the
# 0th device.
device_ids.append(0)

devices = [torch.device(f"npu:{device_id}") for device_id in device_ids]
else:
devices = [torch.device("cpu")]

return devices

def set_device(self, device: Union[torch.device, int]):
torch.npu.set_device(device)

def is_support_stream(self) -> bool:
"""Validate if the device type support create a stream"""
return True

def create_stream(self, device):
"""Create an NPU Stream."""
return torch.npu.Stream(device)

def get_stream_context(self, stream):
"""Get a torch.npu.stream context"""
return torch.npu.stream(stream)

def get_current_stream(self):
"""Get current stream for npu"""
return torch.npu.current_stream()
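
The index-mapping logic in `get_devices()` above can be illustrated in isolation: Ray reports the physical NPU IDs assigned to the worker, `ASCEND_RT_VISIBLE_DEVICES` lists the devices visible to the process, and the torch device index is the position of each assigned ID within that visible list. A standalone sketch with assumed values (no `torch_npu` required):

```python
import os

# Assumed values for illustration: Ray exposed physical NPUs 4-7 to this process
# and assigned NPUs 5 and 6 to the current worker.
os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "4,5,6,7"
assigned_npu_ids = ["5", "6"]

visible = os.environ["ASCEND_RT_VISIBLE_DEVICES"].split(",")
device_indices = [visible.index(npu_id) for npu_id in assigned_npu_ids]
print(device_indices)  # [1, 2] -> torch.device("npu:1"), torch.device("npu:2")
```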
91 changes: 91 additions & 0 deletions python/ray/air/_internal/device_manager/nvidia_gpu.py
@@ -0,0 +1,91 @@
import os
from typing import List, Union

import torch

import ray
from ray.air._internal.device_manager.torch_device_manager import TorchDeviceManager


class CUDATorchDeviceManager(TorchDeviceManager):
"""CUDA device manager"""

@staticmethod
def get_accelerator_name() -> str:
return "GPU"

@staticmethod
def get_device_type() -> str:
return "cuda"

def is_device_available(self) -> bool:
return torch.cuda.is_available()

def get_devices(self) -> List[torch.device]:
"""Gets the correct torch device list configured for this process.

Returns a list of torch CUDA devices allocated for the current worker.
If no GPUs are assigned, then it returns a list with a single CPU device.

Assumes that `CUDA_VISIBLE_DEVICES` is set and is a
superset of the `ray.get_gpu_ids()`.
"""
if torch.cuda.is_available():
# GPU IDs are assigned by Ray after you specify "use_gpu"
# GPU `ray.get_gpu_ids()` may return ints or may return strings.
# We should always convert to strings.
gpu_ids = [str(id) for id in ray.get_gpu_ids()]

device_ids = []

if len(gpu_ids) > 0:
cuda_visible_str = os.environ.get("CUDA_VISIBLE_DEVICES", "")
if cuda_visible_str and cuda_visible_str != "NoDevFiles":
cuda_visible_list = cuda_visible_str.split(",")
else:
cuda_visible_list = []

# By default, there should only be one GPU ID if `use_gpu=True`.
# If there are multiple GPUs, return a list of devices.
# If using fractional GPUs, these IDs are not guaranteed
# to be unique across different processes.
for gpu_id in gpu_ids:
try:
device_ids.append(cuda_visible_list.index(gpu_id))
except ValueError:
raise RuntimeError(
"CUDA_VISIBLE_DEVICES set incorrectly. "
f"Got {cuda_visible_str}, expected to include {gpu_id}. "
"Did you override the `CUDA_VISIBLE_DEVICES` environment"
" variable? If not, please help file an issue on Github."
)

else:
# If called on the driver or outside of Ray Train, return the
# 0th device.
device_ids.append(0)

devices = [torch.device(f"cuda:{device_id}") for device_id in device_ids]
else:
devices = [torch.device("cpu")]
Contributor: This may have been discussed already, but why don't we just have a CPU device manager instead of falling back to CPU in every class?

Contributor Author: Currently, the CPU scheduling and GPU scheduling logic are coupled, and we hope to start a separate PR later to decouple them. For now, we just decouple NPU/HPU from CPU, which means they will not fall back to CPU.

Contributor: Can you elaborate on this? Can we just fall back to a CPU device manager if no GPUs are assigned to the workers?

Contributor Author: Have implemented a CPU device manager, serving as an alternative when no accelerator is available.


return devices

def set_device(self, device: Union[torch.device, int, str, None]):
torch.cuda.set_device(device)

def is_support_stream(self) -> bool:
"""Validate if the device type support create a stream"""
return True

def create_stream(self, device: torch.device) -> torch.cuda.Stream:
"""Create a CUDA Stream."""
return torch.cuda.Stream(device)

def get_stream_context(self, stream):
"""Get a torch.cuda.stream context"""
return torch.cuda.stream(stream)

def get_current_stream(self) -> torch.cuda.Stream:
"""Get a current stream for cuda"""
return torch.cuda.current_stream()
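
As the thread above notes, a later revision of this PR introduces a dedicated CPU device manager instead of having every accelerator manager fall back to CPU; that class is not part of the 13-commit diff shown here. A rough sketch of what such a manager could look like against the `TorchDeviceManager` interface in the next file (the merged implementation may differ):

```python
from typing import List, Union

import torch

from ray.air._internal.device_manager.torch_device_manager import TorchDeviceManager


class CPUTorchDeviceManager(TorchDeviceManager):
    """CPU device manager (sketch only)."""

    @staticmethod
    def get_accelerator_name() -> str:
        return "CPU"

    @staticmethod
    def get_device_type() -> str:
        return "cpu"

    def is_device_available(self) -> bool:
        # The CPU is always available.
        return True

    def get_devices(self) -> List[torch.device]:
        return [torch.device("cpu")]

    def set_device(self, device: Union[torch.device, int, str, None]):
        # Nothing to select for CPU.
        pass

    def is_support_stream(self) -> bool:
        # No CUDA/NPU-style stream concept for CPU.
        return False
```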
39 changes: 39 additions & 0 deletions python/ray/air/_internal/device_manager/torch_device_manager.py
@@ -0,0 +1,39 @@
from typing import List, Union

import torch

from ray.air._internal.device_manager.device_manager import DeviceManager


class TorchDeviceManager(DeviceManager):
"""This class contains the function needed for supporting
an acclerator family in Ray AI Library.
"""

def is_device_available(self) -> bool:
"""Validate if device is available."""
...

def get_devices(self) -> List[torch.device]:
"""Gets the correct torch device configured for this process"""
...

def set_device(self, device: Union[torch.device, int, str, None]):
"""Set the correct device for this process"""
...

def is_support_stream(self) -> bool:
Contributor (suggested change): rename `def is_support_stream(self) -> bool:` to `def supports_stream(self) -> bool:`.

Contributor Author (quoting a related comment: "can we do this on demand rather than at the file level in torch/config.py? The auto global import seems a bit weird to me."): Sure!

"""Validate if the device type support create a stream"""
...

def create_stream(self, device: torch.device):
"""Create a device stream"""
...

def get_stream_context(self, stream):
"""Get a stream context like torch.cuda.stream"""
...

def get_current_stream(self):
"""Get a torch stream like torch.cuda.current_stream"""
...
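
For context, a simplified sketch of how a training worker could drive this interface end to end (the function below is illustrative, not Ray Train's actual session code):

```python
import torch

from ray.air._internal.device_manager.torch_device_manager import TorchDeviceManager


def forward_on_assigned_device(
    manager: TorchDeviceManager, model: torch.nn.Module, batch: torch.Tensor
) -> torch.Tensor:
    # Pin this process to the first device Ray assigned to the worker.
    device = manager.get_devices()[0]
    manager.set_device(device)

    model = model.to(device)
    batch = batch.to(device)

    # Only enter a stream context on backends that support streams (CUDA, NPU).
    if manager.is_support_stream():
        stream = manager.create_stream(device)
        with manager.get_stream_context(stream):
            return model(batch)
    return model(batch)
```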
51 changes: 51 additions & 0 deletions python/ray/air/_internal/device_manager/utils.py
@@ -0,0 +1,51 @@
import logging
from typing import Optional, Type

from ray._private.accelerators.hpu import HPU_PACKAGE_AVAILABLE
from ray.air._internal.device_manager.hpu import HPUTorchDeviceManager
from ray.air._internal.device_manager.npu import (
NPU_TORCH_PACKAGE_AVAILABLE,
NPUTorchDeviceManager,
)
from ray.air._internal.device_manager.nvidia_gpu import CUDATorchDeviceManager
from ray.air._internal.device_manager.torch_device_manager import TorchDeviceManager

logger = logging.getLogger(__name__)


SUPPORTED_ACCELERATOR_TORCH_DEVICE_MANAGER = {
"GPU": CUDATorchDeviceManager,
"HPU": HPUTorchDeviceManager,
"NPU": NPUTorchDeviceManager,
}


def try_register_torch_accelerator_module() -> None:
Contributor: What's the purpose of this? I do not want to raise if someone doesn't have torch_npu installed.

Contributor Author: It is used to import/register the relevant PyTorch extensions required by the accelerators. Currently, for users who do not use NPU or HPU, no errors will occur. The error message has been optimized to avoid misunderstandings.

Contributor: Can we do this on demand rather than at the file level in torch/config.py? The auto global import seems a bit weird to me.

try:
if NPU_TORCH_PACKAGE_AVAILABLE:
import torch_npu # noqa: F401

if HPU_PACKAGE_AVAILABLE:
import habana_frameworks.torch.hpu as torch_hpu # noqa: F401

except ImportError:
raise ImportError("Could not import PyTorch")


def get_torch_device_manager_cls_by_resources(
resources: Optional[dict],
) -> Type[TorchDeviceManager]:
device_manager = None

# input resources may be None
if not resources:
return CUDATorchDeviceManager

# select correct accelerator type from resources
for resource_type, resource_value in resources.items():
if resource_value and resource_type != "CPU":
device_manager = SUPPORTED_ACCELERATOR_TORCH_DEVICE_MANAGER.get(
resource_type, None
)

return device_manager or CUDATorchDeviceManager
Contributor: Should we raise if attempting to use multiple accelerators instead of just taking the last seen accelerator type?

Contributor Author: Sure, it now raises a clear RuntimeError when encountering multiple types of accelerators.
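
To tie the selection helper and the agreed-upon stricter behavior together, here is a hedged sketch; the exact wording and location of the merged check are not shown in this 13-commit diff.

```python
from typing import Optional, Type

from ray.air._internal.device_manager.torch_device_manager import TorchDeviceManager
from ray.air._internal.device_manager.utils import (
    SUPPORTED_ACCELERATOR_TORCH_DEVICE_MANAGER,
    get_torch_device_manager_cls_by_resources,
)

# Per the code above: {"CPU": 1, "NPU": 1} resolves to NPUTorchDeviceManager,
# and empty/None resources fall back to CUDATorchDeviceManager.
manager_cls = get_torch_device_manager_cls_by_resources({"CPU": 1, "NPU": 1})


def strict_device_manager_cls(
    resources: Optional[dict],
) -> Optional[Type[TorchDeviceManager]]:
    """Sketch of raising on mixed accelerator types, as discussed above."""
    found = {
        name
        for name, amount in (resources or {}).items()
        if amount and name in SUPPORTED_ACCELERATOR_TORCH_DEVICE_MANAGER
    }
    if len(found) > 1:
        raise RuntimeError(f"Expected a single accelerator type, got: {sorted(found)}")
    return SUPPORTED_ACCELERATOR_TORCH_DEVICE_MANAGER[found.pop()] if found else None
```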
