simplify function calls and add option for custom resources

Signed-off-by: Kevin <[email protected]>
project-codeflare · Jun 21, 2024 · 0ced913 · 0ced913
1 parent a03c72f
commit 0ced913
Show file tree

Hide file tree

Showing 11 changed files with 235 additions and 177 deletions.
diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py
@@ -131,48 +131,7 @@ def create_app_wrapper(self):
         # Validate image configuration
         self.validate_image_config()
 
-        # Before attempting to create the cluster AW, let's evaluate the ClusterConfig
-
-        name = self.config.name
-        namespace = self.config.namespace
-        head_cpus = self.config.head_cpus
-        head_memory = self.config.head_memory
-        head_gpus = self.config.head_gpus
-        min_cpu = self.config.min_cpus
-        max_cpu = self.config.max_cpus
-        min_memory = self.config.min_memory
-        max_memory = self.config.max_memory
-        gpu = self.config.num_gpus
-        workers = self.config.num_workers
-        template = self.config.template
-        image = self.config.image
-        appwrapper = self.config.appwrapper
-        env = self.config.envs
-        image_pull_secrets = self.config.image_pull_secrets
-        write_to_file = self.config.write_to_file
-        local_queue = self.config.local_queue
-        labels = self.config.labels
-        return generate_appwrapper(
-            name=name,
-            namespace=namespace,
-            head_cpus=head_cpus,
-            head_memory=head_memory,
-            head_gpus=head_gpus,
-            min_cpu=min_cpu,
-            max_cpu=max_cpu,
-            min_memory=min_memory,
-            max_memory=max_memory,
-            gpu=gpu,
-            workers=workers,
-            template=template,
-            image=image,
-            appwrapper=appwrapper,
-            env=env,
-            image_pull_secrets=image_pull_secrets,
-            write_to_file=write_to_file,
-            local_queue=local_queue,
-            labels=labels,
-        )
+        return generate_appwrapper(self)
 
     # creates a new cluster with the provided or default spec
     def up(self):
@@ -456,6 +415,29 @@ def job_logs(self, job_id: str) -> str:
         """
         return self.job_client.get_job_logs(job_id)
 
+    @staticmethod
+    def _head_worker_extended_resources_from_rc_dict(rc: Dict) -> Tuple[dict, dict]:
+        head_extended_resources, worker_extended_resources = {}, {}
+        for resource in rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
+            "containers"
+        ][0]["resources"]["limits"].keys():
+            if resource in ["memory", "cpu"]:
+                continue
+            worker_extended_resources[resource] = rc["spec"]["workerGroupSpecs"][0][
+                "template"
+            ]["spec"]["containers"][0]["resources"]["limits"][resource]
+
+        for resource in rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][
+            0
+        ]["resources"]["limits"].keys():
+            if resource in ["memory", "cpu"]:
+                continue
+            head_extended_resources[resource] = rc["spec"]["headGroupSpec"]["template"][
+                "spec"
+            ]["containers"][0]["resources"]["limits"][resource]
+
+        return head_extended_resources, worker_extended_resources
+
     def from_k8_cluster_object(
         rc,
         appwrapper=True,
@@ -469,6 +451,11 @@ def from_k8_cluster_object(
             else []
         )
 
+        (
+            head_extended_resources,
+            worker_extended_resources,
+        ) = Cluster._head_worker_extended_resources_from_rc_dict(rc)
+
         cluster_config = ClusterConfiguration(
             name=rc["metadata"]["name"],
             namespace=rc["metadata"]["namespace"],
@@ -486,11 +473,8 @@ def from_k8_cluster_object(
             max_memory=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
                 "containers"
             ][0]["resources"]["limits"]["memory"],
-            num_gpus=int(
-                rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][
-                    "resources"
-                ]["limits"]["nvidia.com/gpu"]
-            ),
+            worker_extended_resource_requests=worker_extended_resources,
+            head_extended_resource_requests=head_extended_resources,
             image=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][
                 0
             ]["image"],
@@ -871,6 +855,11 @@ def _map_to_ray_cluster(rc) -> Optional[RayCluster]:
                     protocol = "https"
             dashboard_url = f"{protocol}://{ingress.spec.rules[0].host}"
 
+    (
+        head_extended_resources,
+        worker_extended_resources,
+    ) = Cluster._head_worker_extended_resources_from_rc_dict(rc)
+
     return RayCluster(
         name=rc["metadata"]["name"],
         status=status,
@@ -885,17 +874,15 @@ def _map_to_ray_cluster(rc) -> Optional[RayCluster]:
         worker_cpu=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][
             0
         ]["resources"]["limits"]["cpu"],
-        worker_gpu=0,  # hard to detect currently how many gpus, can override it with what the user asked for
+        worker_extended_resources=worker_extended_resources,
         namespace=rc["metadata"]["namespace"],
         head_cpus=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
             "resources"
         ]["limits"]["cpu"],
         head_mem=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
             "resources"
         ]["limits"]["memory"],
-        head_gpu=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
-            "resources"
-        ]["limits"]["nvidia.com/gpu"],
+        head_extended_resources=head_extended_resources,
         dashboard=dashboard_url,
     )
 
@@ -920,12 +907,12 @@ def _copy_to_ray(cluster: Cluster) -> RayCluster:
         worker_mem_min=cluster.config.min_memory,
         worker_mem_max=cluster.config.max_memory,
         worker_cpu=cluster.config.min_cpus,
-        worker_gpu=cluster.config.num_gpus,
+        worker_extended_resources=cluster.config.worker_extended_resource_requests,
         namespace=cluster.config.namespace,
         dashboard=cluster.cluster_dashboard_uri(),
         head_cpus=cluster.config.head_cpus,
         head_mem=cluster.config.head_memory,
-        head_gpu=cluster.config.head_gpus,
+        head_extended_resources=cluster.config.head_extended_resource_requests,
     )
     if ray.status == CodeFlareClusterStatus.READY:
         ray.status = RayClusterStatus.READY

diff --git a/src/codeflare_sdk/cluster/config.py b/src/codeflare_sdk/cluster/config.py
@@ -21,9 +21,22 @@
 from dataclasses import dataclass, field
 import pathlib
 import typing
+import warnings
 
 dir = pathlib.Path(__file__).parent.parent.resolve()
 
+# https://docs.ray.io/en/latest/ray-core/scheduling/accelerators.html
+DEFAULT_RESOURCE_MAPPING = {
+    "nvidia.com/gpu": "GPU",
+    "intel.com/gpu": "GPU",
+    "amd.com/gpu": "GPU",
+    "aws.amazon.com/neuroncore": "neuron_cores",
+    "google.com/tpu": "TPU",
+    "habana.ai/gaudi": "HPU",
+    "huawei.com/Ascend910": "NPU",
+    "huawei.com/Ascend310": "NPU",
+}
+
 
 @dataclass
 class ClusterConfiguration:
@@ -38,6 +51,7 @@ class ClusterConfiguration:
     head_cpus: typing.Union[int, str] = 2
     head_memory: typing.Union[int, str] = 8
     head_gpus: int = 0
+    head_extended_resource_requests: typing.Dict[str, int] = field(default_factory=dict)
     machine_types: list = field(default_factory=list)  # ["m4.xlarge", "g4dn.xlarge"]
     min_cpus: typing.Union[int, str] = 1
     max_cpus: typing.Union[int, str] = 1
@@ -53,6 +67,11 @@ class ClusterConfiguration:
     write_to_file: bool = False
     verify_tls: bool = True
     labels: dict = field(default_factory=dict)
+    worker_extended_resource_requests: typing.Dict[str, int] = field(
+        default_factory=dict
+    )
+    custom_resource_mapping: typing.Dict[str, str] = field(default_factory=dict)
+    overwrite_default_resource_mapping: bool = False
 
     def __post_init__(self):
         if not self.verify_tls:
@@ -61,6 +80,46 @@ def __post_init__(self):
             )
         self._memory_to_string()
         self._str_mem_no_unit_add_GB()
+        self._gpu_to_resource()
+        self._combine_custom_resource_mapping()
+
+    def _combine_custom_resource_mapping(self):
+        if overwritten := set(self.custom_resource_mapping.keys()).intersection(
+            DEFAULT_RESOURCE_MAPPING.keys()
+        ):
+            if self.overwrite_default_resource_mapping:
+                warnings.warn(
+                    f"Overwriting default resource mapping for {overwritten}",
+                    UserWarning,
+                )
+            else:
+                raise ValueError(
+                    f"Resource mapping already exists for {overwritten}, set overwrite_default_resource_mapping to True to overwrite"
+                )
+        self.custom_resource_mapping = {
+            **DEFAULT_RESOURCE_MAPPING,
+            **self.custom_resource_mapping,
+        }
+
+    def _gpu_to_resource(self):
+        if self.head_gpus:
+            warnings.warn(
+                'head_gpus is being deprecated, use head_custom_resource_requests={"nvidia.com/gpu": <num_gpus>}'
+            )
+            if "nvidia.com/gpu" in self.head_extended_resource_requests:
+                raise ValueError(
+                    "nvidia.com/gpu already exists in head_custom_resource_requests"
+                )
+            self.head_extended_resource_requests["nvidia.com/gpu"] = self.head_gpus
+        if self.num_gpus:
+            warnings.warn(
+                'num_gpus is being deprecated, use worker_custom_resource_requests={"nvidia.com/gpu": <num_gpus>}'
+            )
+            if "nvidia.com/gpu" in self.worker_extended_resource_requests:
+                raise ValueError(
+                    "nvidia.com/gpu already exists in worker_custom_resource_requests"
+                )
+            self.worker_extended_resource_requests["nvidia.com/gpu"] = self.num_gpus
 
     def _str_mem_no_unit_add_GB(self):
         if isinstance(self.head_memory, str) and self.head_memory.isdecimal():

diff --git a/src/codeflare_sdk/cluster/model.py b/src/codeflare_sdk/cluster/model.py
@@ -18,8 +18,9 @@
 dataclasses to store information for Ray clusters and AppWrappers.
 """
 
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from enum import Enum
+import typing
 
 
 class RayClusterStatus(Enum):
@@ -74,14 +75,14 @@ class RayCluster:
     status: RayClusterStatus
     head_cpus: int
     head_mem: str
-    head_gpu: int
     workers: int
     worker_mem_min: str
     worker_mem_max: str
     worker_cpu: int
-    worker_gpu: int
     namespace: str
     dashboard: str
+    worker_extended_resources: typing.Dict[str, int] = field(default_factory=dict)
+    head_extended_resources: typing.Dict[str, int] = field(default_factory=dict)
 
 
 @dataclass

diff --git a/src/codeflare_sdk/templates/base-template.yaml b/src/codeflare_sdk/templates/base-template.yaml
@@ -86,11 +86,9 @@ spec:
             limits:
               cpu: 2
               memory: "8G"
-              nvidia.com/gpu: 0
             requests:
               cpu: 2
               memory: "8G"
-              nvidia.com/gpu: 0
           volumeMounts:
           - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt
             name: odh-trusted-ca-cert
@@ -163,11 +161,9 @@ spec:
             limits:
               cpu: "2"
               memory: "12G"
-              nvidia.com/gpu: "1"
             requests:
               cpu: "2"
               memory: "12G"
-              nvidia.com/gpu: "1"
           volumeMounts:
           - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt
             name: odh-trusted-ca-cert