From 0ced913e27a5f0c9aab1134537a8ac679129479b Mon Sep 17 00:00:00 2001 From: Kevin Date: Thu, 9 May 2024 16:38:25 -0400 Subject: [PATCH] simplify function calls and add option for custom resources Signed-off-by: Kevin --- src/codeflare_sdk/cluster/cluster.py | 93 ++++----- src/codeflare_sdk/cluster/config.py | 59 ++++++ src/codeflare_sdk/cluster/model.py | 7 +- .../templates/base-template.yaml | 4 - src/codeflare_sdk/utils/generate_yaml.py | 193 ++++++++++-------- src/codeflare_sdk/utils/pretty_print.py | 2 +- tests/test-case-bad.yaml | 4 +- tests/test-case-no-mcad.yamls | 4 +- tests/test-case.yaml | 4 +- tests/test-default-appwrapper.yaml | 10 +- tests/unit_test.py | 32 +-- 11 files changed, 235 insertions(+), 177 deletions(-) diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index e5bbcd86a..187193444 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -131,48 +131,7 @@ def create_app_wrapper(self): # Validate image configuration self.validate_image_config() - # Before attempting to create the cluster AW, let's evaluate the ClusterConfig - - name = self.config.name - namespace = self.config.namespace - head_cpus = self.config.head_cpus - head_memory = self.config.head_memory - head_gpus = self.config.head_gpus - min_cpu = self.config.min_cpus - max_cpu = self.config.max_cpus - min_memory = self.config.min_memory - max_memory = self.config.max_memory - gpu = self.config.num_gpus - workers = self.config.num_workers - template = self.config.template - image = self.config.image - appwrapper = self.config.appwrapper - env = self.config.envs - image_pull_secrets = self.config.image_pull_secrets - write_to_file = self.config.write_to_file - local_queue = self.config.local_queue - labels = self.config.labels - return generate_appwrapper( - name=name, - namespace=namespace, - head_cpus=head_cpus, - head_memory=head_memory, - head_gpus=head_gpus, - min_cpu=min_cpu, - max_cpu=max_cpu, - min_memory=min_memory, - max_memory=max_memory, - gpu=gpu, - workers=workers, - template=template, - image=image, - appwrapper=appwrapper, - env=env, - image_pull_secrets=image_pull_secrets, - write_to_file=write_to_file, - local_queue=local_queue, - labels=labels, - ) + return generate_appwrapper(self) # creates a new cluster with the provided or default spec def up(self): @@ -456,6 +415,29 @@ def job_logs(self, job_id: str) -> str: """ return self.job_client.get_job_logs(job_id) + @staticmethod + def _head_worker_extended_resources_from_rc_dict(rc: Dict) -> Tuple[dict, dict]: + head_extended_resources, worker_extended_resources = {}, {} + for resource in rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ + "containers" + ][0]["resources"]["limits"].keys(): + if resource in ["memory", "cpu"]: + continue + worker_extended_resources[resource] = rc["spec"]["workerGroupSpecs"][0][ + "template" + ]["spec"]["containers"][0]["resources"]["limits"][resource] + + for resource in rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][ + 0 + ]["resources"]["limits"].keys(): + if resource in ["memory", "cpu"]: + continue + head_extended_resources[resource] = rc["spec"]["headGroupSpec"]["template"][ + "spec" + ]["containers"][0]["resources"]["limits"][resource] + + return head_extended_resources, worker_extended_resources + def from_k8_cluster_object( rc, appwrapper=True, @@ -469,6 +451,11 @@ def from_k8_cluster_object( else [] ) + ( + head_extended_resources, + worker_extended_resources, + ) = 
Cluster._head_worker_extended_resources_from_rc_dict(rc) + cluster_config = ClusterConfiguration( name=rc["metadata"]["name"], namespace=rc["metadata"]["namespace"], @@ -486,11 +473,8 @@ def from_k8_cluster_object( max_memory=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ "containers" ][0]["resources"]["limits"]["memory"], - num_gpus=int( - rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][ - "resources" - ]["limits"]["nvidia.com/gpu"] - ), + worker_extended_resource_requests=worker_extended_resources, + head_extended_resource_requests=head_extended_resources, image=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ 0 ]["image"], @@ -871,6 +855,11 @@ def _map_to_ray_cluster(rc) -> Optional[RayCluster]: protocol = "https" dashboard_url = f"{protocol}://{ingress.spec.rules[0].host}" + ( + head_extended_resources, + worker_extended_resources, + ) = Cluster._head_worker_extended_resources_from_rc_dict(rc) + return RayCluster( name=rc["metadata"]["name"], status=status, @@ -885,7 +874,7 @@ def _map_to_ray_cluster(rc) -> Optional[RayCluster]: worker_cpu=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ 0 ]["resources"]["limits"]["cpu"], - worker_gpu=0, # hard to detect currently how many gpus, can override it with what the user asked for + worker_extended_resources=worker_extended_resources, namespace=rc["metadata"]["namespace"], head_cpus=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][ "resources" @@ -893,9 +882,7 @@ def _map_to_ray_cluster(rc) -> Optional[RayCluster]: head_mem=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][ "resources" ]["limits"]["memory"], - head_gpu=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][ - "resources" - ]["limits"]["nvidia.com/gpu"], + head_extended_resources=head_extended_resources, dashboard=dashboard_url, ) @@ -920,12 +907,12 @@ def _copy_to_ray(cluster: Cluster) -> RayCluster: worker_mem_min=cluster.config.min_memory, worker_mem_max=cluster.config.max_memory, worker_cpu=cluster.config.min_cpus, - worker_gpu=cluster.config.num_gpus, + worker_extended_resources=cluster.config.worker_extended_resource_requests, namespace=cluster.config.namespace, dashboard=cluster.cluster_dashboard_uri(), head_cpus=cluster.config.head_cpus, head_mem=cluster.config.head_memory, - head_gpu=cluster.config.head_gpus, + head_extended_resources=cluster.config.head_extended_resource_requests, ) if ray.status == CodeFlareClusterStatus.READY: ray.status = RayClusterStatus.READY diff --git a/src/codeflare_sdk/cluster/config.py b/src/codeflare_sdk/cluster/config.py index 9e069c376..ace03f452 100644 --- a/src/codeflare_sdk/cluster/config.py +++ b/src/codeflare_sdk/cluster/config.py @@ -21,9 +21,22 @@ from dataclasses import dataclass, field import pathlib import typing +import warnings dir = pathlib.Path(__file__).parent.parent.resolve() +# https://docs.ray.io/en/latest/ray-core/scheduling/accelerators.html +DEFAULT_RESOURCE_MAPPING = { + "nvidia.com/gpu": "GPU", + "intel.com/gpu": "GPU", + "amd.com/gpu": "GPU", + "aws.amazon.com/neuroncore": "neuron_cores", + "google.com/tpu": "TPU", + "habana.ai/gaudi": "HPU", + "huawei.com/Ascend910": "NPU", + "huawei.com/Ascend310": "NPU", +} + @dataclass class ClusterConfiguration: @@ -38,6 +51,7 @@ class ClusterConfiguration: head_cpus: typing.Union[int, str] = 2 head_memory: typing.Union[int, str] = 8 head_gpus: int = 0 + head_extended_resource_requests: typing.Dict[str, int] = field(default_factory=dict) machine_types: list = 
field(default_factory=list)  # ["m4.xlarge", "g4dn.xlarge"]
     min_cpus: typing.Union[int, str] = 1
     max_cpus: typing.Union[int, str] = 1
@@ -53,6 +67,11 @@ class ClusterConfiguration:
     write_to_file: bool = False
     verify_tls: bool = True
     labels: dict = field(default_factory=dict)
+    worker_extended_resource_requests: typing.Dict[str, int] = field(
+        default_factory=dict
+    )
+    custom_resource_mapping: typing.Dict[str, str] = field(default_factory=dict)
+    overwrite_default_resource_mapping: bool = False
 
     def __post_init__(self):
         if not self.verify_tls:
@@ -61,6 +80,46 @@ def __post_init__(self):
             )
         self._memory_to_string()
         self._str_mem_no_unit_add_GB()
+        self._gpu_to_resource()
+        self._combine_custom_resource_mapping()
+
+    def _combine_custom_resource_mapping(self):
+        if overwritten := set(self.custom_resource_mapping.keys()).intersection(
+            DEFAULT_RESOURCE_MAPPING.keys()
+        ):
+            if self.overwrite_default_resource_mapping:
+                warnings.warn(
+                    f"Overwriting default resource mapping for {overwritten}",
+                    UserWarning,
+                )
+            else:
+                raise ValueError(
+                    f"Resource mapping already exists for {overwritten}, set overwrite_default_resource_mapping to True to overwrite"
+                )
+        self.custom_resource_mapping = {
+            **DEFAULT_RESOURCE_MAPPING,
+            **self.custom_resource_mapping,
+        }
+
+    def _gpu_to_resource(self):
+        if self.head_gpus:
+            warnings.warn(
+                'head_gpus is being deprecated, use head_extended_resource_requests={"nvidia.com/gpu": }'
+            )
+            if "nvidia.com/gpu" in self.head_extended_resource_requests:
+                raise ValueError(
+                    "nvidia.com/gpu already exists in head_extended_resource_requests"
+                )
+            self.head_extended_resource_requests["nvidia.com/gpu"] = self.head_gpus
+        if self.num_gpus:
+            warnings.warn(
+                'num_gpus is being deprecated, use worker_extended_resource_requests={"nvidia.com/gpu": }'
+            )
+            if "nvidia.com/gpu" in self.worker_extended_resource_requests:
+                raise ValueError(
+                    "nvidia.com/gpu already exists in worker_extended_resource_requests"
+                )
+            self.worker_extended_resource_requests["nvidia.com/gpu"] = self.num_gpus
 
     def _str_mem_no_unit_add_GB(self):
         if isinstance(self.head_memory, str) and self.head_memory.isdecimal():
diff --git a/src/codeflare_sdk/cluster/model.py b/src/codeflare_sdk/cluster/model.py
index 2547de254..5d6e2ed2a 100644
--- a/src/codeflare_sdk/cluster/model.py
+++ b/src/codeflare_sdk/cluster/model.py
@@ -18,8 +18,9 @@
 dataclasses to store information for Ray clusters and AppWrappers.
""" -from dataclasses import dataclass +from dataclasses import dataclass, field from enum import Enum +import typing class RayClusterStatus(Enum): @@ -74,14 +75,14 @@ class RayCluster: status: RayClusterStatus head_cpus: int head_mem: str - head_gpu: int workers: int worker_mem_min: str worker_mem_max: str worker_cpu: int - worker_gpu: int namespace: str dashboard: str + worker_extended_resources: typing.Dict[str, int] = field(default_factory=dict) + head_extended_resources: typing.Dict[str, int] = field(default_factory=dict) @dataclass diff --git a/src/codeflare_sdk/templates/base-template.yaml b/src/codeflare_sdk/templates/base-template.yaml index 5c0c919d5..f980dbc56 100644 --- a/src/codeflare_sdk/templates/base-template.yaml +++ b/src/codeflare_sdk/templates/base-template.yaml @@ -86,11 +86,9 @@ spec: limits: cpu: 2 memory: "8G" - nvidia.com/gpu: 0 requests: cpu: 2 memory: "8G" - nvidia.com/gpu: 0 volumeMounts: - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt name: odh-trusted-ca-cert @@ -163,11 +161,9 @@ spec: limits: cpu: "2" memory: "12G" - nvidia.com/gpu: "1" requests: cpu: "2" memory: "12G" - nvidia.com/gpu: "1" volumeMounts: - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt name: odh-trusted-ca-cert diff --git a/src/codeflare_sdk/utils/generate_yaml.py b/src/codeflare_sdk/utils/generate_yaml.py index 30edcd913..5c2d5ed86 100755 --- a/src/codeflare_sdk/utils/generate_yaml.py +++ b/src/codeflare_sdk/utils/generate_yaml.py @@ -17,6 +17,7 @@ (in the cluster sub-module) for AppWrapper generation. """ +import json from typing import Optional import typing import yaml @@ -30,6 +31,7 @@ from os import urandom from base64 import b64encode from urllib3.util import parse_url +import codeflare_sdk def read_template(template): @@ -77,10 +79,13 @@ def is_kind_cluster(): return False -def update_names(cluster_yaml, cluster_name, namespace): - meta = cluster_yaml.get("metadata") - meta["name"] = cluster_name - meta["namespace"] = namespace +def update_names( + cluster_yaml: dict, + cluster: "codeflare_sdk.cluster.Cluster", +): + metadata = cluster_yaml.get("metadata") + metadata["name"] = cluster.config.name + metadata["namespace"] = cluster.config.namespace def update_image(spec, image): @@ -106,60 +111,119 @@ def update_env(spec, env): container["env"] = env -def update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu): +def update_resources( + spec, + min_cpus, + max_cpus, + min_memory, + max_memory, + custom_resources, +): container = spec.get("containers") for resource in container: requests = resource.get("resources").get("requests") if requests is not None: - requests["cpu"] = min_cpu + requests["cpu"] = min_cpus requests["memory"] = min_memory - requests["nvidia.com/gpu"] = gpu + for k in custom_resources.keys(): + requests[k] = custom_resources[k] limits = resource.get("resources").get("limits") if limits is not None: - limits["cpu"] = max_cpu + limits["cpu"] = max_cpus limits["memory"] = max_memory - limits["nvidia.com/gpu"] = gpu + for k in custom_resources.keys(): + limits[k] = custom_resources[k] + + +def head_worker_gpu_count_from_cluster( + cluster: "codeflare_sdk.cluster.Cluster", +) -> typing.Tuple[int, int]: + head_gpus = 0 + worker_gpus = 0 + for k in cluster.config.head_extended_resource_requests.keys(): + resource_type = cluster.config.custom_resource_mapping[k] + if resource_type == "GPU": + head_gpus += int(cluster.config.head_extended_resource_requests[k]) + for k in cluster.config.worker_extended_resource_requests.keys(): + resource_type = 
cluster.config.custom_resource_mapping[k] + if resource_type == "GPU": + worker_gpus += int(cluster.config.worker_extended_resource_requests[k]) + + return head_gpus, worker_gpus + + +FORBIDDEN_CUSTOM_RESOURCE_TYPES = ["GPU", "CPU", "memory"] + + +def head_worker_resources_from_cluster( + cluster: "codeflare_sdk.cluster.Cluster", +) -> typing.Tuple[dict, dict]: + to_return = {}, {} + for k in cluster.config.head_extended_resource_requests.keys(): + resource_type = cluster.config.custom_resource_mapping[k] + if resource_type in FORBIDDEN_CUSTOM_RESOURCE_TYPES: + continue + to_return[0][resource_type] = cluster.config.head_extended_resource_requests[ + k + ] + to_return[0].get(resource_type, 0) + + for k in cluster.config.worker_extended_resource_requests.keys(): + resource_type = cluster.config.custom_resource_mapping[k] + if resource_type in FORBIDDEN_CUSTOM_RESOURCE_TYPES: + continue + to_return[1][resource_type] = cluster.config.worker_extended_resource_requests[ + k + ] + to_return[1].get(resource_type, 0) + return to_return def update_nodes( - cluster_yaml, - appwrapper_name, - min_cpu, - max_cpu, - min_memory, - max_memory, - gpu, - workers, - image, - env, - image_pull_secrets, - head_cpus, - head_memory, - head_gpus, + ray_cluster_dict: dict, + cluster: "codeflare_sdk.cluster.Cluster", ): - head = cluster_yaml.get("spec").get("headGroupSpec") - head["rayStartParams"]["num-gpus"] = str(int(head_gpus)) + head = ray_cluster_dict.get("spec").get("headGroupSpec") + worker = ray_cluster_dict.get("spec").get("workerGroupSpecs")[0] + head_gpus, worker_gpus = head_worker_gpu_count_from_cluster(cluster) + head_resources, worker_resources = head_worker_resources_from_cluster(cluster) + head_resources = json.dumps(head_resources).replace('"', '\\"') + head_resources = f'"{head_resources}"' + worker_resources = json.dumps(worker_resources).replace('"', '\\"') + worker_resources = f'"{worker_resources}"' + head["rayStartParams"]["num-gpus"] = str(head_gpus) + head["rayStartParams"]["resources"] = head_resources - worker = cluster_yaml.get("spec").get("workerGroupSpecs")[0] # Head counts as first worker - worker["replicas"] = workers - worker["minReplicas"] = workers - worker["maxReplicas"] = workers - worker["groupName"] = "small-group-" + appwrapper_name - worker["rayStartParams"]["num-gpus"] = str(int(gpu)) + worker["replicas"] = cluster.config.num_workers + worker["minReplicas"] = cluster.config.num_workers + worker["maxReplicas"] = cluster.config.num_workers + worker["groupName"] = "small-group-" + cluster.config.name + worker["rayStartParams"]["num-gpus"] = str(worker_gpus) + worker["rayStartParams"]["resources"] = worker_resources for comp in [head, worker]: spec = comp.get("template").get("spec") - update_image_pull_secrets(spec, image_pull_secrets) - update_image(spec, image) - update_env(spec, env) + update_image_pull_secrets(spec, cluster.config.image_pull_secrets) + update_image(spec, cluster.config.image) + update_env(spec, cluster.config.envs) if comp == head: # TODO: Eventually add head node configuration outside of template update_resources( - spec, head_cpus, head_cpus, head_memory, head_memory, head_gpus + spec, + cluster.config.head_cpus, + cluster.config.head_cpus, + cluster.config.head_memory, + cluster.config.head_memory, + cluster.config.head_extended_resource_requests, ) else: - update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu) + update_resources( + spec, + cluster.config.min_cpus, + cluster.config.max_cpus, + cluster.config.min_memory, + 
cluster.config.max_memory, + cluster.config.worker_extended_resource_requests, + ) def del_from_list_by_name(l: list, target: typing.List[str]) -> list: @@ -260,63 +324,30 @@ def write_user_yaml(user_yaml, output_file_name): print(f"Written to: {output_file_name}") -def generate_appwrapper( - name: str, - namespace: str, - head_cpus: int, - head_memory: int, - head_gpus: int, - min_cpu: int, - max_cpu: int, - min_memory: int, - max_memory: int, - gpu: int, - workers: int, - template: str, - image: str, - appwrapper: bool, - env, - image_pull_secrets: list, - write_to_file: bool, - local_queue: Optional[str], - labels, -): - cluster_yaml = read_template(template) - appwrapper_name, cluster_name = gen_names(name) - update_names(cluster_yaml, cluster_name, namespace) - update_nodes( +def generate_appwrapper(cluster: "codeflare_sdk.cluster.Cluster"): + cluster_yaml = read_template(cluster.config.template) + appwrapper_name, _ = gen_names(cluster.config.name) + update_names( cluster_yaml, - appwrapper_name, - min_cpu, - max_cpu, - min_memory, - max_memory, - gpu, - workers, - image, - env, - image_pull_secrets, - head_cpus, - head_memory, - head_gpus, + cluster, ) - augment_labels(cluster_yaml, labels) + update_nodes(cluster_yaml, cluster) + augment_labels(cluster_yaml, cluster.config.labels) notebook_annotations(cluster_yaml) - user_yaml = ( - wrap_cluster(cluster_yaml, appwrapper_name, namespace) - if appwrapper + wrap_cluster(cluster_yaml, appwrapper_name, cluster.config.namespace) + if cluster.config.appwrapper else cluster_yaml ) - add_queue_label(user_yaml, namespace, local_queue) + add_queue_label(user_yaml, cluster.config.namespace, cluster.config.local_queue) - if write_to_file: + if cluster.config.write_to_file: directory_path = os.path.expanduser("~/.codeflare/resources/") outfile = os.path.join(directory_path, appwrapper_name + ".yaml") write_user_yaml(user_yaml, outfile) return outfile else: user_yaml = yaml.dump(user_yaml) - print(f"Yaml resources loaded for {name}") + print(f"Yaml resources loaded for {cluster.config.name}") return user_yaml diff --git a/src/codeflare_sdk/utils/pretty_print.py b/src/codeflare_sdk/utils/pretty_print.py index 42ef8398b..9431ffd75 100644 --- a/src/codeflare_sdk/utils/pretty_print.py +++ b/src/codeflare_sdk/utils/pretty_print.py @@ -138,7 +138,7 @@ def print_clusters(clusters: List[RayCluster]): workers = str(cluster.workers) memory = f"{cluster.worker_mem_min}~{cluster.worker_mem_max}" cpu = str(cluster.worker_cpu) - gpu = str(cluster.worker_gpu) + gpu = str(cluster.worker_extended_resources.get("nvidia.com/gpu", 0)) #'table0' to display the cluster name, status, url, and dashboard link table0 = Table(box=None, show_header=False) diff --git a/tests/test-case-bad.yaml b/tests/test-case-bad.yaml index 18dcb7d71..46ebdcceb 100644 --- a/tests/test-case-bad.yaml +++ b/tests/test-case-bad.yaml @@ -33,6 +33,7 @@ spec: block: 'true' dashboard-host: 0.0.0.0 num-gpus: '0' + resources: '"{}"' serviceType: ClusterIP template: spec: @@ -63,11 +64,9 @@ spec: limits: cpu: 2 memory: 8G - nvidia.com/gpu: 0 requests: cpu: 2 memory: 8G - nvidia.com/gpu: 0 rayVersion: 2.20.0 workerGroupSpecs: - groupName: small-group-unit-test-cluster @@ -76,6 +75,7 @@ spec: rayStartParams: block: 'true' num-gpus: '7' + resources: '"{}"' replicas: 2 template: metadata: diff --git a/tests/test-case-no-mcad.yamls b/tests/test-case-no-mcad.yamls index d8d2516c5..0a964533d 100644 --- a/tests/test-case-no-mcad.yamls +++ b/tests/test-case-no-mcad.yamls @@ -29,6 +29,7 @@ spec: block: 'true' 
dashboard-host: 0.0.0.0 num-gpus: '0' + resources: '"{}"' serviceType: ClusterIP template: spec: @@ -54,11 +55,9 @@ spec: limits: cpu: 2 memory: 8G - nvidia.com/gpu: 0 requests: cpu: 2 memory: 8G - nvidia.com/gpu: 0 volumeMounts: - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt name: odh-trusted-ca-cert @@ -97,6 +96,7 @@ spec: rayStartParams: block: 'true' num-gpus: '7' + resources: '"{}"' replicas: 2 template: metadata: diff --git a/tests/test-case.yaml b/tests/test-case.yaml index c5229ce79..fc81471ac 100644 --- a/tests/test-case.yaml +++ b/tests/test-case.yaml @@ -34,6 +34,7 @@ spec: block: 'true' dashboard-host: 0.0.0.0 num-gpus: '0' + resources: '"{}"' serviceType: ClusterIP template: spec: @@ -59,11 +60,9 @@ spec: limits: cpu: 2 memory: 8G - nvidia.com/gpu: 0 requests: cpu: 2 memory: 8G - nvidia.com/gpu: 0 volumeMounts: - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt name: odh-trusted-ca-cert @@ -102,6 +101,7 @@ spec: rayStartParams: block: 'true' num-gpus: '7' + resources: '"{}"' replicas: 2 template: metadata: diff --git a/tests/test-default-appwrapper.yaml b/tests/test-default-appwrapper.yaml index 8fd1873f1..6401b7457 100644 --- a/tests/test-default-appwrapper.yaml +++ b/tests/test-default-appwrapper.yaml @@ -34,9 +34,11 @@ spec: block: 'true' dashboard-host: 0.0.0.0 num-gpus: '0' + resources: '"{}"' serviceType: ClusterIP template: spec: + imagePullSecrets: [] containers: - image: quay.io/project-codeflare/ray:2.20.0-py39-cu118 imagePullPolicy: Always @@ -59,11 +61,9 @@ spec: limits: cpu: 2 memory: 8G - nvidia.com/gpu: 0 requests: cpu: 2 memory: 8G - nvidia.com/gpu: 0 volumeMounts: - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt name: odh-trusted-ca-cert @@ -77,7 +77,6 @@ spec: - mountPath: /etc/ssl/certs/odh-ca-bundle.crt name: odh-ca-cert subPath: odh-ca-bundle.crt - imagePullSecrets: [] volumes: - configMap: items: @@ -101,6 +100,7 @@ spec: rayStartParams: block: 'true' num-gpus: '0' + resources: '"{}"' replicas: 1 template: metadata: @@ -109,6 +109,7 @@ spec: labels: key: value spec: + imagePullSecrets: [] containers: - image: quay.io/project-codeflare/ray:2.20.0-py39-cu118 lifecycle: @@ -123,11 +124,9 @@ spec: limits: cpu: 1 memory: 2G - nvidia.com/gpu: 0 requests: cpu: 1 memory: 2G - nvidia.com/gpu: 0 volumeMounts: - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt name: odh-trusted-ca-cert @@ -141,7 +140,6 @@ spec: - mountPath: /etc/ssl/certs/odh-ca-bundle.crt name: odh-ca-cert subPath: odh-ca-bundle.crt - imagePullSecrets: [] volumes: - configMap: items: diff --git a/tests/unit_test.py b/tests/unit_test.py index 61870b2a5..9e9590d99 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -874,12 +874,10 @@ def test_ray_details(mocker, capsys): worker_mem_min="2G", worker_mem_max="2G", worker_cpu=1, - worker_gpu=0, namespace="ns", dashboard="fake-uri", head_cpus=2, head_mem=8, - head_gpu=0, ) mocker.patch( "codeflare_sdk.cluster.cluster.Cluster.status", @@ -913,7 +911,7 @@ def test_ray_details(mocker, capsys): assert ray1.worker_mem_min == ray2.worker_mem_min assert ray1.worker_mem_max == ray2.worker_mem_max assert ray1.worker_cpu == ray2.worker_cpu - assert ray1.worker_gpu == ray2.worker_gpu + assert ray1.worker_extended_resources == ray2.worker_extended_resources try: print_clusters([ray1, ray2]) print_cluster_status(ray1) @@ -1120,12 +1118,10 @@ def get_ray_obj(group, version, namespace, plural, cls=None): "limits": { "cpu": 2, "memory": "8G", - "nvidia.com/gpu": 0, }, "requests": { "cpu": 2, "memory": "8G", - "nvidia.com/gpu": 
0, }, }, "volumeMounts": [ @@ -1189,7 +1185,10 @@ def get_ray_obj(group, version, namespace, plural, cls=None): "groupName": "small-group-quicktest", "maxReplicas": 1, "minReplicas": 1, - "rayStartParams": {"block": "true", "num-gpus": "0"}, + "rayStartParams": { + "block": "true", + "num-gpus": "0", + }, "replicas": 1, "scaleStrategy": {}, "template": { @@ -1240,12 +1239,10 @@ def get_ray_obj(group, version, namespace, plural, cls=None): "limits": { "cpu": 1, "memory": "2G", - "nvidia.com/gpu": 0, }, "requests": { "cpu": 1, "memory": "2G", - "nvidia.com/gpu": 0, }, }, "volumeMounts": [ @@ -1404,12 +1401,10 @@ def get_ray_obj(group, version, namespace, plural, cls=None): "limits": { "cpu": 2, "memory": "8G", - "nvidia.com/gpu": 0, }, "requests": { "cpu": 2, "memory": "8G", - "nvidia.com/gpu": 0, }, }, } @@ -1423,7 +1418,10 @@ def get_ray_obj(group, version, namespace, plural, cls=None): "groupName": "small-group-quicktest2", "maxReplicas": 1, "minReplicas": 1, - "rayStartParams": {"block": "true", "num-gpus": "0"}, + "rayStartParams": { + "block": "true", + "num-gpus": "0", + }, "replicas": 1, "template": { "metadata": { @@ -1460,12 +1458,10 @@ def get_ray_obj(group, version, namespace, plural, cls=None): "limits": { "cpu": 1, "memory": "2G", - "nvidia.com/gpu": 0, }, "requests": { "cpu": 1, "memory": "2G", - "nvidia.com/gpu": 0, }, }, } @@ -1582,12 +1578,10 @@ def get_aw_obj(group, version, namespace, plural): "limits": { "cpu": 2, "memory": "8G", - "nvidia.com/gpu": 0, }, "requests": { "cpu": 2, "memory": "8G", - "nvidia.com/gpu": 0, }, }, } @@ -1641,12 +1635,10 @@ def get_aw_obj(group, version, namespace, plural): "limits": { "cpu": 1, "memory": "2G", - "nvidia.com/gpu": 0, }, "requests": { "cpu": 1, "memory": "2G", - "nvidia.com/gpu": 0, }, }, } @@ -1777,12 +1769,10 @@ def get_aw_obj(group, version, namespace, plural): "limits": { "cpu": 2, "memory": "8G", - "nvidia.com/gpu": 0, }, "requests": { "cpu": 2, "memory": "8G", - "nvidia.com/gpu": 0, }, }, } @@ -1836,12 +1826,10 @@ def get_aw_obj(group, version, namespace, plural): "limits": { "cpu": 1, "memory": "2G", - "nvidia.com/gpu": 0, }, "requests": { "cpu": 1, "memory": "2G", - "nvidia.com/gpu": 0, }, }, } @@ -2283,12 +2271,10 @@ def test_cluster_status(mocker): worker_mem_min=2, worker_mem_max=2, worker_cpu=1, - worker_gpu=0, namespace="ns", dashboard="fake-uri", head_cpus=2, head_mem=8, - head_gpu=0, ) cf = Cluster( ClusterConfiguration(