diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index b823cfd54..471174cd2 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -131,48 +131,7 @@ def create_app_wrapper(self): # Validate image configuration self.validate_image_config() - # Before attempting to create the cluster AW, let's evaluate the ClusterConfig - - name = self.config.name - namespace = self.config.namespace - head_cpus = self.config.head_cpus - head_memory = self.config.head_memory - num_head_gpus = self.config.num_head_gpus - worker_cpu_requests = self.config.worker_cpu_requests - worker_cpu_limits = self.config.worker_cpu_limits - worker_memory_requests = self.config.worker_memory_requests - worker_memory_limits = self.config.worker_memory_limits - num_worker_gpus = self.config.num_worker_gpus - workers = self.config.num_workers - template = self.config.template - image = self.config.image - appwrapper = self.config.appwrapper - env = self.config.envs - image_pull_secrets = self.config.image_pull_secrets - write_to_file = self.config.write_to_file - local_queue = self.config.local_queue - labels = self.config.labels - return generate_appwrapper( - name=name, - namespace=namespace, - head_cpus=head_cpus, - head_memory=head_memory, - num_head_gpus=num_head_gpus, - worker_cpu_requests=worker_cpu_requests, - worker_cpu_limits=worker_cpu_limits, - worker_memory_requests=worker_memory_requests, - worker_memory_limits=worker_memory_limits, - num_worker_gpus=num_worker_gpus, - workers=workers, - template=template, - image=image, - appwrapper=appwrapper, - env=env, - image_pull_secrets=image_pull_secrets, - write_to_file=write_to_file, - local_queue=local_queue, - labels=labels, - ) + return generate_appwrapper(self) # creates a new cluster with the provided or default spec def up(self): @@ -456,6 +415,29 @@ def job_logs(self, job_id: str) -> str: """ return self.job_client.get_job_logs(job_id) + @staticmethod + def _head_worker_extended_resources_from_rc_dict(rc: Dict) -> Tuple[dict, dict]: + head_extended_resources, worker_extended_resources = {}, {} + for resource in rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ + "containers" + ][0]["resources"]["limits"].keys(): + if resource in ["memory", "cpu"]: + continue + worker_extended_resources[resource] = rc["spec"]["workerGroupSpecs"][0][ + "template" + ]["spec"]["containers"][0]["resources"]["limits"][resource] + + for resource in rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][ + 0 + ]["resources"]["limits"].keys(): + if resource in ["memory", "cpu"]: + continue + head_extended_resources[resource] = rc["spec"]["headGroupSpec"]["template"][ + "spec" + ]["containers"][0]["resources"]["limits"][resource] + + return head_extended_resources, worker_extended_resources + def from_k8_cluster_object( rc, appwrapper=True, @@ -469,6 +451,11 @@ def from_k8_cluster_object( else [] ) + ( + head_extended_resources, + worker_extended_resources, + ) = Cluster._head_worker_extended_resources_from_rc_dict(rc) + cluster_config = ClusterConfiguration( name=rc["metadata"]["name"], namespace=rc["metadata"]["namespace"], @@ -486,11 +473,8 @@ def from_k8_cluster_object( worker_memory_limits=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ "containers" ][0]["resources"]["limits"]["memory"], - num_worker_gpus=int( - rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][ - "resources" - ]["limits"]["nvidia.com/gpu"] - ), + worker_extended_resource_requests=worker_extended_resources, + head_extended_resource_requests=head_extended_resources, image=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ 0 ]["image"], @@ -871,6 +855,11 @@ def _map_to_ray_cluster(rc) -> Optional[RayCluster]: protocol = "https" dashboard_url = f"{protocol}://{ingress.spec.rules[0].host}" + ( + head_extended_resources, + worker_extended_resources, + ) = Cluster._head_worker_extended_resources_from_rc_dict(rc) + return RayCluster( name=rc["metadata"]["name"], status=status, @@ -885,7 +874,7 @@ def _map_to_ray_cluster(rc) -> Optional[RayCluster]: worker_cpu=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ 0 ]["resources"]["limits"]["cpu"], - worker_gpu=0, # hard to detect currently how many gpus, can override it with what the user asked for + worker_extended_resources=worker_extended_resources, namespace=rc["metadata"]["namespace"], head_cpus=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][ "resources" @@ -893,9 +882,7 @@ def _map_to_ray_cluster(rc) -> Optional[RayCluster]: head_mem=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][ "resources" ]["limits"]["memory"], - head_gpu=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][ - "resources" - ]["limits"]["nvidia.com/gpu"], + head_extended_resources=head_extended_resources, dashboard=dashboard_url, ) @@ -920,12 +907,12 @@ def _copy_to_ray(cluster: Cluster) -> RayCluster: worker_mem_min=cluster.config.worker_memory_requests, worker_mem_max=cluster.config.worker_memory_limits, worker_cpu=cluster.config.worker_cpu_requests, - worker_gpu=cluster.config.num_worker_gpus, + worker_extended_resources=cluster.config.worker_extended_resource_requests, namespace=cluster.config.namespace, dashboard=cluster.cluster_dashboard_uri(), head_cpus=cluster.config.head_cpus, head_mem=cluster.config.head_memory, - head_gpu=cluster.config.num_head_gpus, + head_extended_resources=cluster.config.head_extended_resource_requests, ) if ray.status == CodeFlareClusterStatus.READY: ray.status = RayClusterStatus.READY diff --git a/src/codeflare_sdk/cluster/config.py b/src/codeflare_sdk/cluster/config.py index cb8e3d3d0..5be8da4f2 100644 --- a/src/codeflare_sdk/cluster/config.py +++ b/src/codeflare_sdk/cluster/config.py @@ -25,12 +25,51 @@ dir = pathlib.Path(__file__).parent.parent.resolve() +# https://docs.ray.io/en/latest/ray-core/scheduling/accelerators.html +DEFAULT_RESOURCE_MAPPING = { + "nvidia.com/gpu": "GPU", + "intel.com/gpu": "GPU", + "amd.com/gpu": "GPU", + "aws.amazon.com/neuroncore": "neuron_cores", + "google.com/tpu": "TPU", + "habana.ai/gaudi": "HPU", + "huawei.com/Ascend910": "NPU", + "huawei.com/Ascend310": "NPU", +} + @dataclass class ClusterConfiguration: """ This dataclass is used to specify resource requirements and other details, and is passed in as an argument when creating a Cluster object. + + Attributes: + - name: The name of the cluster. + - namespace: The namespace in which the cluster should be created. + - head_info: A list of strings containing information about the head node. + - head_cpus: The number of CPUs to allocate to the head node. + - head_memory: The amount of memory to allocate to the head node. + - head_gpus: The number of GPUs to allocate to the head node. (Deprecated, use head_extended_resource_requests) + - head_extended_resource_requests: A dictionary of extended resource requests for the head node. ex: {"nvidia.com/gpu": 1} + - machine_types: A list of machine types to use for the cluster. + - min_cpus: The minimum number of CPUs to allocate to each worker. + - max_cpus: The maximum number of CPUs to allocate to each worker. + - num_workers: The number of workers to create. + - min_memory: The minimum amount of memory to allocate to each worker. + - max_memory: The maximum amount of memory to allocate to each worker. + - num_gpus: The number of GPUs to allocate to each worker. (Deprecated, use worker_extended_resource_requests) + - template: The path to the template file to use for the cluster. + - appwrapper: A boolean indicating whether to use an AppWrapper. + - envs: A dictionary of environment variables to set for the cluster. + - image: The image to use for the cluster. + - image_pull_secrets: A list of image pull secrets to use for the cluster. + - write_to_file: A boolean indicating whether to write the cluster configuration to a file. + - verify_tls: A boolean indicating whether to verify TLS when connecting to the cluster. + - labels: A dictionary of labels to apply to the cluster. + - worker_extended_resource_requests: A dictionary of extended resource requests for each worker. ex: {"nvidia.com/gpu": 1} + - extended_resource_mapping: A dictionary of custom resource mappings to map extended resource requests to RayCluster resource names + - overwrite_default_resource_mapping: A boolean indicating whether to overwrite the default resource mapping. """ name: str @@ -39,7 +78,7 @@ class ClusterConfiguration: head_cpus: typing.Union[int, str] = 2 head_memory: typing.Union[int, str] = 8 head_gpus: int = None # Deprecating - num_head_gpus: int = 0 + head_extended_resource_requests: typing.Dict[str, int] = field(default_factory=dict) machine_types: list = field(default_factory=list) # ["m4.xlarge", "g4dn.xlarge"] worker_cpu_requests: typing.Union[int, str] = 1 worker_cpu_limits: typing.Union[int, str] = 1 @@ -50,7 +89,6 @@ class ClusterConfiguration: worker_memory_limits: typing.Union[int, str] = 2 min_memory: typing.Union[int, str] = None # Deprecating max_memory: typing.Union[int, str] = None # Deprecating - num_worker_gpus: int = 0 num_gpus: int = None # Deprecating template: str = f"{dir}/templates/base-template.yaml" appwrapper: bool = False @@ -60,6 +98,11 @@ class ClusterConfiguration: write_to_file: bool = False verify_tls: bool = True labels: dict = field(default_factory=dict) + worker_extended_resource_requests: typing.Dict[str, int] = field( + default_factory=dict + ) + extended_resource_mapping: typing.Dict[str, str] = field(default_factory=dict) + overwrite_default_resource_mapping: bool = False def __post_init__(self): if not self.verify_tls: @@ -69,9 +112,64 @@ def __post_init__(self): self._memory_to_string() self._str_mem_no_unit_add_GB() + self._old_gpu_arg_conversion() self._memory_to_resource() - self._gpu_to_resource() self._cpu_to_resource() + self._gpu_to_resource() + self._combine_extended_resource_mapping() + self._validate_extended_resource_requests() + + def _combine_extended_resource_mapping(self): + if overwritten := set(self.extended_resource_mapping.keys()).intersection( + DEFAULT_RESOURCE_MAPPING.keys() + ): + if self.overwrite_default_resource_mapping: + warnings.warn( + f"Overwriting default resource mapping for {overwritten}", + UserWarning, + ) + else: + raise ValueError( + f"Resource mapping already exists for {overwritten}, set overwrite_default_resource_mapping to True to overwrite" + ) + self.extended_resource_mapping = { + **DEFAULT_RESOURCE_MAPPING, + **self.extended_resource_mapping, + } + + def _validate_extended_resource_requests(self): + for k in self.head_extended_resource_requests.keys(): + if k not in self.extended_resource_mapping.keys(): + raise ValueError( + f"Head extended resource {k} not found in extended_resource_mapping" + ) + for k in self.worker_extended_resource_requests.keys(): + if k not in self.extended_resource_mapping.keys(): + raise ValueError( + f"Worker extended resource {k} not found in extended_resource_mapping" + ) + + def _gpu_to_resource(self): + if self.head_gpus: + warnings.warn( + "head_gpus is being deprecated, use head_extended_resource_requests i.e. head_extended_resource_requests = {'nvidia.com/gpu': 1}" + ) + if "nvidia.com/gpu" in self.head_extended_resource_requests: + raise ValueError( + "nvidia.com/gpu already exists in head_extended_resource_requests" + ) + self.head_extended_resource_requests["nvidia.com/gpu"] = self.num_head_gpus + if self.num_gpus: + warnings.warn( + "num_gpus is being deprecated, use worker_extended_resource_requests instead i.e. worker_extended_resource_requests = {'nvidia.com/gpu': 1}" + ) + if "nvidia.com/gpu" in self.worker_extended_resource_requests: + raise ValueError( + "nvidia.com/gpu already exists in worker_extended_resource_requests" + ) + self.worker_extended_resource_requests[ + "nvidia.com/gpu" + ] = self.num_worker_gpus def _str_mem_no_unit_add_GB(self): if isinstance(self.head_memory, str) and self.head_memory.isdecimal(): @@ -95,7 +193,7 @@ def _memory_to_string(self): if isinstance(self.worker_memory_limits, int): self.worker_memory_limits = f"{self.worker_memory_limits}G" - def _gpu_to_resource(self): + def _old_gpu_arg_conversion(self): if self.head_gpus: warnings.warn("head_gpus is being deprecated, use num_head_gpus") self.num_head_gpus = self.head_gpus diff --git a/src/codeflare_sdk/cluster/model.py b/src/codeflare_sdk/cluster/model.py index 2547de254..5d6e2ed2a 100644 --- a/src/codeflare_sdk/cluster/model.py +++ b/src/codeflare_sdk/cluster/model.py @@ -18,8 +18,9 @@ dataclasses to store information for Ray clusters and AppWrappers. """ -from dataclasses import dataclass +from dataclasses import dataclass, field from enum import Enum +import typing class RayClusterStatus(Enum): @@ -74,14 +75,14 @@ class RayCluster: status: RayClusterStatus head_cpus: int head_mem: str - head_gpu: int workers: int worker_mem_min: str worker_mem_max: str worker_cpu: int - worker_gpu: int namespace: str dashboard: str + worker_extended_resources: typing.Dict[str, int] = field(default_factory=dict) + head_extended_resources: typing.Dict[str, int] = field(default_factory=dict) @dataclass diff --git a/src/codeflare_sdk/templates/base-template.yaml b/src/codeflare_sdk/templates/base-template.yaml index 7b36146a7..076bd2623 100644 --- a/src/codeflare_sdk/templates/base-template.yaml +++ b/src/codeflare_sdk/templates/base-template.yaml @@ -86,11 +86,9 @@ spec: limits: cpu: 2 memory: "8G" - nvidia.com/gpu: 0 requests: cpu: 2 memory: "8G" - nvidia.com/gpu: 0 volumeMounts: - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt name: odh-trusted-ca-cert @@ -163,11 +161,9 @@ spec: limits: cpu: "2" memory: "12G" - nvidia.com/gpu: "1" requests: cpu: "2" memory: "12G" - nvidia.com/gpu: "1" volumeMounts: - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt name: odh-trusted-ca-cert diff --git a/src/codeflare_sdk/utils/generate_yaml.py b/src/codeflare_sdk/utils/generate_yaml.py index 3e692480e..de292658c 100755 --- a/src/codeflare_sdk/utils/generate_yaml.py +++ b/src/codeflare_sdk/utils/generate_yaml.py @@ -17,6 +17,7 @@ (in the cluster sub-module) for AppWrapper generation. """ +import json from typing import Optional import typing import yaml @@ -31,6 +32,7 @@ from base64 import b64encode from urllib3.util import parse_url from kubernetes.client.exceptions import ApiException +import codeflare_sdk def read_template(template): @@ -78,10 +80,13 @@ def is_kind_cluster(): return False -def update_names(cluster_yaml, cluster_name, namespace): - meta = cluster_yaml.get("metadata") - meta["name"] = cluster_name - meta["namespace"] = namespace +def update_names( + cluster_yaml: dict, + cluster: "codeflare_sdk.cluster.Cluster", +): + metadata = cluster_yaml.get("metadata") + metadata["name"] = cluster.config.name + metadata["namespace"] = cluster.config.namespace def update_image(spec, image): @@ -113,7 +118,7 @@ def update_resources( worker_cpu_limits, worker_memory_requests, worker_memory_limits, - num_worker_gpus, + custom_resources, ): container = spec.get("containers") for resource in container: @@ -121,59 +126,103 @@ def update_resources( if requests is not None: requests["cpu"] = worker_cpu_requests requests["memory"] = worker_memory_requests - requests["nvidia.com/gpu"] = num_worker_gpus limits = resource.get("resources").get("limits") if limits is not None: limits["cpu"] = worker_cpu_limits limits["memory"] = worker_memory_limits - limits["nvidia.com/gpu"] = num_worker_gpus + for k in custom_resources.keys(): + limits[k] = custom_resources[k] + requests[k] = custom_resources[k] + + +def head_worker_gpu_count_from_cluster( + cluster: "codeflare_sdk.cluster.Cluster", +) -> typing.Tuple[int, int]: + head_gpus = 0 + worker_gpus = 0 + for k in cluster.config.head_extended_resource_requests.keys(): + resource_type = cluster.config.extended_resource_mapping[k] + if resource_type == "GPU": + head_gpus += int(cluster.config.head_extended_resource_requests[k]) + for k in cluster.config.worker_extended_resource_requests.keys(): + resource_type = cluster.config.extended_resource_mapping[k] + if resource_type == "GPU": + worker_gpus += int(cluster.config.worker_extended_resource_requests[k]) + + return head_gpus, worker_gpus + + +FORBIDDEN_CUSTOM_RESOURCE_TYPES = ["GPU", "CPU", "memory"] + + +def head_worker_resources_from_cluster( + cluster: "codeflare_sdk.cluster.Cluster", +) -> typing.Tuple[dict, dict]: + to_return = {}, {} + for k in cluster.config.head_extended_resource_requests.keys(): + resource_type = cluster.config.extended_resource_mapping[k] + if resource_type in FORBIDDEN_CUSTOM_RESOURCE_TYPES: + continue + to_return[0][resource_type] = cluster.config.head_extended_resource_requests[ + k + ] + to_return[0].get(resource_type, 0) + + for k in cluster.config.worker_extended_resource_requests.keys(): + resource_type = cluster.config.extended_resource_mapping[k] + if resource_type in FORBIDDEN_CUSTOM_RESOURCE_TYPES: + continue + to_return[1][resource_type] = cluster.config.worker_extended_resource_requests[ + k + ] + to_return[1].get(resource_type, 0) + return to_return def update_nodes( - cluster_yaml, - appwrapper_name, - worker_cpu_requests, - worker_cpu_limits, - worker_memory_requests, - worker_memory_limits, - num_worker_gpus, - workers, - image, - env, - image_pull_secrets, - head_cpus, - head_memory, - num_head_gpus, + ray_cluster_dict: dict, + cluster: "codeflare_sdk.cluster.Cluster", ): - head = cluster_yaml.get("spec").get("headGroupSpec") - head["rayStartParams"]["num-gpus"] = str(int(num_head_gpus)) + head = ray_cluster_dict.get("spec").get("headGroupSpec") + worker = ray_cluster_dict.get("spec").get("workerGroupSpecs")[0] + head_gpus, worker_gpus = head_worker_gpu_count_from_cluster(cluster) + head_resources, worker_resources = head_worker_resources_from_cluster(cluster) + head_resources = json.dumps(head_resources).replace('"', '\\"') + head_resources = f'"{head_resources}"' + worker_resources = json.dumps(worker_resources).replace('"', '\\"') + worker_resources = f'"{worker_resources}"' + head["rayStartParams"]["num-gpus"] = str(head_gpus) + head["rayStartParams"]["resources"] = head_resources - worker = cluster_yaml.get("spec").get("workerGroupSpecs")[0] # Head counts as first worker - worker["replicas"] = workers - worker["minReplicas"] = workers - worker["maxReplicas"] = workers - worker["groupName"] = "small-group-" + appwrapper_name - worker["rayStartParams"]["num-gpus"] = str(int(num_worker_gpus)) + worker["replicas"] = cluster.config.num_workers + worker["minReplicas"] = cluster.config.num_workers + worker["maxReplicas"] = cluster.config.num_workers + worker["groupName"] = "small-group-" + cluster.config.name + worker["rayStartParams"]["num-gpus"] = str(worker_gpus) + worker["rayStartParams"]["resources"] = worker_resources for comp in [head, worker]: spec = comp.get("template").get("spec") - update_image_pull_secrets(spec, image_pull_secrets) - update_image(spec, image) - update_env(spec, env) + update_image_pull_secrets(spec, cluster.config.image_pull_secrets) + update_image(spec, cluster.config.image) + update_env(spec, cluster.config.envs) if comp == head: # TODO: Eventually add head node configuration outside of template update_resources( - spec, head_cpus, head_cpus, head_memory, head_memory, num_head_gpus + spec, + cluster.config.head_cpus, + cluster.config.head_cpus, + cluster.config.head_memory, + cluster.config.head_memory, + cluster.config.head_extended_resource_requests, ) else: update_resources( spec, - worker_cpu_requests, - worker_cpu_limits, - worker_memory_requests, - worker_memory_limits, - num_worker_gpus, + cluster.config.worker_cpu_requests, + cluster.config.worker_cpu_limits, + cluster.config.worker_memory_requests, + cluster.config.worker_memory_limits, + cluster.config.worker_extended_resource_requests, ) @@ -277,63 +326,30 @@ def write_user_yaml(user_yaml, output_file_name): print(f"Written to: {output_file_name}") -def generate_appwrapper( - name: str, - namespace: str, - head_cpus: int, - head_memory: int, - num_head_gpus: int, - worker_cpu_requests: int, - worker_cpu_limits: int, - worker_memory_requests: int, - worker_memory_limits: int, - num_worker_gpus: int, - workers: int, - template: str, - image: str, - appwrapper: bool, - env, - image_pull_secrets: list, - write_to_file: bool, - local_queue: Optional[str], - labels, -): - cluster_yaml = read_template(template) - appwrapper_name, cluster_name = gen_names(name) - update_names(cluster_yaml, cluster_name, namespace) - update_nodes( +def generate_appwrapper(cluster: "codeflare_sdk.cluster.Cluster"): + cluster_yaml = read_template(cluster.config.template) + appwrapper_name, _ = gen_names(cluster.config.name) + update_names( cluster_yaml, - appwrapper_name, - worker_cpu_requests, - worker_cpu_limits, - worker_memory_requests, - worker_memory_limits, - num_worker_gpus, - workers, - image, - env, - image_pull_secrets, - head_cpus, - head_memory, - num_head_gpus, + cluster, ) - augment_labels(cluster_yaml, labels) + update_nodes(cluster_yaml, cluster) + augment_labels(cluster_yaml, cluster.config.labels) notebook_annotations(cluster_yaml) - user_yaml = ( - wrap_cluster(cluster_yaml, appwrapper_name, namespace) - if appwrapper + wrap_cluster(cluster_yaml, appwrapper_name, cluster.config.namespace) + if cluster.config.appwrapper else cluster_yaml ) - add_queue_label(user_yaml, namespace, local_queue) + add_queue_label(user_yaml, cluster.config.namespace, cluster.config.local_queue) - if write_to_file: + if cluster.config.write_to_file: directory_path = os.path.expanduser("~/.codeflare/resources/") outfile = os.path.join(directory_path, appwrapper_name + ".yaml") write_user_yaml(user_yaml, outfile) return outfile else: user_yaml = yaml.dump(user_yaml) - print(f"Yaml resources loaded for {name}") + print(f"Yaml resources loaded for {cluster.config.name}") return user_yaml diff --git a/src/codeflare_sdk/utils/pretty_print.py b/src/codeflare_sdk/utils/pretty_print.py index 42ef8398b..9431ffd75 100644 --- a/src/codeflare_sdk/utils/pretty_print.py +++ b/src/codeflare_sdk/utils/pretty_print.py @@ -138,7 +138,7 @@ def print_clusters(clusters: List[RayCluster]): workers = str(cluster.workers) memory = f"{cluster.worker_mem_min}~{cluster.worker_mem_max}" cpu = str(cluster.worker_cpu) - gpu = str(cluster.worker_gpu) + gpu = str(cluster.worker_extended_resources.get("nvidia.com/gpu", 0)) #'table0' to display the cluster name, status, url, and dashboard link table0 = Table(box=None, show_header=False) diff --git a/tests/test-case-bad.yaml b/tests/test-case-bad.yaml index d4d230d48..cb2f4a0aa 100644 --- a/tests/test-case-bad.yaml +++ b/tests/test-case-bad.yaml @@ -33,6 +33,7 @@ spec: block: 'true' dashboard-host: 0.0.0.0 num-gpus: '0' + resources: '"{}"' serviceType: ClusterIP template: spec: @@ -63,11 +64,9 @@ spec: limits: cpu: 2 memory: 8G - nvidia.com/gpu: 0 requests: cpu: 2 memory: 8G - nvidia.com/gpu: 0 rayVersion: 2.23.0 workerGroupSpecs: - groupName: small-group-unit-test-cluster @@ -76,6 +75,7 @@ spec: rayStartParams: block: 'true' num-gpus: '7' + resources: '"{}"' replicas: 2 template: metadata: diff --git a/tests/test-case-no-kueue-no-aw.yaml b/tests/test-case-no-kueue-no-aw.yaml index dfca7951d..3ea7a22d1 100644 --- a/tests/test-case-no-kueue-no-aw.yaml +++ b/tests/test-case-no-kueue-no-aw.yaml @@ -26,6 +26,7 @@ spec: block: 'true' dashboard-host: 0.0.0.0 num-gpus: '0' + resources: '"{}"' serviceType: ClusterIP template: spec: @@ -51,11 +52,9 @@ spec: limits: cpu: 2 memory: 8G - nvidia.com/gpu: 0 requests: cpu: 2 memory: 8G - nvidia.com/gpu: 0 volumeMounts: - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt name: odh-trusted-ca-cert @@ -94,6 +93,7 @@ spec: rayStartParams: block: 'true' num-gpus: '7' + resources: '"{}"' replicas: 2 template: metadata: diff --git a/tests/test-case-no-mcad.yamls b/tests/test-case-no-mcad.yamls index 2d0e7e9bd..45a3dfb9f 100644 --- a/tests/test-case-no-mcad.yamls +++ b/tests/test-case-no-mcad.yamls @@ -29,6 +29,7 @@ spec: block: 'true' dashboard-host: 0.0.0.0 num-gpus: '0' + resources: '"{}"' serviceType: ClusterIP template: spec: @@ -54,11 +55,9 @@ spec: limits: cpu: 2 memory: 8G - nvidia.com/gpu: 0 requests: cpu: 2 memory: 8G - nvidia.com/gpu: 0 volumeMounts: - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt name: odh-trusted-ca-cert @@ -97,6 +96,7 @@ spec: rayStartParams: block: 'true' num-gpus: '7' + resources: '"{}"' replicas: 2 template: metadata: diff --git a/tests/test-case.yaml b/tests/test-case.yaml index 00b241afb..461ed7dfc 100644 --- a/tests/test-case.yaml +++ b/tests/test-case.yaml @@ -34,6 +34,7 @@ spec: block: 'true' dashboard-host: 0.0.0.0 num-gpus: '0' + resources: '"{}"' serviceType: ClusterIP template: spec: @@ -59,11 +60,9 @@ spec: limits: cpu: 2 memory: 8G - nvidia.com/gpu: 0 requests: cpu: 2 memory: 8G - nvidia.com/gpu: 0 volumeMounts: - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt name: odh-trusted-ca-cert @@ -102,6 +101,7 @@ spec: rayStartParams: block: 'true' num-gpus: '7' + resources: '"{}"' replicas: 2 template: metadata: diff --git a/tests/test-default-appwrapper.yaml b/tests/test-default-appwrapper.yaml index cc44e2340..cc27e37a1 100644 --- a/tests/test-default-appwrapper.yaml +++ b/tests/test-default-appwrapper.yaml @@ -34,9 +34,11 @@ spec: block: 'true' dashboard-host: 0.0.0.0 num-gpus: '0' + resources: '"{}"' serviceType: ClusterIP template: spec: + imagePullSecrets: [] containers: - image: quay.io/rhoai/ray:2.23.0-py39-cu121 imagePullPolicy: Always @@ -59,11 +61,9 @@ spec: limits: cpu: 2 memory: 8G - nvidia.com/gpu: 0 requests: cpu: 2 memory: 8G - nvidia.com/gpu: 0 volumeMounts: - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt name: odh-trusted-ca-cert @@ -77,7 +77,6 @@ spec: - mountPath: /etc/ssl/certs/odh-ca-bundle.crt name: odh-ca-cert subPath: odh-ca-bundle.crt - imagePullSecrets: [] volumes: - configMap: items: @@ -101,6 +100,7 @@ spec: rayStartParams: block: 'true' num-gpus: '0' + resources: '"{}"' replicas: 1 template: metadata: @@ -109,6 +109,7 @@ spec: labels: key: value spec: + imagePullSecrets: [] containers: - image: quay.io/rhoai/ray:2.23.0-py39-cu121 lifecycle: @@ -123,11 +124,9 @@ spec: limits: cpu: 1 memory: 2G - nvidia.com/gpu: 0 requests: cpu: 1 memory: 2G - nvidia.com/gpu: 0 volumeMounts: - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt name: odh-trusted-ca-cert @@ -141,7 +140,6 @@ spec: - mountPath: /etc/ssl/certs/odh-ca-bundle.crt name: odh-ca-cert subPath: odh-ca-bundle.crt - imagePullSecrets: [] volumes: - configMap: items: diff --git a/tests/unit_test.py b/tests/unit_test.py index ca6cb9584..03feb675c 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -896,12 +896,10 @@ def test_ray_details(mocker, capsys): worker_mem_min="2G", worker_mem_max="2G", worker_cpu=1, - worker_gpu=0, namespace="ns", dashboard="fake-uri", head_cpus=2, head_mem=8, - head_gpu=0, ) mocker.patch( "codeflare_sdk.cluster.cluster.Cluster.status", @@ -935,7 +933,7 @@ def test_ray_details(mocker, capsys): assert ray1.worker_mem_min == ray2.worker_mem_min assert ray1.worker_mem_max == ray2.worker_mem_max assert ray1.worker_cpu == ray2.worker_cpu - assert ray1.worker_gpu == ray2.worker_gpu + assert ray1.worker_extended_resources == ray2.worker_extended_resources try: print_clusters([ray1, ray2]) print_cluster_status(ray1) @@ -1142,12 +1140,10 @@ def get_ray_obj(group, version, namespace, plural, cls=None): "limits": { "cpu": 2, "memory": "8G", - "nvidia.com/gpu": 0, }, "requests": { "cpu": 2, "memory": "8G", - "nvidia.com/gpu": 0, }, }, "volumeMounts": [ @@ -1211,7 +1207,10 @@ def get_ray_obj(group, version, namespace, plural, cls=None): "groupName": "small-group-quicktest", "maxReplicas": 1, "minReplicas": 1, - "rayStartParams": {"block": "true", "num-gpus": "0"}, + "rayStartParams": { + "block": "true", + "num-gpus": "0", + }, "replicas": 1, "scaleStrategy": {}, "template": { @@ -1262,12 +1261,10 @@ def get_ray_obj(group, version, namespace, plural, cls=None): "limits": { "cpu": 1, "memory": "2G", - "nvidia.com/gpu": 0, }, "requests": { "cpu": 1, "memory": "2G", - "nvidia.com/gpu": 0, }, }, "volumeMounts": [ @@ -1426,12 +1423,10 @@ def get_ray_obj(group, version, namespace, plural, cls=None): "limits": { "cpu": 2, "memory": "8G", - "nvidia.com/gpu": 0, }, "requests": { "cpu": 2, "memory": "8G", - "nvidia.com/gpu": 0, }, }, } @@ -1445,7 +1440,10 @@ def get_ray_obj(group, version, namespace, plural, cls=None): "groupName": "small-group-quicktest2", "maxReplicas": 1, "minReplicas": 1, - "rayStartParams": {"block": "true", "num-gpus": "0"}, + "rayStartParams": { + "block": "true", + "num-gpus": "0", + }, "replicas": 1, "template": { "metadata": { @@ -1482,12 +1480,10 @@ def get_ray_obj(group, version, namespace, plural, cls=None): "limits": { "cpu": 1, "memory": "2G", - "nvidia.com/gpu": 0, }, "requests": { "cpu": 1, "memory": "2G", - "nvidia.com/gpu": 0, }, }, } @@ -1604,12 +1600,10 @@ def get_aw_obj(group, version, namespace, plural): "limits": { "cpu": 2, "memory": "8G", - "nvidia.com/gpu": 0, }, "requests": { "cpu": 2, "memory": "8G", - "nvidia.com/gpu": 0, }, }, } @@ -1663,12 +1657,10 @@ def get_aw_obj(group, version, namespace, plural): "limits": { "cpu": 1, "memory": "2G", - "nvidia.com/gpu": 0, }, "requests": { "cpu": 1, "memory": "2G", - "nvidia.com/gpu": 0, }, }, } @@ -1799,12 +1791,10 @@ def get_aw_obj(group, version, namespace, plural): "limits": { "cpu": 2, "memory": "8G", - "nvidia.com/gpu": 0, }, "requests": { "cpu": 2, "memory": "8G", - "nvidia.com/gpu": 0, }, }, } @@ -1858,12 +1848,10 @@ def get_aw_obj(group, version, namespace, plural): "limits": { "cpu": 1, "memory": "2G", - "nvidia.com/gpu": 0, }, "requests": { "cpu": 1, "memory": "2G", - "nvidia.com/gpu": 0, }, }, } @@ -2323,12 +2311,10 @@ def test_cluster_status(mocker): worker_mem_min=2, worker_mem_max=2, worker_cpu=1, - worker_gpu=0, namespace="ns", dashboard="fake-uri", head_cpus=2, head_mem=8, - head_gpu=0, ) cf = Cluster( ClusterConfiguration(