Skip to content

Commit

Permalink
simplify function calls and add option for custom resources
Browse files Browse the repository at this point in the history
Signed-off-by: Kevin <[email protected]>
  • Loading branch information
KPostOffice committed May 15, 2024
1 parent 179fd75 commit 2173f28
Show file tree
Hide file tree
Showing 12 changed files with 326 additions and 306 deletions.
101 changes: 40 additions & 61 deletions src/codeflare_sdk/cluster/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,56 +164,7 @@ def create_app_wrapper(self):
)
else:
priority_val = None

name = self.config.name
namespace = self.config.namespace
head_cpus = self.config.head_cpus
head_memory = self.config.head_memory
head_gpus = self.config.head_gpus
min_cpu = self.config.min_cpus
max_cpu = self.config.max_cpus
min_memory = self.config.min_memory
max_memory = self.config.max_memory
gpu = self.config.num_gpus
workers = self.config.num_workers
template = self.config.template
image = self.config.image
instascale = self.config.instascale
mcad = self.config.mcad
instance_types = self.config.machine_types
env = self.config.envs
image_pull_secrets = self.config.image_pull_secrets
dispatch_priority = self.config.dispatch_priority
write_to_file = self.config.write_to_file
verify_tls = self.config.verify_tls
local_queue = self.config.local_queue
labels = self.config.labels
return generate_appwrapper(
name=name,
namespace=namespace,
head_cpus=head_cpus,
head_memory=head_memory,
head_gpus=head_gpus,
min_cpu=min_cpu,
max_cpu=max_cpu,
min_memory=min_memory,
max_memory=max_memory,
gpu=gpu,
workers=workers,
template=template,
image=image,
instascale=instascale,
mcad=mcad,
instance_types=instance_types,
env=env,
image_pull_secrets=image_pull_secrets,
dispatch_priority=dispatch_priority,
priority_val=priority_val,
write_to_file=write_to_file,
verify_tls=verify_tls,
local_queue=local_queue,
labels=labels,
)
return generate_appwrapper(self)

# creates a new cluster with the provided or default spec
def up(self):
Expand Down Expand Up @@ -499,6 +450,29 @@ def job_logs(self, job_id: str) -> str:
"""
return self.job_client.get_job_logs(job_id)

@staticmethod
def _head_worker_resources_from_rc_dict(rc: Dict) -> Tuple[dict, dict]:
head_custom_resources, worker_custom_resources = {}, {}
for resource in rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
"containers"
][0]["resources"]["limits"].keys():
if resource in ["memory", "cpu"]:
continue
worker_custom_resources[resource] = rc["spec"]["workerGroupSpecs"][0][
"template"
]["spec"]["containers"][0]["resources"]["limits"][resource]

for resource in rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][
0
]["resources"]["limits"].keys():
if resource in ["memory", "cpu"]:
continue
head_custom_resources[resource] = rc["spec"]["headGroupSpec"][0][
"template"
]["spec"]["containers"][0]["resources"]["limits"][resource]

return head_custom_resources, worker_custom_resources

def from_k8_cluster_object(
rc,
mcad=True,
Expand All @@ -512,6 +486,11 @@ def from_k8_cluster_object(
else []
)

(
head_custom_resources,
worker_custom_resources,
) = Cluster._head_worker_resources_from_rc_dict(rc)

cluster_config = ClusterConfiguration(
name=rc["metadata"]["name"],
namespace=rc["metadata"]["namespace"],
Expand All @@ -529,11 +508,8 @@ def from_k8_cluster_object(
max_memory=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
"containers"
][0]["resources"]["limits"]["memory"],
num_gpus=int(
rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][
"resources"
]["limits"]["nvidia.com/gpu"]
),
worker_custom_resource_requests=worker_custom_resources,
head_custom_resource_requests=head_custom_resources,
instascale=True if machine_types else False,
image=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][
0
Expand Down Expand Up @@ -900,6 +876,11 @@ def _map_to_ray_cluster(rc) -> Optional[RayCluster]:
protocol = "https"
dashboard_url = f"{protocol}://{ingress.spec.rules[0].host}"

(
head_custom_resources,
worker_custom_resources,
) = Cluster._head_worker_resources_from_rc_dict(rc)

return RayCluster(
name=rc["metadata"]["name"],
status=status,
Expand All @@ -914,17 +895,15 @@ def _map_to_ray_cluster(rc) -> Optional[RayCluster]:
worker_cpu=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][
0
]["resources"]["limits"]["cpu"],
worker_gpu=0, # hard to detect currently how many gpus, can override it with what the user asked for
worker_custom_resources=worker_custom_resources,
namespace=rc["metadata"]["namespace"],
head_cpus=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
"resources"
]["limits"]["cpu"],
head_mem=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
"resources"
]["limits"]["memory"],
head_gpu=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
"resources"
]["limits"]["nvidia.com/gpu"],
head_custom_resources=head_custom_resources,
dashboard=dashboard_url,
)

Expand Down Expand Up @@ -953,12 +932,12 @@ def _copy_to_ray(cluster: Cluster) -> RayCluster:
worker_mem_min=cluster.config.min_memory,
worker_mem_max=cluster.config.max_memory,
worker_cpu=cluster.config.min_cpus,
worker_gpu=cluster.config.num_gpus,
worker_custom_resources=cluster.config.worker_custom_resource_requests,
namespace=cluster.config.namespace,
dashboard=cluster.cluster_dashboard_uri(),
head_cpus=cluster.config.head_cpus,
head_mem=cluster.config.head_memory,
head_gpu=cluster.config.head_gpus,
head_custom_resources=cluster.config.head_custom_resource_requests,
)
if ray.status == CodeFlareClusterStatus.READY:
ray.status = RayClusterStatus.READY
Expand Down
57 changes: 57 additions & 0 deletions src/codeflare_sdk/cluster/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,22 @@
from dataclasses import dataclass, field
import pathlib
import typing
import warnings

dir = pathlib.Path(__file__).parent.parent.resolve()

# https://docs.ray.io/en/latest/ray-core/scheduling/accelerators.html
DEFAULT_RESOURCE_MAPPING = {
"nvidia.com/gpu": "GPU",
"intel.com/gpu": "GPU",
"amd.com/gpu": "GPU",
"aws.amazon.com/neuroncore": "neuron_cores",
"google.com/tpu": "TPU",
"habana.ai/gaudi": "HPU",
"huawei.com/Ascend910": "NPU",
"huawei.com/Ascend310": "NPU",
}


@dataclass
class ClusterConfiguration:
Expand All @@ -38,6 +51,7 @@ class ClusterConfiguration:
head_cpus: typing.Union[int, str] = 2
head_memory: typing.Union[int, str] = 8
head_gpus: int = 0
head_custom_resource_requests: typing.Dict[str, int] = field(default_factory=dict)
machine_types: list = field(default_factory=list) # ["m4.xlarge", "g4dn.xlarge"]
min_cpus: typing.Union[int, str] = 1
max_cpus: typing.Union[int, str] = 1
Expand All @@ -55,6 +69,9 @@ class ClusterConfiguration:
write_to_file: bool = False
verify_tls: bool = True
labels: dict = field(default_factory=dict)
worker_custom_resource_requests: typing.Dict[str, int] = field(default_factory=dict)
custom_resource_mapping: typing.Dict[str, str] = field(default_factory=dict)
overwrite_default_resource_mapping: bool = False

def __post_init__(self):
if not self.verify_tls:
Expand All @@ -63,6 +80,46 @@ def __post_init__(self):
)
self._memory_to_string()
self._str_mem_no_unit_add_GB()
self._gpu_to_resource()
self._combine_custom_resource_mapping()

def _combine_custom_resource_mapping(self):
if overwritten := set(self.custom_resource_mapping.keys()).intersection(
DEFAULT_RESOURCE_MAPPING.keys()
):
if self.overwrite_default_resource_mapping:
warnings.warn(
f"Overwriting default resource mapping for {overwritten}",
UserWarning,
)
else:
raise ValueError(
f"Resource mapping already exists for {overwritten}, set overwrite_default_resource_mapping to True to overwrite"
)
self.custom_resource_mapping = {
**DEFAULT_RESOURCE_MAPPING,
**self.custom_resource_mapping,
}

def _gpu_to_resource(self):
if self.head_gpus:
warnings.warn(
"head_gpus is being deprecated, use head_custom_resource_requests"
)
if "nvidia.com/gpu" in self.head_custom_resource_requests:
raise ValueError(
"nvidia.com/gpu already exists in head_custom_resource_requests"
)
self.head_custom_resource_requests["nvidia.com/gpu"] = self.head_gpus
if self.num_gpus:
warnings.warn(
"num_gpus is being deprecated, use worker_custom_resource_requests"
)
if "nvidia.com/gpu" in self.worker_custom_resource_requests:
raise ValueError(
"nvidia.com/gpu already exists in worker_custom_resource_requests"
)
self.worker_custom_resource_requests["nvidia.com/gpu"] = self.num_gpus

def _str_mem_no_unit_add_GB(self):
if isinstance(self.head_memory, str) and self.head_memory.isdecimal():
Expand Down
7 changes: 4 additions & 3 deletions src/codeflare_sdk/cluster/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,9 @@
dataclasses to store information for Ray clusters and AppWrappers.
"""

from dataclasses import dataclass
from dataclasses import dataclass, field
from enum import Enum
import typing


class RayClusterStatus(Enum):
Expand Down Expand Up @@ -73,14 +74,14 @@ class RayCluster:
status: RayClusterStatus
head_cpus: int
head_mem: str
head_gpu: int
workers: int
worker_mem_min: str
worker_mem_max: str
worker_cpu: int
worker_gpu: int
namespace: str
dashboard: str
worker_custom_resources: typing.Dict[str, int] = field(default_factory=dict)
head_custom_resources: typing.Dict[str, int] = field(default_factory=dict)


@dataclass
Expand Down
8 changes: 0 additions & 8 deletions src/codeflare_sdk/templates/base-template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,20 +18,16 @@ spec:
requests:
cpu: 2
memory: 8G
nvidia.com/gpu: 0
limits:
cpu: 2
memory: 8G
nvidia.com/gpu: 0
- replicas: 3
requests:
cpu: 2
memory: 12G
nvidia.com/gpu: 1
limits:
cpu: 2
memory: 12G
nvidia.com/gpu: 1
generictemplate:
# This config demonstrates KubeRay's Ray autoscaler integration.
# The resource requests and limits in this config are too small for production!
Expand Down Expand Up @@ -133,11 +129,9 @@ spec:
limits:
cpu: 2
memory: "8G"
nvidia.com/gpu: 0
requests:
cpu: 2
memory: "8G"
nvidia.com/gpu: 0
volumeMounts:
- mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt
name: odh-trusted-ca-cert
Expand Down Expand Up @@ -219,11 +213,9 @@ spec:
limits:
cpu: "2"
memory: "12G"
nvidia.com/gpu: "1"
requests:
cpu: "2"
memory: "12G"
nvidia.com/gpu: "1"
volumeMounts:
- mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt
name: odh-trusted-ca-cert
Expand Down
Loading

0 comments on commit 2173f28

Please sign in to comment.