Skip to content

Commit

Permalink
simplify function calls and add option for custom resources
Browse files Browse the repository at this point in the history
Signed-off-by: Kevin <[email protected]>
  • Loading branch information
KPostOffice committed Jun 21, 2024
1 parent a03c72f commit 0ced913
Show file tree
Hide file tree
Showing 11 changed files with 235 additions and 177 deletions.
93 changes: 40 additions & 53 deletions src/codeflare_sdk/cluster/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,48 +131,7 @@ def create_app_wrapper(self):
# Validate image configuration
self.validate_image_config()

# Before attempting to create the cluster AW, let's evaluate the ClusterConfig

name = self.config.name
namespace = self.config.namespace
head_cpus = self.config.head_cpus
head_memory = self.config.head_memory
head_gpus = self.config.head_gpus
min_cpu = self.config.min_cpus
max_cpu = self.config.max_cpus
min_memory = self.config.min_memory
max_memory = self.config.max_memory
gpu = self.config.num_gpus
workers = self.config.num_workers
template = self.config.template
image = self.config.image
appwrapper = self.config.appwrapper
env = self.config.envs
image_pull_secrets = self.config.image_pull_secrets
write_to_file = self.config.write_to_file
local_queue = self.config.local_queue
labels = self.config.labels
return generate_appwrapper(
name=name,
namespace=namespace,
head_cpus=head_cpus,
head_memory=head_memory,
head_gpus=head_gpus,
min_cpu=min_cpu,
max_cpu=max_cpu,
min_memory=min_memory,
max_memory=max_memory,
gpu=gpu,
workers=workers,
template=template,
image=image,
appwrapper=appwrapper,
env=env,
image_pull_secrets=image_pull_secrets,
write_to_file=write_to_file,
local_queue=local_queue,
labels=labels,
)
return generate_appwrapper(self)

# creates a new cluster with the provided or default spec
def up(self):
Expand Down Expand Up @@ -456,6 +415,29 @@ def job_logs(self, job_id: str) -> str:
"""
return self.job_client.get_job_logs(job_id)

@staticmethod
def _head_worker_extended_resources_from_rc_dict(rc: Dict) -> Tuple[dict, dict]:
head_extended_resources, worker_extended_resources = {}, {}
for resource in rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
"containers"
][0]["resources"]["limits"].keys():
if resource in ["memory", "cpu"]:
continue
worker_extended_resources[resource] = rc["spec"]["workerGroupSpecs"][0][
"template"
]["spec"]["containers"][0]["resources"]["limits"][resource]

for resource in rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][
0
]["resources"]["limits"].keys():
if resource in ["memory", "cpu"]:
continue
head_extended_resources[resource] = rc["spec"]["headGroupSpec"]["template"][
"spec"
]["containers"][0]["resources"]["limits"][resource]

return head_extended_resources, worker_extended_resources

def from_k8_cluster_object(
rc,
appwrapper=True,
Expand All @@ -469,6 +451,11 @@ def from_k8_cluster_object(
else []
)

(
head_extended_resources,
worker_extended_resources,
) = Cluster._head_worker_extended_resources_from_rc_dict(rc)

cluster_config = ClusterConfiguration(
name=rc["metadata"]["name"],
namespace=rc["metadata"]["namespace"],
Expand All @@ -486,11 +473,8 @@ def from_k8_cluster_object(
max_memory=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
"containers"
][0]["resources"]["limits"]["memory"],
num_gpus=int(
rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][
"resources"
]["limits"]["nvidia.com/gpu"]
),
worker_extended_resource_requests=worker_extended_resources,
head_extended_resource_requests=head_extended_resources,
image=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][
0
]["image"],
Expand Down Expand Up @@ -871,6 +855,11 @@ def _map_to_ray_cluster(rc) -> Optional[RayCluster]:
protocol = "https"
dashboard_url = f"{protocol}://{ingress.spec.rules[0].host}"

(
head_extended_resources,
worker_extended_resources,
) = Cluster._head_worker_extended_resources_from_rc_dict(rc)

return RayCluster(
name=rc["metadata"]["name"],
status=status,
Expand All @@ -885,17 +874,15 @@ def _map_to_ray_cluster(rc) -> Optional[RayCluster]:
worker_cpu=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][
0
]["resources"]["limits"]["cpu"],
worker_gpu=0, # hard to detect currently how many gpus, can override it with what the user asked for
worker_extended_resources=worker_extended_resources,
namespace=rc["metadata"]["namespace"],
head_cpus=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
"resources"
]["limits"]["cpu"],
head_mem=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
"resources"
]["limits"]["memory"],
head_gpu=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
"resources"
]["limits"]["nvidia.com/gpu"],
head_extended_resources=head_extended_resources,
dashboard=dashboard_url,
)

Expand All @@ -920,12 +907,12 @@ def _copy_to_ray(cluster: Cluster) -> RayCluster:
worker_mem_min=cluster.config.min_memory,
worker_mem_max=cluster.config.max_memory,
worker_cpu=cluster.config.min_cpus,
worker_gpu=cluster.config.num_gpus,
worker_extended_resources=cluster.config.worker_extended_resource_requests,
namespace=cluster.config.namespace,
dashboard=cluster.cluster_dashboard_uri(),
head_cpus=cluster.config.head_cpus,
head_mem=cluster.config.head_memory,
head_gpu=cluster.config.head_gpus,
head_extended_resources=cluster.config.head_extended_resource_requests,
)
if ray.status == CodeFlareClusterStatus.READY:
ray.status = RayClusterStatus.READY
Expand Down
59 changes: 59 additions & 0 deletions src/codeflare_sdk/cluster/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,22 @@
from dataclasses import dataclass, field
import pathlib
import typing
import warnings

dir = pathlib.Path(__file__).parent.parent.resolve()

# https://docs.ray.io/en/latest/ray-core/scheduling/accelerators.html
DEFAULT_RESOURCE_MAPPING = {
"nvidia.com/gpu": "GPU",
"intel.com/gpu": "GPU",
"amd.com/gpu": "GPU",
"aws.amazon.com/neuroncore": "neuron_cores",
"google.com/tpu": "TPU",
"habana.ai/gaudi": "HPU",
"huawei.com/Ascend910": "NPU",
"huawei.com/Ascend310": "NPU",
}


@dataclass
class ClusterConfiguration:
Expand All @@ -38,6 +51,7 @@ class ClusterConfiguration:
head_cpus: typing.Union[int, str] = 2
head_memory: typing.Union[int, str] = 8
head_gpus: int = 0
head_extended_resource_requests: typing.Dict[str, int] = field(default_factory=dict)
machine_types: list = field(default_factory=list) # ["m4.xlarge", "g4dn.xlarge"]
min_cpus: typing.Union[int, str] = 1
max_cpus: typing.Union[int, str] = 1
Expand All @@ -53,6 +67,11 @@ class ClusterConfiguration:
write_to_file: bool = False
verify_tls: bool = True
labels: dict = field(default_factory=dict)
worker_extended_resource_requests: typing.Dict[str, int] = field(
default_factory=dict
)
custom_resource_mapping: typing.Dict[str, str] = field(default_factory=dict)
overwrite_default_resource_mapping: bool = False

def __post_init__(self):
if not self.verify_tls:
Expand All @@ -61,6 +80,46 @@ def __post_init__(self):
)
self._memory_to_string()
self._str_mem_no_unit_add_GB()
self._gpu_to_resource()
self._combine_custom_resource_mapping()

def _combine_custom_resource_mapping(self):
if overwritten := set(self.custom_resource_mapping.keys()).intersection(
DEFAULT_RESOURCE_MAPPING.keys()
):
if self.overwrite_default_resource_mapping:
warnings.warn(
f"Overwriting default resource mapping for {overwritten}",
UserWarning,
)
else:
raise ValueError(
f"Resource mapping already exists for {overwritten}, set overwrite_default_resource_mapping to True to overwrite"
)
self.custom_resource_mapping = {
**DEFAULT_RESOURCE_MAPPING,
**self.custom_resource_mapping,
}

def _gpu_to_resource(self):
if self.head_gpus:
warnings.warn(
'head_gpus is being deprecated, use head_custom_resource_requests={"nvidia.com/gpu": <num_gpus>}'
)
if "nvidia.com/gpu" in self.head_extended_resource_requests:
raise ValueError(
"nvidia.com/gpu already exists in head_custom_resource_requests"
)
self.head_extended_resource_requests["nvidia.com/gpu"] = self.head_gpus
if self.num_gpus:
warnings.warn(
'num_gpus is being deprecated, use worker_custom_resource_requests={"nvidia.com/gpu": <num_gpus>}'
)
if "nvidia.com/gpu" in self.worker_extended_resource_requests:
raise ValueError(
"nvidia.com/gpu already exists in worker_custom_resource_requests"
)
self.worker_extended_resource_requests["nvidia.com/gpu"] = self.num_gpus

def _str_mem_no_unit_add_GB(self):
if isinstance(self.head_memory, str) and self.head_memory.isdecimal():
Expand Down
7 changes: 4 additions & 3 deletions src/codeflare_sdk/cluster/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,9 @@
dataclasses to store information for Ray clusters and AppWrappers.
"""

from dataclasses import dataclass
from dataclasses import dataclass, field
from enum import Enum
import typing


class RayClusterStatus(Enum):
Expand Down Expand Up @@ -74,14 +75,14 @@ class RayCluster:
status: RayClusterStatus
head_cpus: int
head_mem: str
head_gpu: int
workers: int
worker_mem_min: str
worker_mem_max: str
worker_cpu: int
worker_gpu: int
namespace: str
dashboard: str
worker_extended_resources: typing.Dict[str, int] = field(default_factory=dict)
head_extended_resources: typing.Dict[str, int] = field(default_factory=dict)


@dataclass
Expand Down
4 changes: 0 additions & 4 deletions src/codeflare_sdk/templates/base-template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -86,11 +86,9 @@ spec:
limits:
cpu: 2
memory: "8G"
nvidia.com/gpu: 0
requests:
cpu: 2
memory: "8G"
nvidia.com/gpu: 0
volumeMounts:
- mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt
name: odh-trusted-ca-cert
Expand Down Expand Up @@ -163,11 +161,9 @@ spec:
limits:
cpu: "2"
memory: "12G"
nvidia.com/gpu: "1"
requests:
cpu: "2"
memory: "12G"
nvidia.com/gpu: "1"
volumeMounts:
- mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt
name: odh-trusted-ca-cert
Expand Down
Loading

0 comments on commit 0ced913

Please sign in to comment.