Skip to content

Commit

Permalink
Split head memory and cpu requests/limits
Browse files (browse the repository at this point in the history)
  • Loading branch information
Bobbins228 committed Jul 2, 2024
1 parent a36ebdb commit 96e3a8e
Show file tree
Hide file tree
Showing 11 changed files with 119 additions and 54 deletions.
36 changes: 24 additions & 12 deletions src/codeflare_sdk/cluster/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,8 +135,10 @@ def create_app_wrapper(self):

name = self.config.name
namespace = self.config.namespace
head_cpus = self.config.head_cpus
head_memory = self.config.head_memory
head_cpu_requests = self.config.head_cpu_requests
head_cpu_limits = self.config.head_cpu_limits
head_memory_requests = self.config.head_memory_requests
head_memory_limits = self.config.head_memory_limits
num_head_gpus = self.config.num_head_gpus
worker_cpu_requests = self.config.worker_cpu_requests
worker_cpu_limits = self.config.worker_cpu_limits
Expand All @@ -155,8 +157,10 @@ def create_app_wrapper(self):
return generate_appwrapper(
name=name,
namespace=namespace,
head_cpus=head_cpus,
head_memory=head_memory,
head_cpu_requests=head_cpu_requests,
head_cpu_limits=head_cpu_limits,
head_memory_requests=head_memory_requests,
head_memory_limits=head_memory_limits,
num_head_gpus=num_head_gpus,
worker_cpu_requests=worker_cpu_requests,
worker_cpu_limits=worker_cpu_limits,
Expand Down Expand Up @@ -887,12 +891,18 @@ def _map_to_ray_cluster(rc) -> Optional[RayCluster]:
]["resources"]["limits"]["cpu"],
worker_gpu=0, # hard to detect currently how many gpus, can override it with what the user asked for
namespace=rc["metadata"]["namespace"],
head_cpus=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
"resources"
]["limits"]["cpu"],
head_mem=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
"resources"
]["limits"]["memory"],
head_cpu_requests=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][
0
]["resources"]["requests"]["cpu"],
head_cpu_limits=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][
0
]["resources"]["limits"]["cpu"],
head_mem_requests=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][
0
]["resources"]["requests"]["memory"],
head_mem_limits=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][
0
]["resources"]["limits"]["memory"],
head_gpu=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
"resources"
]["limits"]["nvidia.com/gpu"],
Expand Down Expand Up @@ -923,8 +933,10 @@ def _copy_to_ray(cluster: Cluster) -> RayCluster:
worker_gpu=cluster.config.num_worker_gpus,
namespace=cluster.config.namespace,
dashboard=cluster.cluster_dashboard_uri(),
head_cpus=cluster.config.head_cpus,
head_mem=cluster.config.head_memory,
head_mem_requests=cluster.config.head_memory_requests,
head_mem_limits=cluster.config.head_memory_limits,
head_cpu_requests=cluster.config.head_cpu_requests,
head_cpu_limits=cluster.config.head_cpu_limits,
head_gpu=cluster.config.num_head_gpus,
)
if ray.status == CodeFlareClusterStatus.READY:
Expand Down
36 changes: 30 additions & 6 deletions src/codeflare_sdk/cluster/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,12 @@ class ClusterConfiguration:
name: str
namespace: str = None
head_info: list = field(default_factory=list)
head_cpus: typing.Union[int, str] = 2
head_memory: typing.Union[int, str] = 8
head_cpu_requests: typing.Union[int, str] = 2
head_cpu_limits: typing.Union[int, str] = 2
head_cpus: typing.Union[int, str] = None # Deprecating
head_memory_requests: typing.Union[int, str] = 8
head_memory_limits: typing.Union[int, str] = 8
head_memory: typing.Union[int, str] = None # Deprecating
head_gpus: int = None # Deprecating
num_head_gpus: int = 0
machine_types: list = field(default_factory=list) # ["m4.xlarge", "g4dn.xlarge"]
Expand Down Expand Up @@ -74,8 +78,16 @@ def __post_init__(self):
self._cpu_to_resource()

def _str_mem_no_unit_add_GB(self):
if isinstance(self.head_memory, str) and self.head_memory.isdecimal():
self.head_memory = f"{self.head_memory}G"
if (
isinstance(self.head_memory_requests, str)
and self.head_memory_requests.isdecimal()
):
self.head_memory_requests = f"{self.head_memory_requests}G"
if (
isinstance(self.head_memory_limits, str)
and self.head_memory_limits.isdecimal()
):
self.head_memory_limits = f"{self.head_memory_limits}G"
if (
isinstance(self.worker_memory_requests, str)
and self.worker_memory_requests.isdecimal()
Expand All @@ -88,8 +100,10 @@ def _str_mem_no_unit_add_GB(self):
self.worker_memory_limits = f"{self.worker_memory_limits}G"

def _memory_to_string(self):
if isinstance(self.head_memory, int):
self.head_memory = f"{self.head_memory}G"
if isinstance(self.head_memory_requests, int):
self.head_memory_requests = f"{self.head_memory_requests}G"
if isinstance(self.head_memory_limits, int):
self.head_memory_limits = f"{self.head_memory_limits}G"
if isinstance(self.worker_memory_requests, int):
self.worker_memory_requests = f"{self.worker_memory_requests}G"
if isinstance(self.worker_memory_limits, int):
Expand All @@ -104,6 +118,11 @@ def _gpu_to_resource(self):
self.num_worker_gpus = self.num_gpus

def _cpu_to_resource(self):
if self.head_cpus:
warnings.warn(
"head_cpus is being deprecated, use head_cpu_requests and head_cpu_limits"
)
self.head_cpu_requests = self.head_cpu_limits = self.head_cpus
if self.min_cpus:
warnings.warn("min_cpus is being deprecated, use worker_cpu_requests")
self.worker_cpu_requests = self.min_cpus
Expand All @@ -112,6 +131,11 @@ def _cpu_to_resource(self):
self.worker_cpu_limits = self.max_cpus

def _memory_to_resource(self):
if self.head_memory:
warnings.warn(
"head_memory is being deprecated, use head_memory_requests and head_memory_limits"
)
self.head_memory_requests = self.head_memory_limits = self.head_memory
if self.min_memory:
warnings.warn("min_memory is being deprecated, use worker_memory_requests")
self.worker_memory_requests = f"{self.min_memory}G"
Expand Down
6 changes: 4 additions & 2 deletions src/codeflare_sdk/cluster/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,10 @@ class RayCluster:

name: str
status: RayClusterStatus
head_cpus: int
head_mem: str
head_cpu_requests: int
head_cpu_limits: int
head_mem_requests: str
head_mem_limits: str
head_gpu: int
workers: int
worker_mem_min: str
Expand Down
47 changes: 29 additions & 18 deletions src/codeflare_sdk/utils/generate_yaml.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,24 +108,24 @@ def update_env(spec, env):

def update_resources(
spec,
worker_cpu_requests,
worker_cpu_limits,
worker_memory_requests,
worker_memory_limits,
num_worker_gpus,
cpu_requests,
cpu_limits,
memory_requests,
memory_limits,
num_gpus,
):
container = spec.get("containers")
for resource in container:
requests = resource.get("resources").get("requests")
if requests is not None:
requests["cpu"] = worker_cpu_requests
requests["memory"] = worker_memory_requests
requests["nvidia.com/gpu"] = num_worker_gpus
requests["cpu"] = cpu_requests
requests["memory"] = memory_requests
requests["nvidia.com/gpu"] = num_gpus
limits = resource.get("resources").get("limits")
if limits is not None:
limits["cpu"] = worker_cpu_limits
limits["memory"] = worker_memory_limits
limits["nvidia.com/gpu"] = num_worker_gpus
limits["cpu"] = cpu_limits
limits["memory"] = memory_limits
limits["nvidia.com/gpu"] = num_gpus


def update_nodes(
Expand All @@ -140,8 +140,10 @@ def update_nodes(
image,
env,
image_pull_secrets,
head_cpus,
head_memory,
head_cpu_requests,
head_cpu_limits,
head_memory_requests,
head_memory_limits,
num_head_gpus,
):
head = cluster_yaml.get("spec").get("headGroupSpec")
Expand All @@ -163,7 +165,12 @@ def update_nodes(
if comp == head:
# TODO: Eventually add head node configuration outside of template
update_resources(
spec, head_cpus, head_cpus, head_memory, head_memory, num_head_gpus
spec,
head_cpu_requests,
head_cpu_limits,
head_memory_requests,
head_memory_limits,
num_head_gpus,
)
else:
update_resources(
Expand Down Expand Up @@ -277,8 +284,10 @@ def write_user_yaml(user_yaml, output_file_name):
def generate_appwrapper(
name: str,
namespace: str,
head_cpus: int,
head_memory: int,
head_cpu_requests: int,
head_cpu_limits: int,
head_memory_requests: int,
head_memory_limits: int,
num_head_gpus: int,
worker_cpu_requests: int,
worker_cpu_limits: int,
Expand Down Expand Up @@ -310,8 +319,10 @@ def generate_appwrapper(
image,
env,
image_pull_secrets,
head_cpus,
head_memory,
head_cpu_requests,
head_cpu_limits,
head_memory_requests,
head_memory_limits,
num_head_gpus,
)
augment_labels(cluster_yaml, labels)
Expand Down
6 changes: 4 additions & 2 deletions tests/e2e/local_interactive_sdk_kind_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,10 @@ def run_local_interactives(self):
name=cluster_name,
namespace=self.namespace,
num_workers=1,
head_cpus="500m",
head_memory=2,
head_cpu_requests="500m",
head_cpu_limits="500m",
head_memory_requests=2,
head_memory_limits=2,
worker_cpu_requests="500m",
worker_cpu_limits=1,
worker_memory_requests=1,
Expand Down
6 changes: 4 additions & 2 deletions tests/e2e/mnist_raycluster_sdk_aw_kind_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,10 @@ def run_mnist_raycluster_sdk_kind(self):
name="mnist",
namespace=self.namespace,
num_workers=1,
head_cpus="500m",
head_memory=2,
head_cpu_requests="500m",
head_cpu_limits="500m",
head_memory_requests=2,
head_memory_limits=2,
min_cpus="500m",
max_cpus=1,
min_memory=1,
Expand Down
6 changes: 4 additions & 2 deletions tests/e2e/mnist_raycluster_sdk_kind_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,10 @@ def run_mnist_raycluster_sdk_kind(self):
name="mnist",
namespace=self.namespace,
num_workers=1,
head_cpus="500m",
head_memory=2,
head_cpu_requests="500m",
head_cpu_limits="500m",
head_memory_requests=2,
head_memory_limits=2,
worker_cpu_requests="500m",
worker_cpu_limits=1,
worker_memory_requests=1,
Expand Down
6 changes: 4 additions & 2 deletions tests/e2e/mnist_raycluster_sdk_oauth_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,10 @@ def run_mnist_raycluster_sdk_oauth(self):
name="mnist",
namespace=self.namespace,
num_workers=1,
head_cpus="500m",
head_memory=2,
head_cpu_requests="500m",
head_cpu_limits="500m",
head_memory_requests=2,
head_memory_limits=2,
worker_cpu_requests="500m",
worker_cpu_limits=1,
worker_memory_requests=1,
Expand Down
6 changes: 4 additions & 2 deletions tests/e2e/start_ray_cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,10 @@
name="mnist",
namespace=namespace,
num_workers=1,
head_cpus="500m",
head_memory=2,
head_cpu_requests="500m",
head_cpu_limits="500m",
head_memory_requests=2,
head_memory_limits=2,
worker_cpu_requests="500m",
worker_cpu_limits=1,
worker_memory_requests=1,
Expand Down
12 changes: 8 additions & 4 deletions tests/unit_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -877,8 +877,10 @@ def test_ray_details(mocker, capsys):
worker_gpu=0,
namespace="ns",
dashboard="fake-uri",
head_cpus=2,
head_mem=8,
head_cpu_requests=2,
head_cpu_limits=2,
head_mem_limits=8,
head_mem_requests=8,
head_gpu=0,
)
mocker.patch(
Expand Down Expand Up @@ -2304,8 +2306,10 @@ def test_cluster_status(mocker):
worker_gpu=0,
namespace="ns",
dashboard="fake-uri",
head_cpus=2,
head_mem=8,
head_cpu_requests=2,
head_cpu_limits=2,
head_mem_limits=8,
head_mem_requests=8,
head_gpu=0,
)
cf = Cluster(
Expand Down
6 changes: 4 additions & 2 deletions tests/upgrade/raycluster_sdk_upgrade_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,10 @@ def run_mnist_raycluster_sdk_oauth(self):
name="mnist",
namespace=self.namespace,
num_workers=1,
head_cpus=1,
head_memory=2,
head_cpu_requests=1,
head_cpu_limits=1,
head_memory_requests=2,
head_memory_limits=2,
worker_cpu_requests=1,
worker_cpu_limits=1,
worker_memory_requests=1,
Expand Down

0 comments on commit 96e3a8e

Please sign in to comment.