From 6f77f8d75ac379604a4b90995649a9deefa81c48 Mon Sep 17 00:00:00 2001 From: MichaelClifford Date: Tue, 25 Apr 2023 18:17:17 -0400 Subject: [PATCH 1/6] add priorities and schedulingSpec to SDK --- src/codeflare_sdk/cluster/cluster.py | 8 +++++++ src/codeflare_sdk/cluster/config.py | 1 + src/codeflare_sdk/utils/generate_yaml.py | 29 ++++++++++++++++++++++++ tests/test-case.yaml | 5 +++- tests/unit_test.py | 3 +++ 5 files changed, 45 insertions(+), 1 deletion(-) diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index d698331e6..d46f1d9f5 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -89,8 +89,12 @@ def create_app_wrapper(self): instascale = self.config.instascale instance_types = self.config.machine_types env = self.config.envs +<<<<<<< HEAD local_interactive = self.config.local_interactive image_pull_secrets = self.config.image_pull_secrets +======= + priority = self.config.priority +>>>>>>> 7e7a311 ( add priorities and schedulingSpec to SDK) return generate_appwrapper( name=name, namespace=namespace, @@ -105,8 +109,12 @@ def create_app_wrapper(self): instascale=instascale, instance_types=instance_types, env=env, +<<<<<<< HEAD local_interactive=local_interactive, image_pull_secrets=image_pull_secrets, +======= + priority=priority, +>>>>>>> 7e7a311 ( add priorities and schedulingSpec to SDK) ) # creates a new cluster with the provided or default spec diff --git a/src/codeflare_sdk/cluster/config.py b/src/codeflare_sdk/cluster/config.py index aed3674eb..6518aeb3e 100644 --- a/src/codeflare_sdk/cluster/config.py +++ b/src/codeflare_sdk/cluster/config.py @@ -47,3 +47,4 @@ class ClusterConfiguration: image: str = "quay.io/project-codeflare/ray:2.5.0-py38-cu116" local_interactive: bool = False image_pull_secrets: list = field(default_factory=list) + diff --git a/src/codeflare_sdk/utils/generate_yaml.py b/src/codeflare_sdk/utils/generate_yaml.py index 9c5804cd5..1f67983a1 100755 --- a/src/codeflare_sdk/utils/generate_yaml.py +++ b/src/codeflare_sdk/utils/generate_yaml.py @@ -89,6 +89,27 @@ def update_labels(yaml, instascale, instance_types): metadata.pop("labels") +def update_priority(yaml, item, priority): + if priority not in ["low", "default", "high"]: + sys.exit("Priority must be 'low', 'default', or 'high'") + + priority_levels = { + "low": (1, "low-priority"), + "default": (5, "default-priority"), + "high": (10, "high-priority"), + } + + priority_level = priority_levels[priority] + spec = yaml.get("spec") + spec["priority"] = priority_level[0] + # spec["SchedulingSpec"]["priorityClassName"] = priority_level + if "generictemplate" in item.keys(): + head = item.get("generictemplate").get("spec").get("headGroupSpec") + worker = item.get("generictemplate").get("spec").get("workerGroupSpecs")[0] + head["template"]["spec"]["priorityClassName"] = priority_level[1] + worker["template"]["spec"]["priorityClassName"] = priority_level[1] + + def update_custompodresources( item, min_cpu, max_cpu, min_memory, max_memory, gpu, workers ): @@ -175,6 +196,11 @@ def update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu): limits["nvidia.com/gpu"] = gpu +def update_scheduling_spec(yaml, workers): + spec = yaml.get("spec") + spec["schedulingSpec"]["minAvailable"] = workers + 1 + + def update_nodes( item, appwrapper_name, @@ -346,6 +372,7 @@ def generate_appwrapper( env, local_interactive: bool, image_pull_secrets: list, + priority: str, ): user_yaml = read_template(template) appwrapper_name, cluster_name = gen_names(name) @@ -354,6 +381,8 @@ def generate_appwrapper( route_item = resources["resources"].get("GenericItems")[1] update_names(user_yaml, item, appwrapper_name, cluster_name, namespace) update_labels(user_yaml, instascale, instance_types) + update_priority(user_yaml, item, priority) + update_scheduling_spec(user_yaml, workers) update_custompodresources( item, min_cpu, max_cpu, min_memory, max_memory, gpu, workers ) diff --git a/tests/test-case.yaml b/tests/test-case.yaml index da0845ed9..e154124de 100644 --- a/tests/test-case.yaml +++ b/tests/test-case.yaml @@ -6,7 +6,7 @@ metadata: name: unit-test-cluster namespace: ns spec: - priority: 9 + priority: 1 resources: GenericItems: - custompodresources: @@ -176,6 +176,7 @@ spec: do echo waiting for myservice; sleep 2; done image: busybox:1.28 name: init-myservice + priorityClassName: low-priority replicas: 1 - generictemplate: apiVersion: route.openshift.io/v1 @@ -193,3 +194,5 @@ spec: name: unit-test-cluster-head-svc replicas: 1 Items: [] + schedulingSpec: + minAvailable: 3 diff --git a/tests/unit_test.py b/tests/unit_test.py index 58f55bfa1..087c814ea 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -228,6 +228,7 @@ def test_config_creation(): instascale=True, machine_types=["cpu.small", "gpu.large"], image_pull_secrets=["unit-test-pull-secret"], + priority="low", ) assert config.name == "unit-test-cluster" and config.namespace == "ns" @@ -240,11 +241,13 @@ def test_config_creation(): assert config.instascale assert config.machine_types == ["cpu.small", "gpu.large"] assert config.image_pull_secrets == ["unit-test-pull-secret"] + assert config.priority == "low" return config def test_cluster_creation(): cluster = Cluster(test_config_creation()) + print(cluster.app_wrapper_yaml) assert cluster.app_wrapper_yaml == "unit-test-cluster.yaml" assert cluster.app_wrapper_name == "unit-test-cluster" assert filecmp.cmp( From 3627ecb7d96a1d25dd0a8e55c4596ae036b8bff4 Mon Sep 17 00:00:00 2001 From: MichaelClifford Date: Fri, 28 Apr 2023 11:19:55 -0400 Subject: [PATCH 2/6] change 'priority' to 'dispatch priority' --- src/codeflare_sdk/cluster/cluster.py | 8 ++++++++ src/codeflare_sdk/cluster/config.py | 2 +- src/codeflare_sdk/utils/generate_yaml.py | 4 ++-- tests/test-case.yaml | 4 ++-- tests/unit_test.py | 4 ++-- 5 files changed, 15 insertions(+), 7 deletions(-) diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index d46f1d9f5..c6610f08f 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -89,12 +89,16 @@ def create_app_wrapper(self): instascale = self.config.instascale instance_types = self.config.machine_types env = self.config.envs +<<<<<<< HEAD <<<<<<< HEAD local_interactive = self.config.local_interactive image_pull_secrets = self.config.image_pull_secrets ======= priority = self.config.priority >>>>>>> 7e7a311 ( add priorities and schedulingSpec to SDK) +======= + dispatch_priority = self.config.dispatch_priority +>>>>>>> b1d1d16 (change 'priority' to 'dispatch priority') return generate_appwrapper( name=name, namespace=namespace, @@ -109,12 +113,16 @@ def create_app_wrapper(self): instascale=instascale, instance_types=instance_types, env=env, +<<<<<<< HEAD <<<<<<< HEAD local_interactive=local_interactive, image_pull_secrets=image_pull_secrets, ======= priority=priority, >>>>>>> 7e7a311 ( add priorities and schedulingSpec to SDK) +======= + dispatch_priority=dispatch_priority, +>>>>>>> b1d1d16 (change 'priority' to 'dispatch priority') ) # creates a new cluster with the provided or default spec diff --git a/src/codeflare_sdk/cluster/config.py b/src/codeflare_sdk/cluster/config.py index 6518aeb3e..04d5780d4 100644 --- a/src/codeflare_sdk/cluster/config.py +++ b/src/codeflare_sdk/cluster/config.py @@ -47,4 +47,4 @@ class ClusterConfiguration: image: str = "quay.io/project-codeflare/ray:2.5.0-py38-cu116" local_interactive: bool = False image_pull_secrets: list = field(default_factory=list) - + dispatch_priority: str = "default" diff --git a/src/codeflare_sdk/utils/generate_yaml.py b/src/codeflare_sdk/utils/generate_yaml.py index 1f67983a1..d508ce629 100755 --- a/src/codeflare_sdk/utils/generate_yaml.py +++ b/src/codeflare_sdk/utils/generate_yaml.py @@ -372,7 +372,7 @@ def generate_appwrapper( env, local_interactive: bool, image_pull_secrets: list, - priority: str, + dispatch_priority: str, ): user_yaml = read_template(template) appwrapper_name, cluster_name = gen_names(name) @@ -381,7 +381,7 @@ def generate_appwrapper( route_item = resources["resources"].get("GenericItems")[1] update_names(user_yaml, item, appwrapper_name, cluster_name, namespace) update_labels(user_yaml, instascale, instance_types) - update_priority(user_yaml, item, priority) + update_priority(user_yaml, item, dispatch_priority) update_scheduling_spec(user_yaml, workers) update_custompodresources( item, min_cpu, max_cpu, min_memory, max_memory, gpu, workers diff --git a/tests/test-case.yaml b/tests/test-case.yaml index e154124de..463c66aa4 100644 --- a/tests/test-case.yaml +++ b/tests/test-case.yaml @@ -6,7 +6,7 @@ metadata: name: unit-test-cluster namespace: ns spec: - priority: 1 + priority: 5 resources: GenericItems: - custompodresources: @@ -176,7 +176,7 @@ spec: do echo waiting for myservice; sleep 2; done image: busybox:1.28 name: init-myservice - priorityClassName: low-priority + priorityClassName: default-priority replicas: 1 - generictemplate: apiVersion: route.openshift.io/v1 diff --git a/tests/unit_test.py b/tests/unit_test.py index 087c814ea..862122df9 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -228,7 +228,7 @@ def test_config_creation(): instascale=True, machine_types=["cpu.small", "gpu.large"], image_pull_secrets=["unit-test-pull-secret"], - priority="low", + dispatch_priority="default", ) assert config.name == "unit-test-cluster" and config.namespace == "ns" @@ -241,7 +241,7 @@ def test_config_creation(): assert config.instascale assert config.machine_types == ["cpu.small", "gpu.large"] assert config.image_pull_secrets == ["unit-test-pull-secret"] - assert config.priority == "low" + assert config.dispatch_priority == "default" return config From b05a4e13a9cd3bb266b7eb9059431a2c9ec00bc6 Mon Sep 17 00:00:00 2001 From: MichaelClifford Date: Fri, 16 Jun 2023 17:17:39 -0400 Subject: [PATCH 3/6] make priority levels global --- src/codeflare_sdk/utils/generate_yaml.py | 17 +++++++++-------- tests/unit_test.py | 1 - 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/codeflare_sdk/utils/generate_yaml.py b/src/codeflare_sdk/utils/generate_yaml.py index d508ce629..3b07eac13 100755 --- a/src/codeflare_sdk/utils/generate_yaml.py +++ b/src/codeflare_sdk/utils/generate_yaml.py @@ -89,17 +89,18 @@ def update_labels(yaml, instascale, instance_types): metadata.pop("labels") +PRIORITY_LEVELS = { + "low": (1, "low-priority"), + "default": (5, "default-priority"), + "high": (10, "high-priority"), +} + + def update_priority(yaml, item, priority): - if priority not in ["low", "default", "high"]: + if priority not in PRIORITY_LEVELS: sys.exit("Priority must be 'low', 'default', or 'high'") - priority_levels = { - "low": (1, "low-priority"), - "default": (5, "default-priority"), - "high": (10, "high-priority"), - } - - priority_level = priority_levels[priority] + priority_level = PRIORITY_LEVELS[priority] spec = yaml.get("spec") spec["priority"] = priority_level[0] # spec["SchedulingSpec"]["priorityClassName"] = priority_level diff --git a/tests/unit_test.py b/tests/unit_test.py index 862122df9..1795868f2 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -247,7 +247,6 @@ def test_config_creation(): def test_cluster_creation(): cluster = Cluster(test_config_creation()) - print(cluster.app_wrapper_yaml) assert cluster.app_wrapper_yaml == "unit-test-cluster.yaml" assert cluster.app_wrapper_name == "unit-test-cluster" assert filecmp.cmp( From e97c33b2ec625e454020a21ee04542f8ebbeb7bd Mon Sep 17 00:00:00 2001 From: MichaelClifford Date: Wed, 21 Jun 2023 16:24:18 -0400 Subject: [PATCH 4/6] update base template with priorities and update cluster --- src/codeflare_sdk/cluster/cluster.py | 14 -------------- src/codeflare_sdk/templates/base-template.yaml | 4 ++++ tests/test-case.yaml | 2 +- 3 files changed, 5 insertions(+), 15 deletions(-) diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index c6610f08f..c3848a555 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -89,16 +89,9 @@ def create_app_wrapper(self): instascale = self.config.instascale instance_types = self.config.machine_types env = self.config.envs -<<<<<<< HEAD -<<<<<<< HEAD local_interactive = self.config.local_interactive image_pull_secrets = self.config.image_pull_secrets -======= - priority = self.config.priority ->>>>>>> 7e7a311 ( add priorities and schedulingSpec to SDK) -======= dispatch_priority = self.config.dispatch_priority ->>>>>>> b1d1d16 (change 'priority' to 'dispatch priority') return generate_appwrapper( name=name, namespace=namespace, @@ -113,16 +106,9 @@ def create_app_wrapper(self): instascale=instascale, instance_types=instance_types, env=env, -<<<<<<< HEAD -<<<<<<< HEAD local_interactive=local_interactive, image_pull_secrets=image_pull_secrets, -======= - priority=priority, ->>>>>>> 7e7a311 ( add priorities and schedulingSpec to SDK) -======= dispatch_priority=dispatch_priority, ->>>>>>> b1d1d16 (change 'priority' to 'dispatch priority') ) # creates a new cluster with the provided or default spec diff --git a/src/codeflare_sdk/templates/base-template.yaml b/src/codeflare_sdk/templates/base-template.yaml index 56014a068..686336e76 100644 --- a/src/codeflare_sdk/templates/base-template.yaml +++ b/src/codeflare_sdk/templates/base-template.yaml @@ -8,6 +8,8 @@ metadata: orderedinstance: "m4.xlarge_g4dn.xlarge" spec: priority: 9 + schedulingSpec: + minAvailable: 3 resources: Items: [] GenericItems: @@ -112,6 +114,7 @@ spec: operator: In values: - "aw-kuberay" + priorityClassName: "default-priority" containers: # The Ray head pod - env: @@ -221,6 +224,7 @@ spec: operator: In values: - "aw-kuberay" + priorityClassName: "default-priority" initContainers: # the env var $RAY_IP is set by the operator if missing, with the value of the head service name - name: init-myservice diff --git a/tests/test-case.yaml b/tests/test-case.yaml index 463c66aa4..ad2eef502 100644 --- a/tests/test-case.yaml +++ b/tests/test-case.yaml @@ -192,7 +192,7 @@ spec: to: kind: Service name: unit-test-cluster-head-svc - replicas: 1 + replica: 1 Items: [] schedulingSpec: minAvailable: 3 From 957fc357fdce81abf35e95909d106dbcfdcbe34c Mon Sep 17 00:00:00 2001 From: MichaelClifford Date: Sun, 6 Aug 2023 19:35:59 -0400 Subject: [PATCH 5/6] evaluate priority class name --- src/codeflare_sdk/cluster/cluster.py | 39 +++++++++++++++++++ src/codeflare_sdk/cluster/config.py | 2 +- .../templates/base-template.yaml | 2 - src/codeflare_sdk/utils/generate_yaml.py | 24 +++--------- 4 files changed, 45 insertions(+), 22 deletions(-) diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index c3848a555..b0bff0883 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -61,6 +61,39 @@ def __init__(self, config: ClusterConfiguration): self.app_wrapper_yaml = self.create_app_wrapper() self.app_wrapper_name = self.app_wrapper_yaml.split(".")[0] + def evaluate_config(self): + if not self.evaluate_dispatch_priority(): + return False + else: + return True + + def evaluate_dispatch_priority(self): + priority_class = self.config.dispatch_priority + if priority_class is None: + return True + else: + try: + config_check() + api_instance = client.CustomObjectsApi(api_config_handler()) + priority_classes = api_instance.list_cluster_custom_object( + group="scheduling.k8s.io", + version="v1", + plural="priorityclasses", + ) + available_priority_classes = [ + i["metadata"]["name"] for i in priority_classes["items"] + ] + except Exception as e: # pragma: no cover + return _kube_api_error_handling(e) + + if priority_class in available_priority_classes: + return True + else: + print( + f"Priority class {priority_class} is not available in the cluster" + ) + return False + def create_app_wrapper(self): """ Called upon cluster object creation, creates an AppWrapper yaml based on @@ -117,6 +150,12 @@ def up(self): Applies the AppWrapper yaml, pushing the resource request onto the MCAD queue. """ + + # Before attempting to bring up the cluster let's evaluate the ClusterConfig + if not self.evaluate_config(): + print("Invalid Cluster Configuration") + return False + namespace = self.config.namespace try: config_check() diff --git a/src/codeflare_sdk/cluster/config.py b/src/codeflare_sdk/cluster/config.py index 04d5780d4..cb935e79d 100644 --- a/src/codeflare_sdk/cluster/config.py +++ b/src/codeflare_sdk/cluster/config.py @@ -47,4 +47,4 @@ class ClusterConfiguration: image: str = "quay.io/project-codeflare/ray:2.5.0-py38-cu116" local_interactive: bool = False image_pull_secrets: list = field(default_factory=list) - dispatch_priority: str = "default" + dispatch_priority: str = None diff --git a/src/codeflare_sdk/templates/base-template.yaml b/src/codeflare_sdk/templates/base-template.yaml index 686336e76..386dd86bf 100644 --- a/src/codeflare_sdk/templates/base-template.yaml +++ b/src/codeflare_sdk/templates/base-template.yaml @@ -114,7 +114,6 @@ spec: operator: In values: - "aw-kuberay" - priorityClassName: "default-priority" containers: # The Ray head pod - env: @@ -224,7 +223,6 @@ spec: operator: In values: - "aw-kuberay" - priorityClassName: "default-priority" initContainers: # the env var $RAY_IP is set by the operator if missing, with the value of the head service name - name: init-myservice diff --git a/src/codeflare_sdk/utils/generate_yaml.py b/src/codeflare_sdk/utils/generate_yaml.py index 3b07eac13..697122a2c 100755 --- a/src/codeflare_sdk/utils/generate_yaml.py +++ b/src/codeflare_sdk/utils/generate_yaml.py @@ -89,26 +89,12 @@ def update_labels(yaml, instascale, instance_types): metadata.pop("labels") -PRIORITY_LEVELS = { - "low": (1, "low-priority"), - "default": (5, "default-priority"), - "high": (10, "high-priority"), -} - - -def update_priority(yaml, item, priority): - if priority not in PRIORITY_LEVELS: - sys.exit("Priority must be 'low', 'default', or 'high'") - - priority_level = PRIORITY_LEVELS[priority] - spec = yaml.get("spec") - spec["priority"] = priority_level[0] - # spec["SchedulingSpec"]["priorityClassName"] = priority_level - if "generictemplate" in item.keys(): +def update_priority(item, dispatch_priority): + if dispatch_priority is not None: head = item.get("generictemplate").get("spec").get("headGroupSpec") worker = item.get("generictemplate").get("spec").get("workerGroupSpecs")[0] - head["template"]["spec"]["priorityClassName"] = priority_level[1] - worker["template"]["spec"]["priorityClassName"] = priority_level[1] + head["template"]["spec"]["priorityClassName"] = dispatch_priority + worker["template"]["spec"]["priorityClassName"] = dispatch_priority def update_custompodresources( @@ -382,7 +368,7 @@ def generate_appwrapper( route_item = resources["resources"].get("GenericItems")[1] update_names(user_yaml, item, appwrapper_name, cluster_name, namespace) update_labels(user_yaml, instascale, instance_types) - update_priority(user_yaml, item, dispatch_priority) + update_priority(item, dispatch_priority) update_scheduling_spec(user_yaml, workers) update_custompodresources( item, min_cpu, max_cpu, min_memory, max_memory, gpu, workers From e09cdf72ad184bfce3ae8e804e7cc25fa1ceffc9 Mon Sep 17 00:00:00 2001 From: MichaelClifford Date: Sun, 6 Aug 2023 20:08:06 -0400 Subject: [PATCH 6/6] update tests --- tests/test-case.yaml | 7 ++++--- tests/unit_test.py | 4 ++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/test-case.yaml b/tests/test-case.yaml index ad2eef502..df88e7bb0 100644 --- a/tests/test-case.yaml +++ b/tests/test-case.yaml @@ -6,7 +6,7 @@ metadata: name: unit-test-cluster namespace: ns spec: - priority: 5 + priority: 9 resources: GenericItems: - custompodresources: @@ -109,6 +109,7 @@ spec: nvidia.com/gpu: 0 imagePullSecrets: - name: unit-test-pull-secret + priorityClassName: default rayVersion: 2.5.0 workerGroupSpecs: - groupName: small-group-unit-test-cluster @@ -176,7 +177,7 @@ spec: do echo waiting for myservice; sleep 2; done image: busybox:1.28 name: init-myservice - priorityClassName: default-priority + priorityClassName: default replicas: 1 - generictemplate: apiVersion: route.openshift.io/v1 @@ -192,7 +193,7 @@ spec: to: kind: Service name: unit-test-cluster-head-svc - replica: 1 + replicas: 1 Items: [] schedulingSpec: minAvailable: 3 diff --git a/tests/unit_test.py b/tests/unit_test.py index 1795868f2..c518fa310 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -302,6 +302,10 @@ def test_cluster_up_down(mocker): "kubernetes.client.CustomObjectsApi.delete_namespaced_custom_object", side_effect=arg_check_del_effect, ) + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_cluster_custom_object", + return_value={"items": []}, + ) cluster = test_cluster_creation() cluster.up() cluster.down()