From 574e8a0d2c10537a7d811aba1af9ea75bd0c2952 Mon Sep 17 00:00:00 2001 From: Archit Kulkarni Date: Mon, 10 Jul 2023 11:51:17 -0700 Subject: [PATCH 1/7] Update sample YAML Signed-off-by: Archit Kulkarni --- .../config/samples/ray_v1alpha1_rayjob.yaml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/ray-operator/config/samples/ray_v1alpha1_rayjob.yaml b/ray-operator/config/samples/ray_v1alpha1_rayjob.yaml index bfde58eb18..2c21a82b04 100644 --- a/ray-operator/config/samples/ray_v1alpha1_rayjob.yaml +++ b/ray-operator/config/samples/ray_v1alpha1_rayjob.yaml @@ -13,7 +13,13 @@ spec: # "counter_name": "test_counter" # } #}' + shutdownAfterJobFinishes: true runtimeEnv: ewogICAgInBpcCI6IFsKICAgICAgICAicmVxdWVzdHM9PTIuMjYuMCIsCiAgICAgICAgInBlbmR1bHVtPT0yLjEuMiIKICAgIF0sCiAgICAiZW52X3ZhcnMiOiB7ImNvdW50ZXJfbmFtZSI6ICJ0ZXN0X2NvdW50ZXIifQp9Cg== + # Suspend specifies whether the RayJob controller should create a RayCluster instance. + # If a job is applied with the suspend field set to true, the RayCluster will not be created and we will wait for the transition to false. + # If the RayCluster is already created, it will be deleted. In the case of transition to false, a new RayCluster will be created. + # suspend: false + # rayClusterSpec specifies the RayCluster instance to be created by the RayJob controller. rayClusterSpec: rayVersion: '2.5.0' # should match the Ray version in the image of the containers # Ray head pod template @@ -82,6 +88,16 @@ spec: cpu: "1" requests: cpu: "200m" + # SubmitterPodTemplate is the template for the pod that will run the `ray job submit` command against the RayCluster. + # If SubmitterPodTemplate is specified, the first container is assumed to be the submitter container. + # submitterPodTemplate: + # spec: + # containers: + # - name: rayjob-submitter-pod + # image: rayproject/ray:2.5.0 + # # If command is not specified, a default command will be supplied using the RayJob spec `entrypoint` field. 
+ # command: ["ray", "job", "submit", "--", "echo", "hello", "world"] + ######################Ray code sample################################# # this sample is from https://docs.ray.io/en/latest/cluster/job-submission.html#quick-start-example # it is mounted into the container and executed to show the Ray job at work From ea519d2435c2d02f75d49117df111159c23b5687 Mon Sep 17 00:00:00 2001 From: Archit Kulkarni Date: Mon, 10 Jul 2023 11:59:33 -0700 Subject: [PATCH 2/7] Update doc file with submitterPodTemplate Signed-off-by: Archit Kulkarni --- docs/guidance/rayjob.md | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/docs/guidance/rayjob.md b/docs/guidance/rayjob.md index f6d45af783..6d7092ee59 100644 --- a/docs/guidance/rayjob.md +++ b/docs/guidance/rayjob.md @@ -1,15 +1,16 @@ -## Ray Job (alpha) +# Ray Job (alpha) > Note: This is the alpha version of Ray Job Support in KubeRay. There will be ongoing improvements for Ray Job in the future releases. -### Prerequisites +## Prerequisites * Ray 1.10 or higher * KubeRay v0.3.0+. (v0.5.0+ is recommended) -### What is a RayJob? +## What is a RayJob? A RayJob manages 2 things: + * Ray Cluster: Manages resources in a Kubernetes cluster. * Job: Manages jobs in a Ray Cluster. @@ -17,14 +18,13 @@ A RayJob manages 2 things: * **Kubernetes-native support for Ray clusters and Ray Jobs.** You can use a Kubernetes config to define a Ray cluster and job, and use `kubectl` to create them. The cluster can be deleted automatically once the job is finished. - -### Deploy KubeRay +## Deploy KubeRay Make sure your KubeRay operator version is at least v0.3.0. The latest released KubeRay version is recommended. For installation instructions, please follow [the documentation](../deploy/installation.md). 
-### Run an example Job +## Run an example Job There is one example config file to deploy a RayJob included here: [ray_v1alpha1_rayjob.yaml](https://github.com/ray-project/kuberay/blob/master/ray-operator/config/samples/ray_v1alpha1_rayjob.yaml) @@ -48,21 +48,23 @@ $ kubectl get rayclusters $ kubectl get pod ``` -### RayJob Configuration +## RayJob Configuration -- `entrypoint` - The shell command to run for this job. job_id. -- `jobId` - _(Optional)_ Job ID to specify for the job. If not provided, one will be generated. -- `metadata` - Arbitrary user-provided metadata for the job. -- `runtimeEnv` - base64 string of the runtime json string. -- `shutdownAfterJobFinishes` - whether to recycle the cluster after job finishes. -- `ttlSecondsAfterFinished` - TTL to clean up the cluster. This only works if `shutdownAfterJobFinishes` is set. +* `entrypoint` - The shell command to run for this job. +* `rayClusterSpec` - The spec for the Ray cluster to run the job on. +* `jobId` - _(Optional)_ Job ID to specify for the job. If not provided, one will be generated. +* `metadata` - _(Optional)_ Arbitrary user-provided metadata for the job. +* `runtimeEnv` - _(Optional)_ base64-encoded string of the runtime env json string. +* `shutdownAfterJobFinishes` - _(Optional)_ whether to recycle the cluster after the job finishes. +* `ttlSecondsAfterFinished` - _(Optional)_ TTL to clean up the cluster. This only works if `shutdownAfterJobFinishes` is set. +* `submitterPodTemplate` - _(Optional)_ Pod template spec for the pod that runs `ray job submit` against the Ray cluster. -### RayJob Observability +## RayJob Observability You can use `kubectl logs` to check the operator logs or the head/worker nodes logs. 
You can also use `kubectl describe rayjobs rayjob-sample` to check the states and event logs of your RayJob instance: -``` +```text Status: Dashboard URL: rayjob-sample-raycluster-vnl8w-head-svc.ray-system.svc.cluster.local:8265 End Time: 2022-07-24T02:04:56Z @@ -88,9 +90,9 @@ Events: Normal Deleted 15s rayjob-controller Deleted cluster rayjob-sample-raycluster-vnl8w ``` - If the job doesn't run successfully, the above `describe` command will provide information about that too: -``` + +```text Status: Dashboard URL: rayjob-sample-raycluster-nrdm8-head-svc.ray-system.svc.cluster.local:8265 End Time: 2022-07-24T02:01:39Z @@ -118,9 +120,8 @@ Events: Normal Deleted 58s rayjob-controller Deleted cluster rayjob-sample-raycluster-nrdm8 ``` - -### Delete the RayJob instance +## Delete the RayJob instance ```shell -$ kubectl delete -f config/samples/ray_v1alpha1_rayjob.yaml +kubectl delete -f config/samples/ray_v1alpha1_rayjob.yaml ``` From 54b207e7c04cbdc531b772c68385b24b6f57ba80 Mon Sep 17 00:00:00 2001 From: Archit Kulkarni Date: Tue, 11 Jul 2023 10:44:20 -0700 Subject: [PATCH 3/7] Update sample YAML Signed-off-by: Archit Kulkarni --- .../config/samples/ray_v1alpha1_rayjob.yaml | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/ray-operator/config/samples/ray_v1alpha1_rayjob.yaml b/ray-operator/config/samples/ray_v1alpha1_rayjob.yaml index 2c21a82b04..4960cac989 100644 --- a/ray-operator/config/samples/ray_v1alpha1_rayjob.yaml +++ b/ray-operator/config/samples/ray_v1alpha1_rayjob.yaml @@ -13,6 +13,7 @@ spec: # "counter_name": "test_counter" # } #}' + # ShutdownAfterJobFinishes specifies whether the RayCluster should be deleted after the RayJob finishes. Default is false. 
shutdownAfterJobFinishes: true runtimeEnv: ewogICAgInBpcCI6IFsKICAgICAgICAicmVxdWVzdHM9PTIuMjYuMCIsCiAgICAgICAgInBlbmR1bHVtPT0yLjEuMiIKICAgIF0sCiAgICAiZW52X3ZhcnMiOiB7ImNvdW50ZXJfbmFtZSI6ICJ0ZXN0X2NvdW50ZXIifQp9Cg== # Suspend specifies whether the RayJob controller should create a RayCluster instance. @@ -90,13 +91,16 @@ spec: cpu: "200m" # SubmitterPodTemplate is the template for the pod that will run the `ray job submit` command against the RayCluster. # If SubmitterPodTemplate is specified, the first container is assumed to be the submitter container. - # submitterPodTemplate: - # spec: - # containers: - # - name: rayjob-submitter-pod - # image: rayproject/ray:2.5.0 - # # If command is not specified, a default command will be supplied using the RayJob spec `entrypoint` field. - # command: ["ray", "job", "submit", "--", "echo", "hello", "world"] + submitterPodTemplate: + spec: + restartPolicy: Never + containers: + - name: my-custom-rayjob-submitter-pod + image: rayproject/ray:2.5.0 + # If Command is not specified, the correct command will be supplied at runtime using the RayJob spec `entrypoint` field. + # Specifying Command is not recommended. 
+ # command: ["ray job submit --address=http://rayjob-sample-raycluster-v6qcq-head-svc.default.svc.cluster.local:8265 -- echo hello world"] + ######################Ray code sample################################# # this sample is from https://docs.ray.io/en/latest/cluster/job-submission.html#quick-start-example From 99aec2548942e35f3494bee900e591b6c3f3ec2c Mon Sep 17 00:00:00 2001 From: Archit Kulkarni Date: Tue, 11 Jul 2023 10:44:50 -0700 Subject: [PATCH 4/7] Update doc Signed-off-by: Archit Kulkarni --- docs/guidance/rayjob.md | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/docs/guidance/rayjob.md b/docs/guidance/rayjob.md index 6d7092ee59..8d664b577a 100644 --- a/docs/guidance/rayjob.md +++ b/docs/guidance/rayjob.md @@ -66,28 +66,38 @@ You can also use `kubectl describe rayjobs rayjob-sample` to check the states an ```text Status: - Dashboard URL: rayjob-sample-raycluster-vnl8w-head-svc.ray-system.svc.cluster.local:8265 - End Time: 2022-07-24T02:04:56Z + Dashboard URL: rayjob-sample-raycluster-v6qcq-head-svc.default.svc.cluster.local:8265 + End Time: 2023-07-11T17:39:56Z Job Deployment Status: Complete - Job Id: test-hehe + Job Id: rayjob-sample-66z5m Job Status: SUCCEEDED Message: Job finished successfully. 
- Ray Cluster Name: rayjob-sample-raycluster-vnl8w + Observed Generation: 2 + Ray Cluster Name: rayjob-sample-raycluster-v6qcq Ray Cluster Status: Available Worker Replicas: 1 + Desired Worker Replicas: 1 Endpoints: - Client: 32572 - Dashboard: 32276 - Gcs - Server: 30679 - Last Update Time: 2022-07-24T02:04:43Z - State: ready - Start Time: 2022-07-24T02:04:49Z + Client: 10001 + Dashboard: 8265 + Gcs - Server: 6379 + Metrics: 8080 + Serve: 8000 + Head: + Pod IP: 10.244.0.6 + Service IP: 10.96.31.68 + Last Update Time: 2023-07-11T17:39:32Z + Max Worker Replicas: 5 + Min Worker Replicas: 1 + Observed Generation: 1 + State: ready + Start Time: 2023-07-11T17:39:39Z Events: - Type Reason Age From Message - ---- ------ ---- ---- ------- - Normal Created 90s rayjob-controller Created cluster rayjob-sample-raycluster-vnl8w - Normal Submitted 82s rayjob-controller Submit Job test-hehe - Normal Deleted 15s rayjob-controller Deleted cluster rayjob-sample-raycluster-vnl8w + Type Reason Age From Message + ---- ------ ---- ---- ------- + Normal Created 3m37s rayjob-controller Created cluster rayjob-sample-raycluster-v6qcq + Normal Created 2m11s rayjob-controller Created k8s job rayjob-sample + Normal Deleted 107s rayjob-controller Deleted cluster rayjob-sample-raycluster-v6qcq ``` If the job doesn't run successfully, the above `describe` command will provide information about that too: From 7b9dba8724927dfe22e5d9e8b450481461a31b6b Mon Sep 17 00:00:00 2001 From: Archit Kulkarni Date: Tue, 11 Jul 2023 10:45:26 -0700 Subject: [PATCH 5/7] Add "Defaults to false" Signed-off-by: Archit Kulkarni --- docs/guidance/rayjob.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/guidance/rayjob.md b/docs/guidance/rayjob.md index 8d664b577a..e6b584e9ec 100644 --- a/docs/guidance/rayjob.md +++ b/docs/guidance/rayjob.md @@ -55,7 +55,7 @@ $ kubectl get pod * `jobId` - _(Optional)_ Job ID to specify for the job. If not provided, one will be generated. 
* `metadata` - _(Optional)_ Arbitrary user-provided metadata for the job. * `runtimeEnv` - _(Optional)_ base64-encoded string of the runtime env json string. -* `shutdownAfterJobFinishes` - _(Optional)_ whether to recycle the cluster after the job finishes. +* `shutdownAfterJobFinishes` - _(Optional)_ whether to recycle the cluster after the job finishes. Defaults to false. * `ttlSecondsAfterFinished` - _(Optional)_ TTL to clean up the cluster. This only works if `shutdownAfterJobFinishes` is set. * `submitterPodTemplate` - _(Optional)_ Pod template spec for the pod that runs `ray job submit` against the Ray cluster. From 217c3a919cd163bb284288301a1971828d35ab40 Mon Sep 17 00:00:00 2001 From: Archit Kulkarni Date: Tue, 11 Jul 2023 10:54:24 -0700 Subject: [PATCH 6/7] Re-comment submitterpodtemplate Signed-off-by: Archit Kulkarni --- .../config/samples/ray_v1alpha1_rayjob.yaml | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/ray-operator/config/samples/ray_v1alpha1_rayjob.yaml b/ray-operator/config/samples/ray_v1alpha1_rayjob.yaml index 4960cac989..82df204f32 100644 --- a/ray-operator/config/samples/ray_v1alpha1_rayjob.yaml +++ b/ray-operator/config/samples/ray_v1alpha1_rayjob.yaml @@ -14,7 +14,7 @@ spec: # } #}' # ShutdownAfterJobFinishes specifies whether the RayCluster should be deleted after the RayJob finishes. Default is false. - shutdownAfterJobFinishes: true + shutdownAfterJobFinishes: false runtimeEnv: ewogICAgInBpcCI6IFsKICAgICAgICAicmVxdWVzdHM9PTIuMjYuMCIsCiAgICAgICAgInBlbmR1bHVtPT0yLjEuMiIKICAgIF0sCiAgICAiZW52X3ZhcnMiOiB7ImNvdW50ZXJfbmFtZSI6ICJ0ZXN0X2NvdW50ZXIifQp9Cg== # Suspend specifies whether the RayJob controller should create a RayCluster instance. # If a job is applied with the suspend field set to true, the RayCluster will not be created and we will wait for the transition to false. 
@@ -91,15 +91,15 @@ spec: cpu: "200m" # SubmitterPodTemplate is the template for the pod that will run the `ray job submit` command against the RayCluster. # If SubmitterPodTemplate is specified, the first container is assumed to be the submitter container. - submitterPodTemplate: - spec: - restartPolicy: Never - containers: - - name: my-custom-rayjob-submitter-pod - image: rayproject/ray:2.5.0 - # If Command is not specified, the correct command will be supplied at runtime using the RayJob spec `entrypoint` field. - # Specifying Command is not recommended. - # command: ["ray job submit --address=http://rayjob-sample-raycluster-v6qcq-head-svc.default.svc.cluster.local:8265 -- echo hello world"] + # submitterPodTemplate: + # spec: + # restartPolicy: Never + # containers: + # - name: my-custom-rayjob-submitter-pod + # image: rayproject/ray:2.5.0 + # # If Command is not specified, the correct command will be supplied at runtime using the RayJob spec `entrypoint` field. + # # Specifying Command is not recommended. 
+ # # command: ["ray job submit --address=http://rayjob-sample-raycluster-v6qcq-head-svc.default.svc.cluster.local:8265 -- echo hello world"] ######################Ray code sample################################# From 2301b340a8442bc0c957b6293eaaacb1665b6a75 Mon Sep 17 00:00:00 2001 From: Archit Kulkarni Date: Tue, 11 Jul 2023 10:55:24 -0700 Subject: [PATCH 7/7] Update doc Signed-off-by: Archit Kulkarni --- docs/guidance/rayjob.md | 43 +++++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/docs/guidance/rayjob.md b/docs/guidance/rayjob.md index e6b584e9ec..1b496ebe14 100644 --- a/docs/guidance/rayjob.md +++ b/docs/guidance/rayjob.md @@ -104,30 +104,39 @@ If the job doesn't run successfully, the above `describe` command will provide i ```text Status: - Dashboard URL: rayjob-sample-raycluster-nrdm8-head-svc.ray-system.svc.cluster.local:8265 - End Time: 2022-07-24T02:01:39Z + Dashboard URL: rayjob-sample-raycluster-2h7ds-head-svc.default.svc.cluster.local:8265 + End Time: 2023-07-11T17:51:31Z Job Deployment Status: Complete - Job Id: test-hehe + Job Id: rayjob-sample-prbts Job Status: FAILED - Message: Job failed due to an application error, last available logs: -python: can't open file '/tmp/code/script.ppy': [Errno 2] No such file or directory + Message: Job failed due to an application error, last available logs (truncated to 20,000 chars): +python: can't open file '/home/ray/samples/sample_code.ppy': [Errno 2] No such file or directory - Ray Cluster Name: rayjob-sample-raycluster-nrdm8 + Observed Generation: 2 + Ray Cluster Name: rayjob-sample-raycluster-2h7ds Ray Cluster Status: Available Worker Replicas: 1 + Desired Worker Replicas: 1 Endpoints: - Client: 31852 - Dashboard: 32606 - Gcs - Server: 32436 - Last Update Time: 2022-07-24T02:01:30Z - State: ready - Start Time: 2022-07-24T02:01:38Z + Client: 10001 + Dashboard: 8265 + Gcs - Server: 6379 + Metrics: 8080 + Serve: 8000 + Head: + Pod IP: 10.244.0.7 + Service 
IP: 10.96.24.232 + Last Update Time: 2023-07-11T17:51:12Z + Max Worker Replicas: 5 + Min Worker Replicas: 1 + Observed Generation: 1 + State: ready + Start Time: 2023-07-11T17:51:16Z Events: - Type Reason Age From Message - ---- ------ ---- ---- ------- - Normal Created 2m9s rayjob-controller Created cluster rayjob-sample-raycluster-nrdm8 - Normal Submitted 2m rayjob-controller Submit Job test-hehe - Normal Deleted 58s rayjob-controller Deleted cluster rayjob-sample-raycluster-nrdm8 + Type Reason Age From Message + ---- ------ ---- ---- ------- + Normal Created 3m57s rayjob-controller Created cluster rayjob-sample-raycluster-2h7ds + Normal Created 2m31s rayjob-controller Created k8s job rayjob-sample ``` ## Delete the RayJob instance