From 36c2978b9d971b337e6d4524ed1003cfceca5e82 Mon Sep 17 00:00:00 2001 From: Laurentiu Bradin <109964136+z103cb@users.noreply.github.com> Date: Fri, 22 Sep 2023 17:57:43 +0300 Subject: [PATCH] [Bug] Incorrect payload doc for creating ray jobs in apiserver Fixes #1290 --- apiserver/README.md | 1174 ++++++++++++++++++++----------------------- 1 file changed, 540 insertions(+), 634 deletions(-) diff --git a/apiserver/README.md b/apiserver/README.md index ed0ebd43c7..ec6eb9a1da 100644 --- a/apiserver/README.md +++ b/apiserver/README.md @@ -35,26 +35,30 @@ helm version # Install the KubeRay helm repo helm repo add kuberay https://ray-project.github.io/kuberay-helm/ - # Install KubeRay Operator v0.6.0. - helm install kuberay-operator kuberay/kuberay-operator --version 0.6.0 + # Install KubeRay Operator v1.0.0-rc.0. + helm install kuberay-operator kuberay/kuberay-operator --version v1.0.0-rc.0 # Check the KubeRay Operator Pod in `default` namespace kubectl get pods # NAME READY STATUS RESTARTS AGE # kuberay-operator-7456c6b69b-t6pt7 1/1 Running 0 172m - ``` ### Install KubeRay APIServer +```text +Please note that the examples shown here will only work with the nightly builds of the api-server. `v1.0.0-rc.0` does not yet contain critical fixes +to the api server that would allow Kuberay Serve endpoints to work properly. +``` + * Install a stable version via Helm repository (only supports KubeRay v0.4.0+) ```sh # Install the KubeRay helm repo helm repo add kuberay https://ray-project.github.io/kuberay-helm/ - # Install KubeRay APIServer v0.6.0. - helm install kuberay-apiserver kuberay/kuberay-apiserver --version 0.6.0 + # Install KubeRay APIServer. 
+ helm install kuberay-apiserver kuberay/kuberay-apiserver # Check the KubeRay APIServer Pod in `default` namespace kubectl get pods @@ -93,9 +97,9 @@ To list the deployments: ```sh helm ls -# NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION -# kuberay-apiserver default 1 2023-09-15 11:34:33.895054 +0300 EEST deployed #kuberay-apiserver-0.6.0 -# kuberay-operator default 1 2023-09-15 10:08:44.637539 +0300 EEST deployed kuberay-operator-0.6.0 +# NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION +# kuberay-apiserver default 1 2023-09-25 10:42:34.267328 +0300 EEST deployed kuberay-apiserver-1.0.0-rc.0 +# kuberay-operator default 1 2023-09-25 10:41:48.355831 +0300 EEST deployed kuberay-operator-1.0.0-rc.0 ``` ### Uninstall the Chart @@ -111,13 +115,13 @@ kubectl get pods ## Usage -After the deployment we may use the `{{baseUrl}}` to access the +After the deployment we may use the `{{baseUrl}}` to access the service. See [swagger support section](https://ray-project.github.io/kuberay/components/apiserver/#swagger-support) to get the complete definitions of APIs. -* (default) for nodeport access, we provide the default http port `31888` for connection and you can connect to the server using it +* (default) for nodeport access, use port `31888` for connection * for ingress access, you will need to create your own ingress -The requests parameters detail can be seen in [KubeRay swagger](https://github.com/ray-project/kuberay/tree/master/proto/swagger), here we only present some basic example: +The request parameter details can be found in [KubeRay swagger](https://github.com/ray-project/kuberay/tree/master/proto/swagger); this document only presents basic examples. 
### Setup a smoke test @@ -160,16 +164,17 @@ The following steps allow you to validate that the KubeRay API Server components image: kindest/node:v1.23.17@sha256:59c989ff8a517a93127d4a536e7014d28e235fb3529d9fba91b3951d461edfdb - role: worker image: kindest/node:v1.23.17@sha256:59c989ff8a517a93127d4a536e7014d28e235fb3529d9fba91b3951d461edfdb + EOF ``` 2. Deploy the KubeRay APIServer within the same cluster of KubeRay operator ```sh helm repo add kuberay https://ray-project.github.io/kuberay-helm/ - helm -n ray-system install kuberay-apiserver kuberay/kuberay-apiserver + helm -n ray-system install kuberay-apiserver kuberay/kuberay-apiserver -n ray-system --create-namespace ``` -3. The APIServer expose service using `NodePort` by default. You can test access by your host and port, the default port is set to `31888`. +3. The APIServer exposes its service using `NodePort` by default. You can test access using your host and port; the default port is `31888`. The examples below assume a kind (localhost) deployment. If the Kuberay API server is deployed on another type of cluster, you'll need to adjust the hostname to match your environment. ```sh curl localhost:31888 @@ -186,7 +191,7 @@ The following steps allow you to validate that the KubeRay API Server components * The following examples use the `ray-system` namespace. If not already created by using the helm install steps above, you can create it prior to executing the curl examples by running `kubectl create namespace ray-system` * The examples assume that the cluster has at least 2 CPUs available and 4 GB of free memory. You can either increase the CPUs available to your cluster (docker settings) or reduce the CPU request in the `compute_templates` request. - * If you are running the service and the kuberay operator on Apple Silicon Machine, you might want to use the `rayproject/ray:2.6.3-aarch64`image. 
+ * If you are running the service and the kuberay operator on an Apple Silicon machine, you might want to use the `rayproject/ray:2.7.0-aarch64` image. ```sh # Create a template @@ -248,7 +253,7 @@ The following steps allow you to validate that the KubeRay API Server components "clusterSpec": { "headGroupSpec": { "computeTemplate": "default-template", - "image": "rayproject/ray:2.6.3-py310", + "image": "rayproject/ray:2.7.0-py310", "serviceType": "NodePort", "rayStartParams": { "dashboard-host": "0.0.0.0", @@ -260,7 +265,7 @@ The following steps allow you to validate that the KubeRay API Server components { "groupName": "small-wg", "computeTemplate": "default-template", - "image": "rayproject/ray:2.6.3-py310", + "image": "rayproject/ray:2.7.0-py310", "replicas": 1, "minReplicas": 0, "maxReplicas": 5, @@ -275,8 +280,9 @@ The following steps allow you to validate that the KubeRay API Server components # Get the pods running in the namespace kubectl get pods -n ray-system # NAME READY STATUS RESTARTS AGE - # test-v1-raycluster-7c2n7-head-5qpwj 1/1 Running 0 14m - # test-v1-raycluster-7c2n7-worker-small-wg-9dbdk 1/1 Running 0 14m + # kuberay-apiserver-5dd7c9b4c8-rnrcq 1/1 Running 0 17m + # test-v1-raycluster-42vss-head-m4qj2 1/1 Running 0 91s + # test-v1-raycluster-42vss-worker-small-wg-svr79 1/1 Running 0 90s # Delete the RayService in the namespace curl --silent -X 'DELETE' \ @@ -286,13 +292,19 @@ The following steps allow you to validate that the KubeRay API Server components ## Swagger Support -Kuberay API server has support for Swagger UI and can be reached at [localhost:31888/swagger-ui](localhost:31888/swagger-ui) for local deployments or for nodeport deployments `:31888/swagger-ui` +Kuberay API server has support for Swagger UI. 
The swagger page can be reached at: + +* [localhost:31888/swagger-ui](http://localhost:31888/swagger-ui) for local kind deployments +* [localhost:8888/swagger-ui](http://localhost:8888/swagger-ui) for instances started with `make run` (development machine builds) +* `<host>:31888/swagger-ui` for nodeport deployments ## Full definition endpoints ### Compute Template -For the purpose to simplify the setting of resource, we abstract the resource of the pods template resource to the `compute template` for usage, you can define the resource in the `compute template` and then choose the appropriate template for your `head` and `workergroup` when you are creating the real objects of `RayCluster`, `RayJobs` or `RayService`. +To simplify the setting of resources, the Kuberay API server abstracts the pod template resources into the `compute template`. You can define the resources in the `compute template` and then choose the appropriate template for your `head` and `workergroup` when you are creating the objects of `RayCluster`, `RayJobs` or `RayService`. + +The full definition of the compute template resource can be found in [config.proto](../proto/config.proto) or the Kuberay API server swagger doc. 
#### Create compute templates in a given namespace @@ -380,12 +392,6 @@ Examples: ```json { "computeTemplates": [ - { - "name": "default-template", - "namespace": "default", - "cpu": 2, - "memory": 4 - }, { "name": "default-template", "namespace": "ray-system", @@ -466,12 +472,12 @@ Examples: "name": "test-cluster", "namespace": "ray-system", "user": "3cpo", - "version": "2.6.3", + "version": "2.7.0", "environment": "DEV", "clusterSpec": { "headGroupSpec": { "computeTemplate": "default-template", - "image": "rayproject/ray:2.6.3", + "image": "rayproject/ray:2.7.0", "serviceType": "NodePort", "rayStartParams": { "dashboard-host": "0.0.0.0", @@ -483,10 +489,10 @@ Examples: { "groupName": "small-wg", "computeTemplate": "default-template", - "image": "rayproject/ray:2.6.3", - "replicas": 2, - "minReplicas": 0, - "maxReplicas": 5, + "image": "rayproject/ray:2.7.0", + "replicas": 1, + "minReplicas": 1, + "maxReplicas": 1, "rayStartParams": { "node-ip-address": "$MY_POD_IP" } @@ -503,11 +509,11 @@ Examples: "name": "test-cluster", "namespace": "ray-system", "user": "3cpo", - "version": "2.6.3", + "version": "2.7.0", "clusterSpec": { "headGroupSpec": { "computeTemplate": "default-template", - "image": "rayproject/ray:2.6.3", + "image": "rayproject/ray:2.7.0", "serviceType": "NodePort", "rayStartParams": { "dashboard-host": "0.0.0.0", @@ -518,10 +524,10 @@ Examples: { "groupName": "small-wg", "computeTemplate": "default-template", - "image": "rayproject/ray:2.6.3", - "replicas": 2, - "minReplicas": 5, - "maxReplicas": 2, + "image": "rayproject/ray:2.7.0", + "replicas": 1, + "minReplicas": 1, + "maxReplicas": 1, "rayStartParams": { "node-ip-address": "$MY_POD_IP" } @@ -529,9 +535,44 @@ Examples: ] }, "annotations": { - "ray.io/creation-timestamp": "2023-09-14 12:31:58.070884 +0000 UTC" + "ray.io/creation-timestamp": "2023-09-25 10:48:35.766443417 +0000 UTC" }, - "createdAt": "2023-09-14T12:31:58Z" + "createdAt": "2023-09-25T10:48:35Z", + "events": [ + { + "id": 
"test-cluster.178817bd10374138", + "name": "test-cluster-test-cluster.178817bd10374138", + "createdAt": "2023-09-25T08:42:40Z", + "firstTimestamp": "2023-09-25T08:42:40Z", + "lastTimestamp": "2023-09-25T08:42:40Z", + "reason": "Created", + "message": "Created service test-cluster-head-svc", + "type": "Normal", + "count": 1 + }, + { + "id": "test-cluster.178817bd251e9c7c", + "name": "test-cluster-test-cluster.178817bd251e9c7c", + "createdAt": "2023-09-25T08:42:40Z", + "firstTimestamp": "2023-09-25T08:42:40Z", + "lastTimestamp": "2023-09-25T08:42:40Z", + "reason": "Created", + "message": "Created head pod test-cluster-head-rsbmm", + "type": "Normal", + "count": 1 + }, + { + "id": "test-cluster.178817bd2b74493f", + "name": "test-cluster-test-cluster.178817bd2b74493f", + "createdAt": "2023-09-25T08:42:40Z", + "firstTimestamp": "2023-09-25T08:42:40Z", + "lastTimestamp": "2023-09-25T08:42:41Z", + "reason": "Created", + "message": "Created worker pod ", + "type": "Normal", + "count": 2 + } + ] } ``` @@ -560,11 +601,11 @@ Examples: "name": "test-cluster", "namespace": "ray-system", "user": "3cpo", - "version": "2.6.3", + "version": "2.7.0", "clusterSpec": { "headGroupSpec": { "computeTemplate": "default-template", - "image": "rayproject/ray:2.6.3", + "image": "rayproject/ray:2.7.0", "serviceType": "NodePort", "rayStartParams": { "dashboard-host": "0.0.0.0", @@ -575,10 +616,10 @@ Examples: { "groupName": "small-wg", "computeTemplate": "default-template", - "image": "rayproject/ray:2.6.3", - "replicas": 2, - "minReplicas": 5, - "maxReplicas": 2, + "image": "rayproject/ray:2.7.0", + "replicas": 1, + "minReplicas": 1, + "maxReplicas": 1, "rayStartParams": { "node-ip-address": "$MY_POD_IP" } @@ -586,54 +627,87 @@ Examples: ] }, "annotations": { - "ray.io/creation-timestamp": "2023-09-15 11:31:08.304211723 +0000 UTC" + "ray.io/creation-timestamp": "2023-09-25 10:48:35.766443417 +0000 UTC" }, - "createdAt": "2023-09-15T11:31:08Z", + "createdAt": "2023-09-25T10:48:35Z", 
"clusterState": "ready", "events": [ { - "id": "test-cluster.17850f20cdb520b5", - "name": "test-cluster-test-cluster.17850f20cdb520b5", - "createdAt": "2023-09-15T11:31:08Z", - "firstTimestamp": "2023-09-15T11:31:08Z", - "lastTimestamp": "2023-09-15T11:31:08Z", + "id": "test-cluster.178817bd10374138", + "name": "test-cluster-test-cluster.178817bd10374138", + "createdAt": "2023-09-25T08:42:40Z", + "firstTimestamp": "2023-09-25T08:42:40Z", + "lastTimestamp": "2023-09-25T08:42:40Z", "reason": "Created", "message": "Created service test-cluster-head-svc", "type": "Normal", "count": 1 }, { - "id": "test-cluster.17850f20d04c4601", - "name": "test-cluster-test-cluster.17850f20d04c4601", - "createdAt": "2023-09-15T11:31:08Z", - "firstTimestamp": "2023-09-15T11:31:08Z", - "lastTimestamp": "2023-09-15T11:31:08Z", + "id": "test-cluster.178817bd251e9c7c", + "name": "test-cluster-test-cluster.178817bd251e9c7c", + "createdAt": "2023-09-25T08:42:40Z", + "firstTimestamp": "2023-09-25T08:42:40Z", + "lastTimestamp": "2023-09-25T08:42:40Z", "reason": "Created", - "message": "Created head pod test-cluster-head-v4dmh", + "message": "Created head pod test-cluster-head-rsbmm", "type": "Normal", "count": 1 }, { - "id": "test-cluster.17850f20d642aa31", - "name": "test-cluster-test-cluster.17850f20d642aa31", - "createdAt": "2023-09-15T11:31:08Z", - "firstTimestamp": "2023-09-15T11:31:08Z", - "lastTimestamp": "2023-09-15T11:31:08Z", + "id": "test-cluster.178817bd2b74493f", + "name": "test-cluster-test-cluster.178817bd2b74493f", + "createdAt": "2023-09-25T08:42:40Z", + "firstTimestamp": "2023-09-25T08:42:40Z", + "lastTimestamp": "2023-09-25T08:42:41Z", "reason": "Created", "message": "Created worker pod ", "type": "Normal", "count": 2 + }, + { + "id": "test-cluster.17881e9c2b82c449", + "name": "test-cluster-test-cluster.17881e9c2b82c449", + "createdAt": "2023-09-25T10:48:35Z", + "firstTimestamp": "2023-09-25T10:48:35Z", + "lastTimestamp": "2023-09-25T10:48:35Z", + "reason": "Created", + 
"message": "Created service test-cluster-head-svc", + "type": "Normal", + "count": 1 + }, + { + "id": "test-cluster.17881e9c2e9cd4b8", + "name": "test-cluster-test-cluster.17881e9c2e9cd4b8", + "createdAt": "2023-09-25T10:48:35Z", + "firstTimestamp": "2023-09-25T10:48:35Z", + "lastTimestamp": "2023-09-25T10:48:35Z", + "reason": "Created", + "message": "Created head pod test-cluster-head-nglmx", + "type": "Normal", + "count": 1 + }, + { + "id": "test-cluster.17881e9c34460442", + "name": "test-cluster-test-cluster.17881e9c34460442", + "createdAt": "2023-09-25T10:48:35Z", + "firstTimestamp": "2023-09-25T10:48:35Z", + "lastTimestamp": "2023-09-25T10:48:35Z", + "reason": "Created", + "message": "Created worker pod ", + "type": "Normal", + "count": 1 } ], "serviceEndpoint": { - "dashboard": "30529", - "head": "32208", - "metrics": "32623", - "redis": "30105" + "dashboard": "31476", + "head": "31850", + "metrics": "32189", + "redis": "30736" } } ] - } + } ``` #### List all clusters in all namespaces @@ -647,7 +721,7 @@ Examples: * Request ```sh - curl -X 'GET' \ + curl --silent -X 'GET' \ 'http://localhost:31888/apis/v1alpha2/clusters' \ -H 'accept: application/json' ``` @@ -661,11 +735,11 @@ Examples: "name": "test-cluster", "namespace": "ray-system", "user": "3cpo", - "version": "2.6.3", + "version": "2.7.0", "clusterSpec": { "headGroupSpec": { "computeTemplate": "default-template", - "image": "rayproject/ray:2.6.3", + "image": "rayproject/ray:2.7.0", "serviceType": "NodePort", "rayStartParams": { "dashboard-host": "0.0.0.0", @@ -676,10 +750,10 @@ Examples: { "groupName": "small-wg", "computeTemplate": "default-template", - "image": "rayproject/ray:2.6.3", - "replicas": 2, - "minReplicas": 5, - "maxReplicas": 2, + "image": "rayproject/ray:2.7.0", + "replicas": 1, + "minReplicas": 1, + "maxReplicas": 1, "rayStartParams": { "node-ip-address": "$MY_POD_IP" } @@ -687,54 +761,87 @@ Examples: ] }, "annotations": { - "ray.io/creation-timestamp": "2023-09-15 11:31:08.304211723 
+0000 UTC" + "ray.io/creation-timestamp": "2023-09-25 10:48:35.766443417 +0000 UTC" }, - "createdAt": "2023-09-15T11:31:08Z", + "createdAt": "2023-09-25T10:48:35Z", "clusterState": "ready", "events": [ { - "id": "test-cluster.17850f20cdb520b5", - "name": "test-cluster-test-cluster.17850f20cdb520b5", - "createdAt": "2023-09-15T11:31:08Z", - "firstTimestamp": "2023-09-15T11:31:08Z", - "lastTimestamp": "2023-09-15T11:31:08Z", + "id": "test-cluster.178817bd10374138", + "name": "test-cluster-test-cluster.178817bd10374138", + "createdAt": "2023-09-25T08:42:40Z", + "firstTimestamp": "2023-09-25T08:42:40Z", + "lastTimestamp": "2023-09-25T08:42:40Z", "reason": "Created", "message": "Created service test-cluster-head-svc", "type": "Normal", "count": 1 }, { - "id": "test-cluster.17850f20d04c4601", - "name": "test-cluster-test-cluster.17850f20d04c4601", - "createdAt": "2023-09-15T11:31:08Z", - "firstTimestamp": "2023-09-15T11:31:08Z", - "lastTimestamp": "2023-09-15T11:31:08Z", + "id": "test-cluster.178817bd251e9c7c", + "name": "test-cluster-test-cluster.178817bd251e9c7c", + "createdAt": "2023-09-25T08:42:40Z", + "firstTimestamp": "2023-09-25T08:42:40Z", + "lastTimestamp": "2023-09-25T08:42:40Z", "reason": "Created", - "message": "Created head pod test-cluster-head-v4dmh", + "message": "Created head pod test-cluster-head-rsbmm", "type": "Normal", "count": 1 }, { - "id": "test-cluster.17850f20d642aa31", - "name": "test-cluster-test-cluster.17850f20d642aa31", - "createdAt": "2023-09-15T11:31:08Z", - "firstTimestamp": "2023-09-15T11:31:08Z", - "lastTimestamp": "2023-09-15T11:31:08Z", + "id": "test-cluster.178817bd2b74493f", + "name": "test-cluster-test-cluster.178817bd2b74493f", + "createdAt": "2023-09-25T08:42:40Z", + "firstTimestamp": "2023-09-25T08:42:40Z", + "lastTimestamp": "2023-09-25T08:42:41Z", "reason": "Created", "message": "Created worker pod ", "type": "Normal", "count": 2 + }, + { + "id": "test-cluster.17881e9c2b82c449", + "name": 
"test-cluster-test-cluster.17881e9c2b82c449", + "createdAt": "2023-09-25T10:48:35Z", + "firstTimestamp": "2023-09-25T10:48:35Z", + "lastTimestamp": "2023-09-25T10:48:35Z", + "reason": "Created", + "message": "Created service test-cluster-head-svc", + "type": "Normal", + "count": 1 + }, + { + "id": "test-cluster.17881e9c2e9cd4b8", + "name": "test-cluster-test-cluster.17881e9c2e9cd4b8", + "createdAt": "2023-09-25T10:48:35Z", + "firstTimestamp": "2023-09-25T10:48:35Z", + "lastTimestamp": "2023-09-25T10:48:35Z", + "reason": "Created", + "message": "Created head pod test-cluster-head-nglmx", + "type": "Normal", + "count": 1 + }, + { + "id": "test-cluster.17881e9c34460442", + "name": "test-cluster-test-cluster.17881e9c34460442", + "createdAt": "2023-09-25T10:48:35Z", + "firstTimestamp": "2023-09-25T10:48:35Z", + "lastTimestamp": "2023-09-25T10:48:35Z", + "reason": "Created", + "message": "Created worker pod ", + "type": "Normal", + "count": 1 } ], "serviceEndpoint": { - "dashboard": "30529", - "head": "32208", - "metrics": "32623", - "redis": "30105" + "dashboard": "31476", + "head": "31850", + "metrics": "32189", + "redis": "30736" } } ] - } + } ``` #### Get cluster by its name and namespace @@ -757,109 +864,117 @@ Examples: ```json { - "name": "test-cluster", - "namespace": "ray-system", - "user": "3cpo", - "version": "2.6.3", - "clusterSpec": { - "headGroupSpec": { - "computeTemplate": "default-template", - "image": "rayproject/ray:2.6.3", - "serviceType": "NodePort", - "rayStartParams": { - "dashboard-host": "0.0.0.0", - "metrics-export-port": "8080" - } - }, - "workerGroupSpec": [ - { - "groupName": "small-wg", - "computeTemplate": "default-template", - "image": "rayproject/ray:2.6.3", - "replicas": 2, - "minReplicas": 5, - "maxReplicas": 2, - "rayStartParams": { - "node-ip-address": "$MY_POD_IP" + "clusters": [ + { + "name": "test-cluster", + "namespace": "ray-system", + "user": "3cpo", + "version": "2.7.0", + "clusterSpec": { + "headGroupSpec": { + 
"computeTemplate": "default-template", + "image": "rayproject/ray:2.7.0", + "serviceType": "NodePort", + "rayStartParams": { + "dashboard-host": "0.0.0.0", + "metrics-export-port": "8080" + } + }, + "workerGroupSpec": [ + { + "groupName": "small-wg", + "computeTemplate": "default-template", + "image": "rayproject/ray:2.7.0", + "replicas": 1, + "minReplicas": 1, + "maxReplicas": 1, + "rayStartParams": { + "node-ip-address": "$MY_POD_IP" + } + } + ] + }, + "annotations": { + "ray.io/creation-timestamp": "2023-09-25 10:48:35.766443417 +0000 UTC" + }, + "createdAt": "2023-09-25T10:48:35Z", + "clusterState": "ready", + "events": [ + { + "id": "test-cluster.178817bd10374138", + "name": "test-cluster-test-cluster.178817bd10374138", + "createdAt": "2023-09-25T08:42:40Z", + "firstTimestamp": "2023-09-25T08:42:40Z", + "lastTimestamp": "2023-09-25T08:42:40Z", + "reason": "Created", + "message": "Created service test-cluster-head-svc", + "type": "Normal", + "count": 1 + }, + { + "id": "test-cluster.178817bd251e9c7c", + "name": "test-cluster-test-cluster.178817bd251e9c7c", + "createdAt": "2023-09-25T08:42:40Z", + "firstTimestamp": "2023-09-25T08:42:40Z", + "lastTimestamp": "2023-09-25T08:42:40Z", + "reason": "Created", + "message": "Created head pod test-cluster-head-rsbmm", + "type": "Normal", + "count": 1 + }, + { + "id": "test-cluster.178817bd2b74493f", + "name": "test-cluster-test-cluster.178817bd2b74493f", + "createdAt": "2023-09-25T08:42:40Z", + "firstTimestamp": "2023-09-25T08:42:40Z", + "lastTimestamp": "2023-09-25T08:42:41Z", + "reason": "Created", + "message": "Created worker pod ", + "type": "Normal", + "count": 2 + }, + { + "id": "test-cluster.17881e9c2b82c449", + "name": "test-cluster-test-cluster.17881e9c2b82c449", + "createdAt": "2023-09-25T10:48:35Z", + "firstTimestamp": "2023-09-25T10:48:35Z", + "lastTimestamp": "2023-09-25T10:48:35Z", + "reason": "Created", + "message": "Created service test-cluster-head-svc", + "type": "Normal", + "count": 1 + }, + { + "id": 
"test-cluster.17881e9c2e9cd4b8", + "name": "test-cluster-test-cluster.17881e9c2e9cd4b8", + "createdAt": "2023-09-25T10:48:35Z", + "firstTimestamp": "2023-09-25T10:48:35Z", + "lastTimestamp": "2023-09-25T10:48:35Z", + "reason": "Created", + "message": "Created head pod test-cluster-head-nglmx", + "type": "Normal", + "count": 1 + }, + { + "id": "test-cluster.17881e9c34460442", + "name": "test-cluster-test-cluster.17881e9c34460442", + "createdAt": "2023-09-25T10:48:35Z", + "firstTimestamp": "2023-09-25T10:48:35Z", + "lastTimestamp": "2023-09-25T10:48:35Z", + "reason": "Created", + "message": "Created worker pod ", + "type": "Normal", + "count": 1 } + ], + "serviceEndpoint": { + "dashboard": "31476", + "head": "31850", + "metrics": "32189", + "redis": "30736" } - ] - }, - "createdAt": "2023-09-06T11:42:03Z", - "events": [ - { - "id": "test-cluster.17824c260cc502ff", - "name": "test-cluster-test-cluster.17824c260cc502ff", - "createdAt": "2023-09-06T11:35:36Z", - "firstTimestamp": "2023-09-06T11:35:36Z", - "lastTimestamp": "2023-09-06T11:35:36Z", - "reason": "Created", - "message": "Created service test-cluster-head-svc", - "type": "Normal", - "count": 1 - }, - { - "id": "test-cluster.17824c26117bd44f", - "name": "test-cluster-test-cluster.17824c26117bd44f", - "createdAt": "2023-09-06T11:35:36Z", - "firstTimestamp": "2023-09-06T11:35:36Z", - "lastTimestamp": "2023-09-06T11:35:36Z", - "reason": "Created", - "message": "Created head pod test-cluster-head-bh75l", - "type": "Normal", - "count": 1 - }, - { - "id": "test-cluster.17824c261783b274", - "name": "test-cluster-test-cluster.17824c261783b274", - "createdAt": "2023-09-06T11:35:36Z", - "firstTimestamp": "2023-09-06T11:35:36Z", - "lastTimestamp": "2023-09-06T11:35:36Z", - "reason": "Created", - "message": "Created worker pod ", - "type": "Normal", - "count": 2 - }, - { - "id": "test-cluster.17824c8032b97213", - "name": "test-cluster-test-cluster.17824c8032b97213", - "createdAt": "2023-09-06T11:42:03Z", - 
"firstTimestamp": "2023-09-06T11:42:03Z", - "lastTimestamp": "2023-09-06T11:42:03Z", - "reason": "Created", - "message": "Created service test-cluster-head-svc", - "type": "Normal", - "count": 1 - }, - { - "id": "test-cluster.17824c8033d66898", - "name": "test-cluster-test-cluster.17824c8033d66898", - "createdAt": "2023-09-06T11:42:03Z", - "firstTimestamp": "2023-09-06T11:42:03Z", - "lastTimestamp": "2023-09-06T11:42:03Z", - "reason": "Created", - "message": "Created head pod test-cluster-head-m4nng", - "type": "Normal", - "count": 1 - }, - { - "id": "test-cluster.17824c803897ec83", - "name": "test-cluster-test-cluster.17824c803897ec83", - "createdAt": "2023-09-06T11:42:03Z", - "firstTimestamp": "2023-09-06T11:42:03Z", - "lastTimestamp": "2023-09-06T11:42:03Z", - "reason": "Created", - "message": "Created worker pod ", - "type": "Normal", - "count": 2 } - ], - "serviceEndpoint": { - "dashboard": "32348", - "head": "32349", - "metrics": "32134", - "redis": "30092" - } + ] } ``` @@ -910,7 +1025,7 @@ Examples: "clusterSpec": { "headGroupSpec": { "computeTemplate": "default-template", - "image": "rayproject/ray:2.6.3", + "image": "rayproject/ray:2.7.0", "serviceType": "NodePort", "rayStartParams": { "dashboard-host": "0.0.0.0" @@ -920,7 +1035,7 @@ Examples: { "groupName": "small-wg", "computeTemplate": "default-template", - "image": "rayproject/ray:2.6.3", + "image": "rayproject/ray:2.7.0", "replicas": 1, "minReplicas": 0, "maxReplicas": 1, @@ -945,7 +1060,7 @@ Examples: "clusterSpec": { "headGroupSpec": { "computeTemplate": "default-template", - "image": "rayproject/ray:2.6.3", + "image": "rayproject/ray:2.7.0", "serviceType": "NodePort", "rayStartParams": { "dashboard-host": "0.0.0.0" @@ -955,7 +1070,7 @@ Examples: { "groupName": "small-wg", "computeTemplate": "default-template", - "image": "rayproject/ray:2.6.3", + "image": "rayproject/ray:2.7.0", "replicas": 1, "minReplicas": 1, "maxReplicas": 1, @@ -965,7 +1080,7 @@ Examples: } ] }, - "createdAt": 
"2023-09-18T07:13:34Z" + "createdAt": "2023-09-25T11:36:02Z" } ``` @@ -995,11 +1110,11 @@ Examples: "namespace": "ray-system", "user": "3cp0", "entrypoint": "python -V", - "jobId": "rayjob-test-k58tz", + "jobId": "rayjob-test-drhlq", "clusterSpec": { "headGroupSpec": { "computeTemplate": "default-template", - "image": "rayproject/ray:2.6.3", + "image": "rayproject/ray:2.7.0", "serviceType": "NodePort", "rayStartParams": { "dashboard-host": "0.0.0.0" @@ -1009,7 +1124,7 @@ Examples: { "groupName": "small-wg", "computeTemplate": "default-template", - "image": "rayproject/ray:2.6.3", + "image": "rayproject/ray:2.7.0", "replicas": 1, "minReplicas": 1, "maxReplicas": 1, @@ -1019,7 +1134,7 @@ Examples: } ] }, - "createdAt": "2023-09-18T07:13:34Z", + "createdAt": "2023-09-25T11:36:02Z", "jobStatus": "SUCCEEDED", "jobDeploymentStatus": "Running", "message": "Job finished successfully." @@ -1041,8 +1156,7 @@ Examples: ```sh curl --silent -X 'GET' \ 'http://localhost:31888/apis/v1alpha2/jobs' \ - -H 'accept: application/json' - + -H 'accept: application/json' ``` * Response @@ -1055,11 +1169,11 @@ Examples: "namespace": "ray-system", "user": "3cp0", "entrypoint": "python -V", - "jobId": "rayjob-test-k58tz", + "jobId": "rayjob-test-drhlq", "clusterSpec": { "headGroupSpec": { "computeTemplate": "default-template", - "image": "rayproject/ray:2.6.3", + "image": "rayproject/ray:2.7.0", "serviceType": "NodePort", "rayStartParams": { "dashboard-host": "0.0.0.0" @@ -1069,7 +1183,7 @@ Examples: { "groupName": "small-wg", "computeTemplate": "default-template", - "image": "rayproject/ray:2.6.3", + "image": "rayproject/ray:2.7.0", "replicas": 1, "minReplicas": 1, "maxReplicas": 1, @@ -1079,13 +1193,13 @@ Examples: } ] }, - "createdAt": "2023-09-18T07:13:34Z", + "createdAt": "2023-09-25T11:36:02Z", "jobStatus": "SUCCEEDED", "jobDeploymentStatus": "Running", "message": "Job finished successfully." 
} ] - } + } ``` #### Get job by its name and namespace @@ -1112,11 +1226,11 @@ Examples: "namespace": "ray-system", "user": "3cp0", "entrypoint": "python -V", - "jobId": "rayjob-test-k58tz", + "jobId": "rayjob-test-drhlq", "clusterSpec": { "headGroupSpec": { "computeTemplate": "default-template", - "image": "rayproject/ray:2.6.3", + "image": "rayproject/ray:2.7.0", "serviceType": "NodePort", "rayStartParams": { "dashboard-host": "0.0.0.0" @@ -1126,7 +1240,7 @@ Examples: { "groupName": "small-wg", "computeTemplate": "default-template", - "image": "rayproject/ray:2.6.3", + "image": "rayproject/ray:2.7.0", "replicas": 1, "minReplicas": 1, "maxReplicas": 1, @@ -1136,7 +1250,7 @@ Examples: } ] }, - "createdAt": "2023-09-18T07:13:34Z", + "createdAt": "2023-09-25T11:36:02Z", "jobStatus": "SUCCEEDED", "jobDeploymentStatus": "Running", "message": "Job finished successfully." @@ -1228,7 +1342,7 @@ Examples: "clusterSpec": { "headGroupSpec": { "computeTemplate": "default-template", - "image": "rayproject/ray:2.6.3-py310", + "image": "rayproject/ray:2.7.0-py310", "serviceType": "NodePort", "rayStartParams": { "dashboard-host": "0.0.0.0", @@ -1240,7 +1354,7 @@ Examples: { "groupName": "small-wg", "computeTemplate": "default-template", - "image": "rayproject/ray:2.6.3-py310", + "image": "rayproject/ray:2.7.0-py310", "replicas": 1, "minReplicas": 0, "maxReplicas": 5, @@ -1256,7 +1370,7 @@ Examples: * Response (V1) ```json - { + { "name": "test-v1", "namespace": "ray-system", "user": "user", @@ -1302,7 +1416,7 @@ Examples: "clusterSpec": { "headGroupSpec": { "computeTemplate": "default-template", - "image": "rayproject/ray:2.6.3-py310", + "image": "rayproject/ray:2.7.0-py310", "serviceType": "NodePort", "rayStartParams": { "dashboard-host": "0.0.0.0", @@ -1313,7 +1427,7 @@ Examples: { "groupName": "small-wg", "computeTemplate": "default-template", - "image": "rayproject/ray:2.6.3-py310", + "image": "rayproject/ray:2.7.0-py310", "replicas": 1, "minReplicas": 5, "maxReplicas": 1, @@ 
-1324,7 +1438,7 @@ Examples: ] }, "rayServiceStatus": {}, - "createdAt": "2023-09-18T07:55:23Z", + "createdAt": "2023-09-25T11:42:11Z", "deleteAt": "1969-12-31T23:59:59Z" } ``` @@ -1332,7 +1446,7 @@ Examples: * Request (V2) ```sh - curl -X 'POST' \ + curl --silent -X 'POST' \ 'http://localhost:31888/apis/v1alpha2/namespaces/ray-system/services' \ -H 'accept: application/json' \ -H 'Content-Type: application/json' \ @@ -1347,7 +1461,7 @@ Examples: "clusterSpec": { "headGroupSpec": { "computeTemplate": "default-template", - "image": "rayproject/ray:2.6.3-py310-aarch64", + "image": "rayproject/ray:2.7.0-py310", "serviceType": "NodePort", "rayStartParams": { "dashboard-host": "0.0.0.0", @@ -1359,7 +1473,7 @@ Examples: { "groupName": "small-wg", "computeTemplate": "default-template", - "image": "rayproject/ray:2.6.3-py310-aarch64", + "image": "rayproject/ray:2.7.0-py310", "replicas": 1, "minReplicas": 0, "maxReplicas": 5, @@ -1385,7 +1499,7 @@ Examples: "clusterSpec": { "headGroupSpec": { "computeTemplate": "default-template", - "image": "rayproject/ray:2.6.3-py310-aarch64", + "image": "rayproject/ray:2.7.0-py310", "serviceType": "NodePort", "rayStartParams": { "dashboard-host": "0.0.0.0", @@ -1396,7 +1510,7 @@ Examples: { "groupName": "small-wg", "computeTemplate": "default-template", - "image": "rayproject/ray:2.6.3-py310-aarch64", + "image": "rayproject/ray:2.7.0-py310", "replicas": 1, "minReplicas": 5, "maxReplicas": 1, @@ -1406,24 +1520,10 @@ Examples: } ] }, - "rayServiceStatus": { - "rayServiceEvents": [ - { - "id": "test-v2.1785f3479cd90185", - "name": "test-v2-test-v2.1785f3479cd90185", - "createdAt": "2023-09-18T09:12:03Z", - "firstTimestamp": "2023-09-18T09:12:03Z", - "lastTimestamp": "2023-09-18T09:12:35Z", - "reason": "ServiceUnhealthy", - "message": "The service is in an unhealthy state. 
Controller will perform a round of actions in 10s.", - "type": "Normal", - "count": 5 - } - ] - }, - "createdAt": "2023-09-18T09:13:24Z", + "rayServiceStatus": {}, + "createdAt": "2023-09-25T11:44:41Z", "deleteAt": "1969-12-31T23:59:59Z" - } + } ``` #### List all services in a given namespace @@ -1447,197 +1547,100 @@ Examples ```json { "services": [ - { - "name": "test-v1", - "namespace": "ray-system", - "user": "user", - "serveDeploymentGraphSpec": { - "importPath": "fruit.deployment_graph", - "runtimeEnv": "working_dir: \"https://github.com/ray-project/test_dag/archive/c620251044717ace0a4c19d766d43c5099af8a77.zip\"\n", - "serveConfigs": [ - { - "deploymentName": "OrangeStand", - "replicas": 1, - "userConfig": "price: 2", - "actorOptions": { - "cpusPerActor": 0.1 - } - }, - { - "deploymentName": "PearStand", - "replicas": 1, - "userConfig": "price: 1", - "actorOptions": { - "cpusPerActor": 0.1 + { + "name": "test-v2", + "namespace": "ray-system", + "user": "user", + "serveConfigV2": "applications:\n - name: fruit_app\n import_path: fruit.deployment_graph\n route_prefix: /fruit\n runtime_env:\n working_dir: \"https://github.com/ray-project/test_dag/archive/41d09119cbdf8450599f993f51318e9e27c59098.zip\"\n deployments:\n - name: MangoStand\n num_replicas: 1\n user_config:\n price: 3\n ray_actor_options:\n num_cpus: 0.1\n - name: OrangeStand\n num_replicas: 1\n user_config:\n price: 2\n ray_actor_options:\n num_cpus: 0.1\n - name: PearStand\n num_replicas: 1\n user_config:\n price: 1\n ray_actor_options:\n num_cpus: 0.1\n - name: FruitMarket\n num_replicas: 1\n ray_actor_options:\n num_cpus: 0.1\n - name: DAGDriver\n num_replicas: 1\n ray_actor_options:\n num_cpus: 0.1\n - name: math_app\n import_path: conditional_dag.serve_dag\n route_prefix: /calc\n runtime_env:\n working_dir: \"https://github.com/ray-project/test_dag/archive/41d09119cbdf8450599f993f51318e9e27c59098.zip\"\n deployments:\n - name: Adder\n num_replicas: 1\n user_config:\n increment: 3\n 
ray_actor_options:\n num_cpus: 0.1\n - name: Multiplier\n num_replicas: 1\n user_config:\n factor: 5\n ray_actor_options:\n num_cpus: 0.1\n - name: Router\n num_replicas: 1\n - name: create_order\n num_replicas: 1\n - name: DAGDriver\n num_replicas: 1\n", + "serviceUnhealthySecondThreshold": 900, + "deploymentUnhealthySecondThreshold": 300, + "clusterSpec": { + "headGroupSpec": { + "computeTemplate": "default-template", + "image": "rayproject/ray:2.7.0-py310", + "serviceType": "NodePort", + "rayStartParams": { + "dashboard-host": "0.0.0.0", + "metrics-export-port": "8080" } }, - { - "deploymentName": "FruitMarket", - "replicas": 1, - "actorOptions": { - "cpusPerActor": 0.1 + "workerGroupSpec": [ + { + "groupName": "small-wg", + "computeTemplate": "default-template", + "image": "rayproject/ray:2.7.0-py310", + "replicas": 1, + "minReplicas": 5, + "maxReplicas": 1, + "rayStartParams": { + "node-ip-address": "$MY_POD_IP" + } } - }, - { - "deploymentName": "DAGDriver", - "replicas": 1, - "routePrefix": "/", - "actorOptions": { - "cpusPerActor": 0.1 + ] + }, + "rayServiceStatus": { + "rayServiceEvents": [ + { + "id": "test-v2.178821ac4b15c743", + "name": "test-v2-test-v2.178821ac4b15c743", + "createdAt": "2023-09-25T11:44:43Z", + "firstTimestamp": "2023-09-25T11:44:43Z", + "lastTimestamp": "2023-09-25T11:46:15Z", + "reason": "ServiceUnhealthy", + "message": "The service is in an unhealthy state. 
Controller will perform a round of actions in 10s.", + "type": "Normal", + "count": 11 } - } - ] - }, - "serviceUnhealthySecondThreshold": 900, - "deploymentUnhealthySecondThreshold": 300, - "clusterSpec": { - "headGroupSpec": { - "computeTemplate": "default-template", - "image": "rayproject/ray:2.6.3-py310", - "serviceType": "NodePort", - "rayStartParams": { - "dashboard-host": "0.0.0.0", - "metrics-export-port": "8080" - } + ] }, - "workerGroupSpec": [ - { - "groupName": "small-wg", - "computeTemplate": "default-template", - "image": "rayproject/ray:2.6.3-py310", - "replicas": 1, - "minReplicas": 5, - "maxReplicas": 1, + "createdAt": "2023-09-25T11:44:41Z", + "deleteAt": "1969-12-31T23:59:59Z" + }, + { + "name": "test-v2", + "namespace": "ray-system", + "user": "user", + "serveConfigV2": "applications:\n - name: fruit_app\n import_path: fruit.deployment_graph\n route_prefix: /fruit\n runtime_env:\n working_dir: \"https://github.com/ray-project/test_dag/archive/41d09119cbdf8450599f993f51318e9e27c59098.zip\"\n deployments:\n - name: MangoStand\n num_replicas: 1\n user_config:\n price: 3\n ray_actor_options:\n num_cpus: 0.1\n - name: OrangeStand\n num_replicas: 1\n user_config:\n price: 2\n ray_actor_options:\n num_cpus: 0.1\n - name: PearStand\n num_replicas: 1\n user_config:\n price: 1\n ray_actor_options:\n num_cpus: 0.1\n - name: FruitMarket\n num_replicas: 1\n ray_actor_options:\n num_cpus: 0.1\n - name: DAGDriver\n num_replicas: 1\n ray_actor_options:\n num_cpus: 0.1\n - name: math_app\n import_path: conditional_dag.serve_dag\n route_prefix: /calc\n runtime_env:\n working_dir: \"https://github.com/ray-project/test_dag/archive/41d09119cbdf8450599f993f51318e9e27c59098.zip\"\n deployments:\n - name: Adder\n num_replicas: 1\n user_config:\n increment: 3\n ray_actor_options:\n num_cpus: 0.1\n - name: Multiplier\n num_replicas: 1\n user_config:\n factor: 5\n ray_actor_options:\n num_cpus: 0.1\n - name: Router\n num_replicas: 1\n - name: create_order\n num_replicas: 
1\n - name: DAGDriver\n num_replicas: 1\n", + "serviceUnhealthySecondThreshold": 900, + "deploymentUnhealthySecondThreshold": 300, + "clusterSpec": { + "headGroupSpec": { + "serviceType": "NodePort", "rayStartParams": { - "node-ip-address": "$MY_POD_IP" + "dashboard-host": "0.0.0.0", + "metrics-export-port": "8080" } - } - ] - }, - "rayServiceStatus": { - "rayServiceEvents": [ - { - "id": "test-v1.1785ef1932a53822", - "name": "test-v1-test-v1.1785ef1932a53822", - "createdAt": "2023-09-18T07:55:26Z", - "firstTimestamp": "2023-09-18T07:55:26Z", - "lastTimestamp": "2023-09-18T07:55:26Z", - "reason": "ServiceUnhealthy", - "message": "The service is in an unhealthy state. Controller will perform a round of actions in 10s.", - "type": "Normal", - "count": 1 - }, - { - "id": "test-v1.1785ef1962bfb4c2", - "name": "test-v1-test-v1.1785ef1962bfb4c2", - "createdAt": "2023-09-18T07:55:27Z", - "firstTimestamp": "2023-09-18T07:55:27Z", - "lastTimestamp": "2023-09-18T07:55:30Z", - "reason": "WaitForServeDeploymentReady", - "message": "Put \"http://test-v1-raycluster-vcvz2-head-svc.ray-system.svc.cluster.local:52365/api/serve/deployments/\": dial tcp 10.96.98.134:52365: connect: connection refused", - "type": "Normal", - "count": 3 - }, - { - "id": "test-v1.1785ef1b0f379aa9", - "name": "test-v1-test-v1.1785ef1b0f379aa9", - "createdAt": "2023-09-18T07:55:34Z", - "firstTimestamp": "2023-09-18T07:55:34Z", - "lastTimestamp": "2023-09-18T07:55:34Z", - "reason": "SubmittedServeDeployment", - "message": "Controller sent API request to update Serve deployments on cluster test-v1-raycluster-vcvz2", - "type": "Normal", - "count": 1 - }, - { - "id": "test-v1.1785ef1bbaa5aa5f", - "name": "test-v1-test-v1.1785ef1bbaa5aa5f", - "createdAt": "2023-09-18T07:55:37Z", - "firstTimestamp": "2023-09-18T07:55:37Z", - "lastTimestamp": "2023-09-18T07:55:39Z", - "reason": "ServiceNotReady", - "message": "The service is not ready yet. 
Controller will perform a round of actions in 2s.", - "type": "Normal", - "count": 2 - }, - { - "id": "test-v1.1785ef1cd4c74326", - "name": "test-v1-test-v1.1785ef1cd4c74326", - "createdAt": "2023-09-18T07:55:41Z", - "firstTimestamp": "2023-09-18T07:55:41Z", - "lastTimestamp": "2023-09-18T08:00:27Z", - "reason": "Running", - "message": "The Serve applicaton is now running and healthy.", - "type": "Normal", - "count": 136 - }, - { - "id": "test-v1-raycluster-vcvz2.1785ef1920827324", - "name": "test-v1-test-v1-raycluster-vcvz2.1785ef1920827324", - "createdAt": "2023-09-18T07:55:25Z", - "firstTimestamp": "2023-09-18T07:55:25Z", - "lastTimestamp": "2023-09-18T07:55:25Z", - "reason": "Created", - "message": "Created service test-v1-raycluster-vcvz2-head-svc", - "type": "Normal", - "count": 1 - }, - { - "id": "test-v1-raycluster-vcvz2.1785ef1926720c3b", - "name": "test-v1-test-v1-raycluster-vcvz2.1785ef1926720c3b", - "createdAt": "2023-09-18T07:55:26Z", - "firstTimestamp": "2023-09-18T07:55:26Z", - "lastTimestamp": "2023-09-18T07:55:26Z", - "reason": "Created", - "message": "Created head pod test-v1-raycluster-vcvz2-head-959rs", - "type": "Normal", - "count": 1 }, - { - "id": "test-v1-raycluster-vcvz2.1785ef192cb1eb9d", - "name": "test-v1-test-v1-raycluster-vcvz2.1785ef192cb1eb9d", - "createdAt": "2023-09-18T07:55:26Z", - "firstTimestamp": "2023-09-18T07:55:26Z", - "lastTimestamp": "2023-09-18T07:55:26Z", - "reason": "Created", - "message": "Created worker pod ", - "type": "Normal", - "count": 1 - } - ], - "rayClusterName": "test-v1-raycluster-vcvz2", - "serveApplicationStatus": [ - { - "name": "default", - "status": "RUNNING", - "serveDeploymentStatus": [ - { - "deploymentName": "default_DAGDriver", - "status": "HEALTHY" - }, - { - "deploymentName": "default_FruitMarket", - "status": "HEALTHY" - }, - { - "deploymentName": "default_MangoStand", - "status": "HEALTHY" - }, - { - "deploymentName": "default_OrangeStand", - "status": "HEALTHY" - }, - { - "deploymentName": 
"default_PearStand", - "status": "HEALTHY" + "workerGroupSpec": [ + { + "groupName": "small-wg", + "replicas": 1, + "minReplicas": 5, + "maxReplicas": 1, + "rayStartParams": { + "node-ip-address": "$MY_POD_IP" } - ] - } - ] - }, - "createdAt": "2023-09-18T07:55:23Z", - "deleteAt": "1969-12-31T23:59:59Z" - } + } + ] + }, + "rayServiceStatus": { + "rayServiceEvents": [ + { + "id": "test-v2.178821ac4b15c743", + "name": "test-v2-test-v2.178821ac4b15c743", + "createdAt": "2023-09-25T11:44:43Z", + "firstTimestamp": "2023-09-25T11:44:43Z", + "lastTimestamp": "2023-09-25T11:46:15Z", + "reason": "ServiceUnhealthy", + "message": "The service is in an unhealthy state. Controller will perform a round of actions in 10s.", + "type": "Normal", + "count": 11 + } + ] + }, + "createdAt": "2023-09-25T11:44:41Z", + "deleteAt": "1969-12-31T23:59:59Z" + } ] } ``` @@ -1664,52 +1667,63 @@ Examples: { "services": [ { - "name": "test-v1", + "name": "test-v2", "namespace": "ray-system", "user": "user", - "serveDeploymentGraphSpec": { - "importPath": "fruit.deployment_graph", - "runtimeEnv": "working_dir: \"https://github.com/ray-project/test_dag/archive/c620251044717ace0a4c19d766d43c5099af8a77.zip\"\n", - "serveConfigs": [ - { - "deploymentName": "OrangeStand", - "replicas": 1, - "userConfig": "price: 2", - "actorOptions": { - "cpusPerActor": 0.1 - } - }, - { - "deploymentName": "PearStand", - "replicas": 1, - "userConfig": "price: 1", - "actorOptions": { - "cpusPerActor": 0.1 - } - }, + "serveConfigV2": "applications:\n - name: fruit_app\n import_path: fruit.deployment_graph\n route_prefix: /fruit\n runtime_env:\n working_dir: \"https://github.com/ray-project/test_dag/archive/41d09119cbdf8450599f993f51318e9e27c59098.zip\"\n deployments:\n - name: MangoStand\n num_replicas: 1\n user_config:\n price: 3\n ray_actor_options:\n num_cpus: 0.1\n - name: OrangeStand\n num_replicas: 1\n user_config:\n price: 2\n ray_actor_options:\n num_cpus: 0.1\n - name: PearStand\n num_replicas: 1\n 
user_config:\n price: 1\n ray_actor_options:\n num_cpus: 0.1\n - name: FruitMarket\n num_replicas: 1\n ray_actor_options:\n num_cpus: 0.1\n - name: DAGDriver\n num_replicas: 1\n ray_actor_options:\n num_cpus: 0.1\n - name: math_app\n import_path: conditional_dag.serve_dag\n route_prefix: /calc\n runtime_env:\n working_dir: \"https://github.com/ray-project/test_dag/archive/41d09119cbdf8450599f993f51318e9e27c59098.zip\"\n deployments:\n - name: Adder\n num_replicas: 1\n user_config:\n increment: 3\n ray_actor_options:\n num_cpus: 0.1\n - name: Multiplier\n num_replicas: 1\n user_config:\n factor: 5\n ray_actor_options:\n num_cpus: 0.1\n - name: Router\n num_replicas: 1\n - name: create_order\n num_replicas: 1\n - name: DAGDriver\n num_replicas: 1\n", + "serviceUnhealthySecondThreshold": 900, + "deploymentUnhealthySecondThreshold": 300, + "clusterSpec": { + "headGroupSpec": { + "computeTemplate": "default-template", + "image": "rayproject/ray:2.7.0-py310", + "serviceType": "NodePort", + "rayStartParams": { + "dashboard-host": "0.0.0.0", + "metrics-export-port": "8080" + } + }, + "workerGroupSpec": [ { - "deploymentName": "FruitMarket", + "groupName": "small-wg", + "computeTemplate": "default-template", + "image": "rayproject/ray:2.7.0-py310", "replicas": 1, - "actorOptions": { - "cpusPerActor": 0.1 + "minReplicas": 5, + "maxReplicas": 1, + "rayStartParams": { + "node-ip-address": "$MY_POD_IP" } - }, + } + ] + }, + "rayServiceStatus": { + "rayServiceEvents": [ { - "deploymentName": "DAGDriver", - "replicas": 1, - "routePrefix": "/", - "actorOptions": { - "cpusPerActor": 0.1 - } + "id": "test-v2.178821ac4b15c743", + "name": "test-v2-test-v2.178821ac4b15c743", + "createdAt": "2023-09-25T11:44:43Z", + "firstTimestamp": "2023-09-25T11:44:43Z", + "lastTimestamp": "2023-09-25T11:47:55Z", + "reason": "ServiceUnhealthy", + "message": "The service is in an unhealthy state. 
Controller will perform a round of actions in 10s.", + "type": "Normal", + "count": 21 } ] }, + "createdAt": "2023-09-25T11:44:41Z", + "deleteAt": "1969-12-31T23:59:59Z" + }, + { + "name": "test-v2", + "namespace": "ray-system", + "user": "user", + "serveConfigV2": "applications:\n - name: fruit_app\n import_path: fruit.deployment_graph\n route_prefix: /fruit\n runtime_env:\n working_dir: \"https://github.com/ray-project/test_dag/archive/41d09119cbdf8450599f993f51318e9e27c59098.zip\"\n deployments:\n - name: MangoStand\n num_replicas: 1\n user_config:\n price: 3\n ray_actor_options:\n num_cpus: 0.1\n - name: OrangeStand\n num_replicas: 1\n user_config:\n price: 2\n ray_actor_options:\n num_cpus: 0.1\n - name: PearStand\n num_replicas: 1\n user_config:\n price: 1\n ray_actor_options:\n num_cpus: 0.1\n - name: FruitMarket\n num_replicas: 1\n ray_actor_options:\n num_cpus: 0.1\n - name: DAGDriver\n num_replicas: 1\n ray_actor_options:\n num_cpus: 0.1\n - name: math_app\n import_path: conditional_dag.serve_dag\n route_prefix: /calc\n runtime_env:\n working_dir: \"https://github.com/ray-project/test_dag/archive/41d09119cbdf8450599f993f51318e9e27c59098.zip\"\n deployments:\n - name: Adder\n num_replicas: 1\n user_config:\n increment: 3\n ray_actor_options:\n num_cpus: 0.1\n - name: Multiplier\n num_replicas: 1\n user_config:\n factor: 5\n ray_actor_options:\n num_cpus: 0.1\n - name: Router\n num_replicas: 1\n - name: create_order\n num_replicas: 1\n - name: DAGDriver\n num_replicas: 1\n", "serviceUnhealthySecondThreshold": 900, "deploymentUnhealthySecondThreshold": 300, "clusterSpec": { "headGroupSpec": { - "computeTemplate": "default-template", - "image": "rayproject/ray:2.6.3-py310-aarch64", "serviceType": "NodePort", "rayStartParams": { "dashboard-host": "0.0.0.0", @@ -1719,8 +1733,6 @@ Examples: "workerGroupSpec": [ { "groupName": "small-wg", - "computeTemplate": "default-template", - "image": "rayproject/ray:2.6.3-py310-aarch64", "replicas": 1, "minReplicas": 5, 
"maxReplicas": 1, @@ -1733,125 +1745,19 @@ Examples: "rayServiceStatus": { "rayServiceEvents": [ { - "id": "test-v1.1785ef1932a53822", - "name": "test-v1-test-v1.1785ef1932a53822", - "createdAt": "2023-09-18T07:55:26Z", - "firstTimestamp": "2023-09-18T07:55:26Z", - "lastTimestamp": "2023-09-18T07:55:26Z", + "id": "test-v2.178821ac4b15c743", + "name": "test-v2-test-v2.178821ac4b15c743", + "createdAt": "2023-09-25T11:44:43Z", + "firstTimestamp": "2023-09-25T11:44:43Z", + "lastTimestamp": "2023-09-25T11:47:55Z", "reason": "ServiceUnhealthy", "message": "The service is in an unhealthy state. Controller will perform a round of actions in 10s.", "type": "Normal", - "count": 1 - }, - { - "id": "test-v1.1785ef1962bfb4c2", - "name": "test-v1-test-v1.1785ef1962bfb4c2", - "createdAt": "2023-09-18T07:55:27Z", - "firstTimestamp": "2023-09-18T07:55:27Z", - "lastTimestamp": "2023-09-18T07:55:30Z", - "reason": "WaitForServeDeploymentReady", - "message": "Put \"http://test-v1-raycluster-vcvz2-head-svc.ray-system.svc.cluster.local:52365/api/serve/deployments/\": dial tcp 10.96.98.134:52365: connect: connection refused", - "type": "Normal", - "count": 3 - }, - { - "id": "test-v1.1785ef1b0f379aa9", - "name": "test-v1-test-v1.1785ef1b0f379aa9", - "createdAt": "2023-09-18T07:55:34Z", - "firstTimestamp": "2023-09-18T07:55:34Z", - "lastTimestamp": "2023-09-18T07:55:34Z", - "reason": "SubmittedServeDeployment", - "message": "Controller sent API request to update Serve deployments on cluster test-v1-raycluster-vcvz2", - "type": "Normal", - "count": 1 - }, - { - "id": "test-v1.1785ef1bbaa5aa5f", - "name": "test-v1-test-v1.1785ef1bbaa5aa5f", - "createdAt": "2023-09-18T07:55:37Z", - "firstTimestamp": "2023-09-18T07:55:37Z", - "lastTimestamp": "2023-09-18T07:55:39Z", - "reason": "ServiceNotReady", - "message": "The service is not ready yet. 
Controller will perform a round of actions in 2s.", - "type": "Normal", - "count": 2 - }, - { - "id": "test-v1.1785ef1cd4c74326", - "name": "test-v1-test-v1.1785ef1cd4c74326", - "createdAt": "2023-09-18T07:55:41Z", - "firstTimestamp": "2023-09-18T07:55:41Z", - "lastTimestamp": "2023-09-18T08:05:27Z", - "reason": "Running", - "message": "The Serve applicaton is now running and healthy.", - "type": "Normal", - "count": 277 - }, - { - "id": "test-v1-raycluster-vcvz2.1785ef1920827324", - "name": "test-v1-test-v1-raycluster-vcvz2.1785ef1920827324", - "createdAt": "2023-09-18T07:55:25Z", - "firstTimestamp": "2023-09-18T07:55:25Z", - "lastTimestamp": "2023-09-18T07:55:25Z", - "reason": "Created", - "message": "Created service test-v1-raycluster-vcvz2-head-svc", - "type": "Normal", - "count": 1 - }, - { - "id": "test-v1-raycluster-vcvz2.1785ef1926720c3b", - "name": "test-v1-test-v1-raycluster-vcvz2.1785ef1926720c3b", - "createdAt": "2023-09-18T07:55:26Z", - "firstTimestamp": "2023-09-18T07:55:26Z", - "lastTimestamp": "2023-09-18T07:55:26Z", - "reason": "Created", - "message": "Created head pod test-v1-raycluster-vcvz2-head-959rs", - "type": "Normal", - "count": 1 - }, - { - "id": "test-v1-raycluster-vcvz2.1785ef192cb1eb9d", - "name": "test-v1-test-v1-raycluster-vcvz2.1785ef192cb1eb9d", - "createdAt": "2023-09-18T07:55:26Z", - "firstTimestamp": "2023-09-18T07:55:26Z", - "lastTimestamp": "2023-09-18T07:55:26Z", - "reason": "Created", - "message": "Created worker pod ", - "type": "Normal", - "count": 1 - } - ], - "rayClusterName": "test-v1-raycluster-vcvz2", - "serveApplicationStatus": [ - { - "name": "default", - "status": "RUNNING", - "serveDeploymentStatus": [ - { - "deploymentName": "default_FruitMarket", - "status": "HEALTHY" - }, - { - "deploymentName": "default_MangoStand", - "status": "HEALTHY" - }, - { - "deploymentName": "default_OrangeStand", - "status": "HEALTHY" - }, - { - "deploymentName": "default_PearStand", - "status": "HEALTHY" - }, - { - "deploymentName": 
"default_DAGDriver", - "status": "HEALTHY" - } - ] + "count": 21 } ] }, - "createdAt": "2023-09-18T07:55:23Z", + "createdAt": "2023-09-25T11:44:41Z", "deleteAt": "1969-12-31T23:59:59Z" } ] @@ -1923,7 +1829,7 @@ Examples: "clusterSpec": { "headGroupSpec": { "computeTemplate": "default-template", - "image": "rayproject/ray:2.6.3-py310-aarch64", + "image": "rayproject/ray:2.7.0-py310", "serviceType": "NodePort", "rayStartParams": { "dashboard-host": "0.0.0.0", @@ -1934,7 +1840,7 @@ Examples: { "groupName": "small-wg", "computeTemplate": "default-template", - "image": "rayproject/ray:2.6.3-py310-aarch64", + "image": "rayproject/ray:2.7.0-py310", "replicas": 1, "minReplicas": 5, "maxReplicas": 1, @@ -1947,125 +1853,125 @@ Examples: "rayServiceStatus": { "rayServiceEvents": [ { - "id": "test-v1.1785ef1932a53822", - "name": "test-v1-test-v1.1785ef1932a53822", - "createdAt": "2023-09-18T07:55:26Z", - "firstTimestamp": "2023-09-18T07:55:26Z", - "lastTimestamp": "2023-09-18T07:55:26Z", + "id": "test-v1.1788218987842e1a", + "name": "test-v1-test-v1.1788218987842e1a", + "createdAt": "2023-09-25T11:42:14Z", + "firstTimestamp": "2023-09-25T11:42:14Z", + "lastTimestamp": "2023-09-25T11:42:15Z", "reason": "ServiceUnhealthy", "message": "The service is in an unhealthy state. 
Controller will perform a round of actions in 10s.", "type": "Normal", - "count": 1 + "count": 2 }, { - "id": "test-v1.1785ef1962bfb4c2", - "name": "test-v1-test-v1.1785ef1962bfb4c2", - "createdAt": "2023-09-18T07:55:27Z", - "firstTimestamp": "2023-09-18T07:55:27Z", - "lastTimestamp": "2023-09-18T07:55:30Z", + "id": "test-v1.1788218a0a86434b", + "name": "test-v1-test-v1.1788218a0a86434b", + "createdAt": "2023-09-25T11:42:16Z", + "firstTimestamp": "2023-09-25T11:42:16Z", + "lastTimestamp": "2023-09-25T11:42:24Z", "reason": "WaitForServeDeploymentReady", - "message": "Put \"http://test-v1-raycluster-vcvz2-head-svc.ray-system.svc.cluster.local:52365/api/serve/deployments/\": dial tcp 10.96.98.134:52365: connect: connection refused", + "message": "Fail to create / update Serve deployments. If you observe this error consistently, please check \"Issue 5: Fail to create / update Serve applications.\" in https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayservice-troubleshooting.md for more details. 
err: Put \"http://test-v1-raycluster-7grg7-head-svc.ray-system.svc.cluster.local:52365/api/serve/deployments/\": dial tcp 10.96.69.78:52365: connect: connection refused", "type": "Normal", - "count": 3 + "count": 6 }, { - "id": "test-v1.1785ef1b0f379aa9", - "name": "test-v1-test-v1.1785ef1b0f379aa9", - "createdAt": "2023-09-18T07:55:34Z", - "firstTimestamp": "2023-09-18T07:55:34Z", - "lastTimestamp": "2023-09-18T07:55:34Z", + "id": "test-v1.1788218cf7437d6f", + "name": "test-v1-test-v1.1788218cf7437d6f", + "createdAt": "2023-09-25T11:42:29Z", + "firstTimestamp": "2023-09-25T11:42:29Z", + "lastTimestamp": "2023-09-25T11:42:29Z", "reason": "SubmittedServeDeployment", - "message": "Controller sent API request to update Serve deployments on cluster test-v1-raycluster-vcvz2", + "message": "Controller sent API request to update Serve deployments on cluster test-v1-raycluster-7grg7", "type": "Normal", "count": 1 }, { - "id": "test-v1.1785ef1bbaa5aa5f", - "name": "test-v1-test-v1.1785ef1bbaa5aa5f", - "createdAt": "2023-09-18T07:55:37Z", - "firstTimestamp": "2023-09-18T07:55:37Z", - "lastTimestamp": "2023-09-18T07:55:39Z", + "id": "test-v1.1788218cf9823bdf", + "name": "test-v1-test-v1.1788218cf9823bdf", + "createdAt": "2023-09-25T11:42:29Z", + "firstTimestamp": "2023-09-25T11:42:29Z", + "lastTimestamp": "2023-09-25T11:42:35Z", "reason": "ServiceNotReady", "message": "The service is not ready yet. 
Controller will perform a round of actions in 2s.", "type": "Normal", - "count": 2 + "count": 4 }, { - "id": "test-v1.1785ef1cd4c74326", - "name": "test-v1-test-v1.1785ef1cd4c74326", - "createdAt": "2023-09-18T07:55:41Z", - "firstTimestamp": "2023-09-18T07:55:41Z", - "lastTimestamp": "2023-09-18T08:36:25Z", + "id": "test-v1.1788218ee2f173bc", + "name": "test-v1-test-v1.1788218ee2f173bc", + "createdAt": "2023-09-25T11:42:37Z", + "firstTimestamp": "2023-09-25T11:42:37Z", + "lastTimestamp": "2023-09-25T11:47:16Z", "reason": "Running", "message": "The Serve applicaton is now running and healthy.", "type": "Normal", - "count": 837 + "count": 140 }, { - "id": "test-v1-raycluster-vcvz2.1785ef1920827324", - "name": "test-v1-test-v1-raycluster-vcvz2.1785ef1920827324", - "createdAt": "2023-09-18T07:55:25Z", - "firstTimestamp": "2023-09-18T07:55:25Z", - "lastTimestamp": "2023-09-18T07:55:25Z", + "id": "test-v1-raycluster-7grg7.1788218976cd5268", + "name": "test-v1-test-v1-raycluster-7grg7.1788218976cd5268", + "createdAt": "2023-09-25T11:42:13Z", + "firstTimestamp": "2023-09-25T11:42:13Z", + "lastTimestamp": "2023-09-25T11:42:13Z", "reason": "Created", - "message": "Created service test-v1-raycluster-vcvz2-head-svc", + "message": "Created service test-v1-raycluster-7grg7-head-svc", "type": "Normal", "count": 1 }, { - "id": "test-v1-raycluster-vcvz2.1785ef1926720c3b", - "name": "test-v1-test-v1-raycluster-vcvz2.1785ef1926720c3b", - "createdAt": "2023-09-18T07:55:26Z", - "firstTimestamp": "2023-09-18T07:55:26Z", - "lastTimestamp": "2023-09-18T07:55:26Z", + "id": "test-v1-raycluster-7grg7.178821897b8cddee", + "name": "test-v1-test-v1-raycluster-7grg7.178821897b8cddee", + "createdAt": "2023-09-25T11:42:14Z", + "firstTimestamp": "2023-09-25T11:42:14Z", + "lastTimestamp": "2023-09-25T11:42:14Z", "reason": "Created", - "message": "Created head pod test-v1-raycluster-vcvz2-head-959rs", + "message": "Created head pod test-v1-raycluster-7grg7-head-jfkq5", "type": "Normal", "count": 1 }, 
{ - "id": "test-v1-raycluster-vcvz2.1785ef192cb1eb9d", - "name": "test-v1-test-v1-raycluster-vcvz2.1785ef192cb1eb9d", - "createdAt": "2023-09-18T07:55:26Z", - "firstTimestamp": "2023-09-18T07:55:26Z", - "lastTimestamp": "2023-09-18T07:55:26Z", + "id": "test-v1-raycluster-7grg7.178821898168eedb", + "name": "test-v1-test-v1-raycluster-7grg7.178821898168eedb", + "createdAt": "2023-09-25T11:42:14Z", + "firstTimestamp": "2023-09-25T11:42:14Z", + "lastTimestamp": "2023-09-25T11:42:14Z", "reason": "Created", "message": "Created worker pod ", "type": "Normal", "count": 1 } ], - "rayClusterName": "test-v1-raycluster-vcvz2", + "rayClusterName": "test-v1-raycluster-7grg7", "serveApplicationStatus": [ { "name": "default", "status": "RUNNING", "serveDeploymentStatus": [ { - "deploymentName": "default_DAGDriver", + "deploymentName": "DAGDriver", "status": "HEALTHY" }, { - "deploymentName": "default_FruitMarket", + "deploymentName": "FruitMarket", "status": "HEALTHY" }, { - "deploymentName": "default_MangoStand", + "deploymentName": "MangoStand", "status": "HEALTHY" }, { - "deploymentName": "default_OrangeStand", + "deploymentName": "OrangeStand", "status": "HEALTHY" }, { - "deploymentName": "default_PearStand", + "deploymentName": "PearStand", "status": "HEALTHY" } ] } ] }, - "createdAt": "2023-09-18T07:55:23Z", + "createdAt": "2023-09-25T11:42:11Z", "deleteAt": "1969-12-31T23:59:59Z" } ```