Revise sample configs, increase memory requests, update Ray versions #761

Merged
Commits (37 total; changes shown from 35)
a51dae8
More documentation.
DmitriGekhtman Nov 25, 2022
82474a2
more
DmitriGekhtman Nov 25, 2022
13ab36e
Update Ray versions
DmitriGekhtman Nov 25, 2022
aa7dd7f
More version updates.
DmitriGekhtman Nov 26, 2022
11307ac
Add issue reference.
DmitriGekhtman Nov 26, 2022
5aea35e
Skip untracked files in configuration test.
DmitriGekhtman Nov 26, 2022
6f3ed30
Mention configuration tests in development.md
DmitriGekhtman Nov 26, 2022
a922a17
Clean-up
DmitriGekhtman Nov 26, 2022
88d2368
More cleanup
DmitriGekhtman Nov 26, 2022
776a6f0
More version updates.
DmitriGekhtman Nov 27, 2022
32228b0
More removal of extraneous configuration.
DmitriGekhtman Nov 27, 2022
3b6ea88
More cleanup
DmitriGekhtman Nov 27, 2022
8a2e27d
More cleanup
DmitriGekhtman Nov 27, 2022
4844b09
More updates.
DmitriGekhtman Nov 28, 2022
77e8a76
Eliminate outdated example config.
DmitriGekhtman Nov 28, 2022
04ee9fe
More cleanup
DmitriGekhtman Nov 28, 2022
1b922cb
Fix requirements.txt.
DmitriGekhtman Nov 28, 2022
2124a46
tweak
DmitriGekhtman Nov 28, 2022
a01904b
Restore num-cpus=1 in raycluster.mini.template
DmitriGekhtman Nov 29, 2022
6a554ac
eliminate more irrelevant configuration
DmitriGekhtman Nov 29, 2022
60dd873
.
DmitriGekhtman Nov 29, 2022
b951064
Merge branch 'master' into dmitri/match-memory-requests-and-limits
DmitriGekhtman Nov 29, 2022
2183de1
wip
DmitriGekhtman Nov 29, 2022
55e2029
fix
DmitriGekhtman Nov 29, 2022
6283ce4
More cleanup
DmitriGekhtman Nov 29, 2022
f6b43d7
Fix selector.
DmitriGekhtman Nov 29, 2022
0157059
More cleanup
DmitriGekhtman Nov 29, 2022
23c7564
Rerun CI.
DmitriGekhtman Nov 29, 2022
095f08b
fix
DmitriGekhtman Nov 29, 2022
0a8a3bc
reintroduce redis env
DmitriGekhtman Nov 30, 2022
f5843be
Fix dash
DmitriGekhtman Nov 30, 2022
b6ca57e
Fix again.
DmitriGekhtman Nov 30, 2022
3c8087f
tweak
DmitriGekhtman Nov 30, 2022
c1172dc
Add issue reference
DmitriGekhtman Nov 30, 2022
1f727d9
Merge branch 'master' into dmitri/match-memory-requests-and-limits
DmitriGekhtman Dec 1, 2022
608a796
Remove outdated templates.
DmitriGekhtman Dec 1, 2022
8527ef7
Swap small and large
DmitriGekhtman Dec 1, 2022
24 changes: 12 additions & 12 deletions .github/workflows/test-job.yaml
@@ -260,13 +260,13 @@ jobs:
working-directory: ${{env.working-directory}}
if: contains(fromJson('["refs/heads/master", "refs/heads/release-0.3"]'), github.ref)

test-compatibility-1_12_0:
test-compatibility-1_13_0:
needs:
- build_operator
- build_apiserver
- lint
runs-on: ubuntu-latest
name: Compatibility Test - 1.12.0
name: Compatibility Test - 1.13.0
steps:
- name: Check out code into the Go module directory
uses: actions/checkout@v2
@@ -278,15 +278,15 @@ jobs:

- uses: ./.github/workflows/actions/compatibility
with:
ray_version: 1.12.0
ray_version: 1.13.0

test-compatibility-1_13_0:
test-compatibility-2_0_0:
needs:
- build_operator
- build_apiserver
- lint
runs-on: ubuntu-latest
name: Compatibility Test - 1.13.0
name: Compatibility Test - 2.0.0
steps:
- name: Check out code into the Go module directory
uses: actions/checkout@v2
@@ -298,15 +298,15 @@ jobs:

- uses: ./.github/workflows/actions/compatibility
with:
ray_version: 1.13.0
ray_version: 2.0.0

test-compatibility-2_0_0:
test-compatibility-2_1_0:
needs:
- build_operator
- build_apiserver
- lint
runs-on: ubuntu-latest
name: Compatibility Test - 2.0.0
name: Compatibility Test - 2.1.0
steps:
- name: Check out code into the Go module directory
uses: actions/checkout@v2
@@ -318,7 +318,7 @@

- uses: ./.github/workflows/actions/compatibility
with:
ray_version: 2.0.0
ray_version: 2.1.0

test-compatibility-nightly:
needs:
@@ -340,13 +340,13 @@ jobs:
with:
ray_version: nightly

sample-yaml-config-test-2_0_0:
sample-yaml-config-test-2_1_0:
needs:
- build_operator
- build_apiserver
- lint
runs-on: ubuntu-latest
name: Sample YAML Config Test - 2.0.0
name: Sample YAML Config Test - 2.1.0
steps:
- name: Check out code into the Go module directory
uses: actions/checkout@v2
@@ -357,4 +357,4 @@ jobs:
ref: ${{github.event.pull_request.head.sha}}
- uses: ./.github/workflows/actions/configuration
with:
ray_version: 2.0.0
ray_version: 2.1.0
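The jobs above differ only in the Ray version handed to the shared compatibility action, so every version bump renames several near-identical jobs. As an aside that is not part of this PR, the same fan-out could be expressed with a strategy matrix; the sketch below is hedged, reusing only the composite action path, `needs` list, and checkout parameters visible in the diff above.

```yaml
# Illustrative sketch only -- not what this PR does. A single matrix-driven job
# covering the same Ray versions, so a future bump edits one list instead of
# renaming several jobs.
test-compatibility:
  needs:
    - build_operator
    - build_apiserver
    - lint
  runs-on: ubuntu-latest
  strategy:
    matrix:
      ray_version: ["1.13.0", "2.0.0", "2.1.0", "nightly"]
  name: Compatibility Test - ${{ matrix.ray_version }}
  steps:
    - name: Check out code into the Go module directory
      uses: actions/checkout@v2
      with:
        ref: ${{ github.event.pull_request.head.sha }}
    - uses: ./.github/workflows/actions/compatibility
      with:
        ray_version: ${{ matrix.ray_version }}
```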
2 changes: 1 addition & 1 deletion docs/guidance/autoscaler.md
@@ -3,7 +3,7 @@

### Prerequisite

Ray Autoscaler integration is beta with KubeRay 0.3.0 and Ray 2.0.0.
Ray Autoscaler integration is beta since KubeRay 0.3.0 and Ray 2.0.0.
While autoscaling functionality is stable, the details of autoscaler behavior and configuration may change in future releases.

Start by deploying the latest stable version of the KubeRay operator:
15 changes: 13 additions & 2 deletions ray-operator/DEVELOPMENT.md
@@ -142,6 +142,17 @@ These tests operate small Ray clusters running within a [kind](https://kind.sigs
# [Usage]: RAY_IMAGE=$RAY_IMAGE OPERATOR_IMAGE=$OPERATOR_IMAGE python3 tests/compatibility-test.py
# These 3 environment variables are optional.
# [Example]:
RAY_IMAGE=rayproject/ray:2.0.0 OPERATOR_IMAGE=kuberay/operator:nightly python3 tests/compatibility-test.py
RAY_IMAGE=rayproject/ray:2.1.0 OPERATOR_IMAGE=kuberay/operator:nightly python3 tests/compatibility-test.py
```

### Running configuration tests locally.

The sample RayCluster and RayService CRs under `ray-operator/config/samples` are tested in `tests/framework/test_sample_raycluster_yamls.py`
and `tests/framework/test_sample_rayservice_yamls.py`.
```bash
# Test RayCluster doc examples.
RAY_IMAGE=rayproject/ray:2.1.0 OPERATOR_IMAGE=kuberay/operator:nightly python3 tests/framework/test_sample_raycluster_yamls.py
# Test RayService doc examples.
RAY_IMAGE=rayproject/ray:2.1.0 OPERATOR_IMAGE=kuberay/operator:nightly python3 tests/framework/test_sample_rayservice_yamls.py
```
Currently, only a few of these sample configurations are tested in the CI. See
[KubeRay issue #695](https://github.com/ray-project/kuberay/issues/695).
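Since these tests run against a kind cluster, a minimal cluster definition can help when reproducing failures locally. The snippet below is a sketch under the assumption that you manage the kind cluster yourself rather than letting the test harness create one; the file name and node layout are placeholders, not part of this PR.

```yaml
# Minimal kind cluster definition for local test runs (an assumption; skip it if
# the test harness creates its own cluster). Save as kind-config.yaml, run
# `kind create cluster --config kind-config.yaml`, then load the images with
# `kind load docker-image rayproject/ray:2.1.0` and the operator image.
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
nodes:
  - role: control-plane
```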
2 changes: 0 additions & 2 deletions ray-operator/apis/ray/v1alpha1/raycluster_types_test.go
@@ -23,7 +23,6 @@ var myRayCluster = &RayCluster{
"object-manager-port": "12345",
"node-manager-port": "12346",
"object-store-memory": "100000000",
"redis-password": "LetMeInRay",
"num-cpus": "1",
"dashboard-agent-listen-port": "52365",
},
@@ -65,7 +64,6 @@
GroupName: "small-group",
RayStartParams: map[string]string{
"port": "6379",
"redis-password": "LetMeInRay",
"num-cpus": "1",
"dashboard-agent-listen-port": "52365",
},
8 changes: 4 additions & 4 deletions ray-operator/apis/ray/v1alpha1/rayjob_types_test.go
@@ -49,7 +49,7 @@ var expectedRayJob = RayJob{
Containers: []corev1.Container{
{
Name: "ray-head",
Image: "rayproject/ray:1.12.1",
Image: "rayproject/ray:2.1.0",
Env: []corev1.EnvVar{
{
Name: "MY_POD_IP",
@@ -111,7 +111,7 @@ var expectedRayJob = RayJob{
Containers: []corev1.Container{
{
Name: "ray-worker",
Image: "rayproject/ray:1.12.1",
Image: "rayproject/ray:2.1.0",
Command: []string{"echo"},
Args: []string{"Hello Ray"},
Env: []corev1.EnvVar{
@@ -175,7 +175,7 @@ var testRayJobJSON = `{
"containers": [
{
"name": "ray-head",
"image": "rayproject/ray:1.12.1",
"image": "rayproject/ray:2.1.0",
"ports": [
{
"name": "gcs-server",
@@ -238,7 +238,7 @@ var testRayJobJSON = `{
"containers": [
{
"name": "ray-worker",
"image": "rayproject/ray:1.12.1",
"image": "rayproject/ray:2.1.0",
"command": [
"echo"
],
8 changes: 4 additions & 4 deletions ray-operator/apis/ray/v1alpha1/rayservice_types_test.go
@@ -97,7 +97,7 @@ var myRayService = &RayService{
Containers: []corev1.Container{
{
Name: "ray-head",
Image: "rayproject/ray:1.12.1",
Image: "rayproject/ray:2.1.0",
Env: []corev1.EnvVar{
{
Name: "MY_POD_IP",
@@ -164,7 +164,7 @@ var myRayService = &RayService{
Containers: []corev1.Container{
{
Name: "ray-worker",
Image: "rayproject/ray:1.12.1",
Image: "rayproject/ray:2.1.0",
Command: []string{"echo"},
Args: []string{"Hello Ray"},
Env: []corev1.EnvVar{
@@ -267,7 +267,7 @@ var expected = `{
"containers":[
{
"name":"ray-head",
"image":"rayproject/ray:1.12.1",
"image":"rayproject/ray:2.1.0",
"ports":[
{
"name":"gcs-server",
@@ -335,7 +335,7 @@ var expected = `{
"containers":[
{
"name":"ray-worker",
"image":"rayproject/ray:1.12.1",
"image":"rayproject/ray:2.1.0",
"command":[
"echo"
],
20 changes: 10 additions & 10 deletions ray-operator/config/samples/ray-cluster.autoscaler.large.yaml
@@ -20,7 +20,7 @@ metadata:
name: raycluster-autoscaler
spec:
# The version of Ray you are using. Make sure all Ray containers are running this version of Ray.
rayVersion: '2.0.0'
rayVersion: '2.1.0'
# If enableInTreeAutoscaling is true, the autoscaler sidecar will be added to the Ray head pod.
# Ray autoscaler integration is supported only for Ray versions >= 1.11.0
# Ray autoscaler integration is Beta with KubeRay >= 0.3.0 and Ray >= 2.0.0.
@@ -51,8 +51,7 @@ spec:
requests:
cpu: "500m"
memory: "512Mi"
######################headGroupSpec#################################
# head group template and specs, (perhaps 'group' is not needed in the name)
# Ray head pod template
headGroupSpec:
# Kubernetes Service Type, valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer'
serviceType: ClusterIP
@@ -72,7 +71,7 @@ spec:
containers:
# The Ray head container
- name: ray-head
image: rayproject/ray:2.0.0
image: rayproject/ray:2.1.0
imagePullPolicy: Always
# Optimal resource allocation will depend on your Kubernetes infrastructure and might
# require some experimentation.
@@ -103,10 +102,11 @@ spec:
maxReplicas: 10
# logical group name, for this called large-group, also can be functional
groupName: large-group
# if worker pods need to be added, we can simply increment the replicas
# if worker pods need to be removed, we decrement the replicas, and populate the podsToDelete list
# the operator will remove pods from the list until the number of replicas is satisfied
# when a pod is confirmed to be deleted, its name will be removed from the list below
# If worker pods need to be added, we can increment the replicas.
# If worker pods need to be removed, we decrement the replicas, and populate the workersToDelete list.
# The operator will remove pods from the list until the desired number of replicas is satisfied.
# If the difference between the current replica count and the desired replicas is greater than the
# number of entries in workersToDelete, random worker pods will be deleted.
#scaleStrategy:
# workersToDelete:
# - raycluster-complete-worker-large-group-bdtwh
@@ -130,8 +130,8 @@ spec:
image: busybox:1.28
command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for K8s Service $RAY_IP; sleep 2; done"]
containers:
- name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc'
image: rayproject/ray:2.0.0
- name: ray-worker
image: rayproject/ray:2.1.0
# Optimal resource allocation will depend on your Kubernetes infrastructure and might
# require some experimentation.
# Setting requests=limits is recommended with Ray. K8s limits are used for Ray-internal
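The scaleStrategy comments above can be made concrete with a short fragment. The sketch below is hypothetical: the field names come from the sample above, the pod name is a placeholder, and the pod template and rayStartParams are omitted for brevity. It shows a worker group scaled down by one replica while naming the specific pod to remove.

```yaml
# Hypothetical scale-down fragment. replicas has been decremented from 3 to 2, and
# workersToDelete names the pod that should go. If replicas were lowered by more
# than the number of listed pods, the operator would delete the remaining surplus
# workers at random, as described in the comments above.
workerGroupSpecs:
  - groupName: large-group
    replicas: 2
    minReplicas: 1
    maxReplicas: 10
    scaleStrategy:
      workersToDelete:
        - raycluster-autoscaler-worker-large-group-bdtwh  # placeholder pod name
```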
63 changes: 39 additions & 24 deletions ray-operator/config/samples/ray-cluster.autoscaler.yaml
@@ -11,7 +11,7 @@ metadata:
name: raycluster-autoscaler
spec:
# The version of Ray you are using. Make sure all Ray containers are running this version of Ray.
rayVersion: '2.0.0'
rayVersion: '2.1.0'
# If enableInTreeAutoscaling is true, the autoscaler sidecar will be added to the Ray head pod.
# Ray autoscaler integration is supported only for Ray versions >= 1.11.0
# Ray autoscaler integration is Beta with KubeRay >= 0.3.0 and Ray >= 2.0.0.
@@ -34,22 +34,21 @@ spec:
# imagePullPolicy optionally overrides the autoscaler container's image pull policy.
imagePullPolicy: Always
# resources specifies optional resource request and limit overrides for the autoscaler container.
# For large Ray clusters, we recommend monitoring container resource usage to determine if overriding the defaults is required.
# The default autoscaler resource limits and requests should be sufficient for production use-cases.
# However, for large Ray clusters, we recommend monitoring container resource usage to determine if overriding the defaults is required.
resources:
limits:
cpu: "500m"
memory: "512Mi"
requests:
cpu: "500m"
memory: "512Mi"
######################headGroupSpec#################################
# head group template and specs, (perhaps 'group' is not needed in the name)
# Ray head pod template
headGroupSpec:
# Kubernetes Service Type, valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer'
serviceType: ClusterIP
# the following params are used to complete the ray start: ray start --head --block ...
rayStartParams:
# Flag "no-monitor" will be automatically set when autoscaling is enabled.
dashboard-host: '0.0.0.0'
block: 'true'
# num-cpus: '1' # can be auto-completed from the limits
@@ -61,9 +60,9 @@ spec:
template:
spec:
containers:
# The Ray head pod
# The Ray head container
- name: ray-head
image: rayproject/ray:2.0.0
image: rayproject/ray:2.1.0
imagePullPolicy: Always
ports:
- containerPort: 6379
@@ -76,24 +75,38 @@ spec:
preStop:
exec:
command: ["/bin/sh","-c","ray stop"]
# The resource requests and limits in this config are too small for production!
# For an example with more realistic resource configuration, see
# ray-cluster.autoscaler.large.yaml.
# It is better to use a few large Ray pods than many small ones.
# For production, it is ideal to size each Ray pod to take up the
# entire Kubernetes node on which it is scheduled.
resources:
limits:
cpu: "1"
memory: "2G"
requests:
# For production use-cases, we recommend specifying integer CPU requests and limits.
# We also recommend setting requests equal to limits for both CPU and memory.
# For this example, we use a 500m CPU request to accommodate resource-constrained local
# Kubernetes testing environments such as KinD and minikube.
cpu: "500m"
memory: "1G"
# The rest state memory usage of the Ray head node is around 1Gb. We do not
# recommend allocating less than 2Gb memory for the Ray head pod.
# For production use-cases, we recommend allocating at least 8Gb memory for each Ray container.
memory: "2G"
workerGroupSpecs:
# the pod replicas in this group typed worker
- replicas: 1
minReplicas: 1
maxReplicas: 300
maxReplicas: 10
# logical group name, for this called small-group, also can be functional
groupName: small-group
# if worker pods need to be added, we can simply increment the replicas
# if worker pods need to be removed, we decrement the replicas, and populate the podsToDelete list
# the operator will remove pods from the list until the number of replicas is satisfied
# when a pod is confirmed to be deleted, its name will be removed from the list below
# If worker pods need to be added, we can increment the replicas.
# If worker pods need to be removed, we decrement the replicas, and populate the workersToDelete list.
# The operator will remove pods from the list until the desired number of replicas is satisfied.
# If the difference between the current replica count and the desired replicas is greater than the
# number of entries in workersToDelete, random worker pods will be deleted.
#scaleStrategy:
# workersToDelete:
# - raycluster-complete-worker-small-group-bdtwh
@@ -104,31 +117,33 @@
block: 'true'
#pod template
template:
metadata:
labels:
key: value
# annotations for pod
annotations:
key: value
spec:
initContainers:
# the env var $RAY_IP is set by the operator if missing, with the value of the head service name
- name: init
image: busybox:1.28
command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for K8s Service $RAY_IP; sleep 2; done"]
containers:
- name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc'
image: rayproject/ray:2.0.0
# environment variables to set in the container.Optional.
# Refer to https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/
- name: ray-worker
image: rayproject/ray:2.1.0
lifecycle:
preStop:
exec:
command: ["/bin/sh","-c","ray stop"]
# The resource requests and limits in this config are too small for production!
# For an example with more realistic resource configuration, see
# ray-cluster.autoscaler.large.yaml.
# It is better to use a few large Ray pods than many small ones.
# For production, it is ideal to size each Ray pod to take up the
# entire Kubernetes node on which it is scheduled.
resources:
limits:
cpu: "1"
memory: "1G"
# For production use-cases, we recommend specifying integer CPU requests and limits.
# We also recommend setting requests equal to limits for both CPU and memory.
# For this example, we use a 500m CPU request to accommodate resource-constrained local
# Kubernetes testing environments such as KinD and minikube.
requests:
cpu: "500m"
memory: "512Mi"
memory: "1G"