diff --git a/ray-operator/config/samples/ray-cluster.autoscaler.large.yaml b/ray-operator/config/samples/ray-cluster.autoscaler.large.yaml index b171aa361f..c8439e6870 100644 --- a/ray-operator/config/samples/ray-cluster.autoscaler.large.yaml +++ b/ray-operator/config/samples/ray-cluster.autoscaler.large.yaml @@ -16,8 +16,8 @@ kind: RayCluster metadata: labels: controller-tools.k8s.io: "1.0" - # An unique identifier for the head node and workers of this cluster. - name: raycluster-autoscaler-large + # A unique identifier for the head node and workers of this cluster. + name: raycluster-autoscaler spec: # The version of Ray you are using. Make sure all Ray containers are running this version of Ray. rayVersion: '1.13.0' @@ -38,7 +38,7 @@ spec: idleTimeoutSeconds: 60 # image optionally overrides the autoscaler's container image. # If instance.spec.rayVersion is at least "2.0.0", the autoscaler will default to the same image as - # the ray container by default. For older Ray versions, the autoscaler will default to using the Ray 2.0.0 image. + # the ray container. For older Ray versions, the autoscaler will default to using the Ray 2.0.0 image. ## image: "my-repo/my-custom-autoscaler-image:tag" # imagePullPolicy optionally overrides the autoscaler container's image pull policy. imagePullPolicy: Always @@ -51,7 +51,7 @@ spec: requests: cpu: "500m" memory: "512Mi" - ######################headGroupSpecs################################# + ######################headGroupSpec################################# # head group template and specs, (perhaps 'group' is not needed in the name) headGroupSpec: # Kubernetes Service Type, valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer' @@ -61,16 +61,13 @@ spec: # logical group name, for this called head-group, also can be functional # pod type head or worker # rayNodeType: head # Not needed since it is under the headgroup # the following params are used to complete the ray start: ray start --head --block --port=6379 ... 
rayStartParams: - # Flag "no-monitor" must be set when running the autoscaler in - # a sidecar container. - port: '6379' + # Flag "no-monitor" will be automatically set when autoscaling is enabled. dashboard-host: '0.0.0.0' - node-ip-address: $MY_POD_IP # auto-completed as the head pod IP block: 'true' - num-cpus: '1' # can be auto-completed from the limits + # num-cpus: '14' # can be auto-completed from the limits # Use `resources` to optionally specify custom resource annotations for the Ray node. # The value of `resources` is a string-integer mapping. - # Currently, `resources` must be provided in the unfortunate format demonstrated below: + # Currently, `resources` must be provided in the specific format demonstrated below: # resources: '"{\"Custom1\": 1, \"Custom2\": 5}"' #pod template template: @@ -86,36 +83,11 @@ spec: # resource accounting. K8s requests are not used by Ray. resources: limits: - cpu: "14" - memory: "54Gi" + cpu: 14 + memory: 54Gi requests: - cpu: "14" - memory: "54Gi" - env: - - name: CPU_REQUEST - valueFrom: - resourceFieldRef: - containerName: ray-head - resource: requests.cpu - - name: CPU_LIMITS - valueFrom: - resourceFieldRef: - containerName: ray-head - resource: limits.cpu - - name: MEMORY_LIMITS - valueFrom: - resourceFieldRef: - containerName: ray-head - resource: limits.memory - - name: MEMORY_REQUESTS - valueFrom: - resourceFieldRef: - containerName: ray-head - resource: requests.memory - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP + cpu: 14 + memory: 54Gi ports: - containerPort: 6379 name: gcs @@ -132,7 +104,7 @@ spec: - replicas: 1 minReplicas: 1 maxReplicas: 10 - # logical group name, for this called small-group, also can be functional + # logical group name, for this called large-group, also can be functional groupName: large-group # if worker pods need to be added, we can simply increment the replicas # if worker pods need to be removed, we decrement the replicas, and populate the podsToDelete list @@ 
-140,14 +112,11 @@ spec: # when a pod is confirmed to be deleted, its name will be removed from the list below #scaleStrategy: # workersToDelete: - # - raycluster-complete-worker-small-group-bdtwh - # - raycluster-complete-worker-small-group-hv457 - # - raycluster-complete-worker-small-group-k8tj7 + # - raycluster-complete-worker-large-group-bdtwh + # - raycluster-complete-worker-large-group-hv457 + # - raycluster-complete-worker-large-group-k8tj7 # the following params are used to complete the ray start: ray start --block --node-ip-address= ... rayStartParams: - #redis-password: '5241590000000000' - redis-password: 'LetMeInRay' # Deprecated since Ray 1.11 due to GCS bootstrapping enabled - node-ip-address: $MY_POD_IP block: 'true' #pod template template: @@ -172,48 +141,11 @@ spec: # resource accounting. K8s requests are not used by Ray. resources: limits: - cpu: "14" - memory: "54Gi" + cpu: 14 + memory: 54Gi requests: - cpu: "14" - memory: "54Gi" - # environment variables to set in the container.Optional. 
- # Refer to https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/ - env: - - name: RAY_DISABLE_DOCKER_CPU_WARNING - value: "1" - - name: TYPE - value: "worker" - - name: CPU_REQUEST - valueFrom: - resourceFieldRef: - containerName: machine-learning - resource: requests.cpu - - name: CPU_LIMITS - valueFrom: - resourceFieldRef: - containerName: machine-learning - resource: limits.cpu - - name: MEMORY_LIMITS - valueFrom: - resourceFieldRef: - containerName: machine-learning - resource: limits.memory - - name: MEMORY_REQUESTS - valueFrom: - resourceFieldRef: - containerName: machine-learning - resource: requests.memory - - name: MY_POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - ports: - - containerPort: 80 + cpu: 14 + memory: 54Gi lifecycle: preStop: exec: diff --git a/ray-operator/config/samples/ray-cluster.autoscaler.yaml b/ray-operator/config/samples/ray-cluster.autoscaler.yaml index 2c0d58776a..7322fcde95 100644 --- a/ray-operator/config/samples/ray-cluster.autoscaler.yaml +++ b/ray-operator/config/samples/ray-cluster.autoscaler.yaml @@ -7,7 +7,7 @@ kind: RayCluster metadata: labels: controller-tools.k8s.io: "1.0" - # An unique identifier for the head node and workers of this cluster. + # A unique identifier for the head node and workers of this cluster. name: raycluster-autoscaler spec: # The version of Ray you are using. Make sure all Ray containers are running this version of Ray. @@ -29,7 +29,7 @@ spec: idleTimeoutSeconds: 60 # image optionally overrides the autoscaler's container image. # If instance.spec.rayVersion is at least "2.0.0", the autoscaler will default to the same image as - # the ray container by default. For older Ray versions, the autoscaler will default to using the Ray 2.0.0 image. + # the ray container. For older Ray versions, the autoscaler will default to using the Ray 2.0.0 image. 
## image: "my-repo/my-custom-autoscaler-image:tag" # imagePullPolicy optionally overrides the autoscaler container's image pull policy. imagePullPolicy: Always @@ -42,7 +42,7 @@ spec: requests: cpu: "500m" memory: "512Mi" - ######################headGroupSpecs################################# + ######################headGroupSpec################################# # head group template and specs, (perhaps 'group' is not needed in the name) headGroupSpec: # Kubernetes Service Type, valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer' @@ -50,18 +50,15 @@ spec: # logical group name, for this called head-group, also can be functional # pod type head or worker # rayNodeType: head # Not needed since it is under the headgroup - # the following params are used to complete the ray start: ray start --head --block --port=6379 ... + # the following params are used to complete the ray start: ray start --head --block ... rayStartParams: - # Flag "no-monitor" must be set when running the autoscaler in - # a sidecar container. - port: '6379' + # Flag "no-monitor" will be automatically set when autoscaling is enabled. dashboard-host: '0.0.0.0' - node-ip-address: $MY_POD_IP # auto-completed as the head pod IP block: 'true' - num-cpus: '1' # can be auto-completed from the limits + # num-cpus: '1' # can be auto-completed from the limits # Use `resources` to optionally specify custom resource annotations for the Ray node. # The value of `resources` is a string-integer mapping. 
- # Currently, `resources` must be provided in the unfortunate format demonstrated below: + # Currently, `resources` must be provided in the specific format demonstrated below: # resources: '"{\"Custom1\": 1, \"Custom2\": 5}"' #pod template template: @@ -71,31 +68,6 @@ spec: - name: ray-head image: rayproject/ray:1.13.0 imagePullPolicy: Always - env: - - name: CPU_REQUEST - valueFrom: - resourceFieldRef: - containerName: ray-head - resource: requests.cpu - - name: CPU_LIMITS - valueFrom: - resourceFieldRef: - containerName: ray-head - resource: limits.cpu - - name: MEMORY_LIMITS - valueFrom: - resourceFieldRef: - containerName: ray-head - resource: limits.memory - - name: MEMORY_REQUESTS - valueFrom: - resourceFieldRef: - containerName: ray-head - resource: requests.memory - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP ports: - containerPort: 6379 name: gcs @@ -130,11 +102,8 @@ spec: # - raycluster-complete-worker-small-group-bdtwh # - raycluster-complete-worker-small-group-hv457 # - raycluster-complete-worker-small-group-k8tj7 - # the following params are used to complete the ray start: ray start --block --node-ip-address= ... + # the following params are used to complete the ray start: ray start --block ... rayStartParams: - #redis-password: '5241590000000000' - redis-password: 'LetMeInRay' # Deprecated since Ray 1.11 due to GCS bootstrapping enabled - node-ip-address: $MY_POD_IP block: 'true' #pod template template: @@ -155,41 +124,6 @@ spec: image: rayproject/ray:1.13.0 # environment variables to set in the container.Optional. 
# Refer to https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/ - env: - - name: RAY_DISABLE_DOCKER_CPU_WARNING - value: "1" - - name: TYPE - value: "worker" - - name: CPU_REQUEST - valueFrom: - resourceFieldRef: - containerName: machine-learning - resource: requests.cpu - - name: CPU_LIMITS - valueFrom: - resourceFieldRef: - containerName: machine-learning - resource: limits.cpu - - name: MEMORY_LIMITS - valueFrom: - resourceFieldRef: - containerName: machine-learning - resource: limits.memory - - name: MEMORY_REQUESTS - valueFrom: - resourceFieldRef: - containerName: machine-learning - resource: requests.memory - - name: MY_POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - ports: - - containerPort: 80 lifecycle: preStop: exec: diff --git a/ray-operator/config/samples/ray-cluster.complete.large.yaml b/ray-operator/config/samples/ray-cluster.complete.large.yaml index 9d14590d20..a0411f3c6e 100644 --- a/ray-operator/config/samples/ray-cluster.complete.large.yaml +++ b/ray-operator/config/samples/ray-cluster.complete.large.yaml @@ -10,8 +10,8 @@ kind: RayCluster metadata: labels: controller-tools.k8s.io: "1.0" - # An unique identifier for the head node and workers of this cluster. - name: raycluster-complete-large + # A unique identifier for the head node and workers of this cluster. + name: raycluster-complete spec: rayVersion: '1.13.0' ######################headGroupSpecs################################# @@ -22,12 +22,8 @@ spec: # for the head group, replicas should always be 1. # headGroupSpec.replicas is deprecated in KubeRay >= 0.3.0. replicas: 1 - # logical group name, for this called head-group, also can be functional - # pod type head or worker - # rayNodeType: head # Not needed since it is under the headgroup - # the following params are used to complete the ray start: ray start --head --block --redis-port=6379 ... 
+ # the following params are used to complete the ray start: ray start --head --block --dashboard-host: '0.0.0.0' ... rayStartParams: - port: '6379' dashboard-host: '0.0.0.0' block: 'true' #pod template @@ -52,36 +48,11 @@ spec: # resource accounting. K8s requests are not used by Ray. resources: limits: - cpu: "14" - memory: "54Gi" + cpu: 14 + memory: 54Gi requests: - cpu: "14" - memory: "54Gi" - env: - - name: CPU_REQUEST - valueFrom: - resourceFieldRef: - containerName: ray-head - resource: requests.cpu - - name: CPU_LIMITS - valueFrom: - resourceFieldRef: - containerName: ray-head - resource: limits.cpu - - name: MEMORY_LIMITS - valueFrom: - resourceFieldRef: - containerName: ray-head - resource: limits.memory - - name: MEMORY_REQUESTS - valueFrom: - resourceFieldRef: - containerName: ray-head - resource: requests.memory - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP + cpu: 14 + memory: 54Gi ports: - containerPort: 6379 name: gcs @@ -98,8 +69,8 @@ spec: - replicas: 1 minReplicas: 1 maxReplicas: 10 - # logical group name, for this called large-group, also can be functional - groupName: large-group + # logical group name, for this called small-group, also can be functional + groupName: small-group # if worker pods need to be added, we can simply increment the replicas # if worker pods need to be removed, we decrement the replicas, and populate the podsToDelete list # the operator will remove pods from the list until the number of replicas is satisfied @@ -109,7 +80,7 @@ spec: # - raycluster-complete-worker-small-group-bdtwh # - raycluster-complete-worker-small-group-hv457 # - raycluster-complete-worker-small-group-k8tj7 - # the following params are used to complete the ray start: ray start --block --node-ip-address= ... + # the following params are used to complete the ray start: ray start --block ... 
rayStartParams: block: 'true' #pod template @@ -119,59 +90,21 @@ spec: rayCluster: raycluster-complete # will be injected if missing rayNodeType: worker # will be injected if missing groupName: small-group # will be injected if missing - # annotations for pod - annotations: - key: value spec: containers: - name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc' image: rayproject/ray:1.13.0 + # Optimal resource allocation will depend on your Kubernetes infrastructure and might + # require some experimentation. # Setting requests=limits is recommended with Ray. K8s limits are used for Ray-internal # resource accounting. K8s requests are not used by Ray. resources: limits: - cpu: "14" - memory: "54Gi" + cpu: 14 + memory: 54Gi requests: - cpu: "14" - memory: "54Gi" - # environment variables to set in the container.Optional. - # Refer to https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/ - env: - - name: RAY_DISABLE_DOCKER_CPU_WARNING - value: "1" - - name: TYPE - value: "worker" - - name: CPU_REQUEST - valueFrom: - resourceFieldRef: - containerName: machine-learning - resource: requests.cpu - - name: CPU_LIMITS - valueFrom: - resourceFieldRef: - containerName: machine-learning - resource: limits.cpu - - name: MEMORY_LIMITS - valueFrom: - resourceFieldRef: - containerName: machine-learning - resource: limits.memory - - name: MEMORY_REQUESTS - valueFrom: - resourceFieldRef: - containerName: machine-learning - resource: requests.memory - - name: MY_POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - ports: - - containerPort: 80 + cpu: 14 + memory: 54Gi lifecycle: preStop: exec: @@ -186,7 +119,7 @@ spec: - name: init-myservice image: busybox:1.28 # Change the cluster postfix if you don't have a default setting - command: ['sh', '-c', 
"until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"] + command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"] # use volumes # Refer to https://kubernetes.io/docs/concepts/storage/volumes/ volumes: diff --git a/ray-operator/config/samples/ray-cluster.complete.yaml b/ray-operator/config/samples/ray-cluster.complete.yaml index b72c99f426..405e3aaf34 100644 --- a/ray-operator/config/samples/ray-cluster.complete.yaml +++ b/ray-operator/config/samples/ray-cluster.complete.yaml @@ -7,27 +7,21 @@ kind: RayCluster metadata: labels: controller-tools.k8s.io: "1.0" - # An unique identifier for the head node and workers of this cluster. + # A unique identifier for the head node and workers of this cluster. name: raycluster-complete spec: - rayVersion: '1.11.0' - ######################headGroupSpecs################################# + rayVersion: '1.13.0' + ######################headGroupSpec################################# # head group template and specs, (perhaps 'group' is not needed in the name) headGroupSpec: # Kubernetes Service Type, valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer' serviceType: ClusterIP - # the pod replicas in this group typed head (assuming there could be more than 1 in the future) + # for the head group, replicas should always be 1. + # headGroupSpec.replicas is deprecated in KubeRay >= 0.3.0. replicas: 1 - # logical group name, for this called head-group, also can be functional - # pod type head or worker - # rayNodeType: head # Not needed since it is under the headgroup - # the following params are used to complete the ray start: ray start --head --block --redis-port=6379 ... + # the following params are used to complete the ray start: ray start --head --block --dashboard-host: '0.0.0.0' ... 
rayStartParams: - port: '6379' - object-store-memory: '100000000' - redis-password: 'LetMeInRay' # Deprecated since Ray 1.11 due to GCS bootstrapping enabled dashboard-host: '0.0.0.0' - node-ip-address: $MY_POD_IP # auto-completed as the head pod IP block: 'true' #pod template template: @@ -44,32 +38,7 @@ spec: spec: containers: - name: ray-head - image: rayproject/ray:1.11.0 - env: - - name: CPU_REQUEST - valueFrom: - resourceFieldRef: - containerName: ray-head - resource: requests.cpu - - name: CPU_LIMITS - valueFrom: - resourceFieldRef: - containerName: ray-head - resource: limits.cpu - - name: MEMORY_LIMITS - valueFrom: - resourceFieldRef: - containerName: ray-head - resource: limits.memory - - name: MEMORY_REQUESTS - valueFrom: - resourceFieldRef: - containerName: ray-head - resource: requests.memory - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP + image: rayproject/ray:1.13.0 ports: - containerPort: 6379 name: gcs @@ -93,21 +62,19 @@ spec: - replicas: 1 minReplicas: 1 maxReplicas: 10 - # logical group name, for this called small-group, also can be functional - groupName: small-group + # logical group name, for this called large-group, also can be functional + groupName: large-group # if worker pods need to be added, we can simply increment the replicas # if worker pods need to be removed, we decrement the replicas, and populate the podsToDelete list # the operator will remove pods from the list until the number of replicas is satisfied # when a pod is confirmed to be deleted, its name will be removed from the list below #scaleStrategy: # workersToDelete: - # - raycluster-complete-worker-small-group-bdtwh - # - raycluster-complete-worker-small-group-hv457 - # - raycluster-complete-worker-small-group-k8tj7 - # the following params are used to complete the ray start: ray start --block --node-ip-address= ... 
+ # - raycluster-complete-worker-large-group-bdtwh + # - raycluster-complete-worker-large-group-hv457 + # - raycluster-complete-worker-large-group-k8tj7 + # the following params are used to complete the ray start: ray start --block rayStartParams: - redis-password: 'LetMeInRay' # Deprecated since Ray 1.11 due to GCS bootstrapping enabled - node-ip-address: $MY_POD_IP block: 'true' #pod template template: @@ -116,56 +83,12 @@ spec: rayCluster: raycluster-complete # will be injected if missing rayNodeType: worker # will be injected if missing groupName: small-group # will be injected if missing - # annotations for pod - annotations: - key: value spec: - initContainers: - # the env var $RAY_IP is set by the operator if missing, with the value of the head service name - - name: init-myservice - image: busybox:1.28 - # Change the cluster postfix if you don't have a default setting - command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"] containers: - name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc' - image: rayproject/ray:1.11.0 + image: rayproject/ray:1.13.0 # environment variables to set in the container.Optional. 
# Refer to https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/ - env: - - name: RAY_DISABLE_DOCKER_CPU_WARNING - value: "1" - - name: TYPE - value: "worker" - - name: CPU_REQUEST - valueFrom: - resourceFieldRef: - containerName: machine-learning - resource: requests.cpu - - name: CPU_LIMITS - valueFrom: - resourceFieldRef: - containerName: machine-learning - resource: limits.cpu - - name: MEMORY_LIMITS - valueFrom: - resourceFieldRef: - containerName: machine-learning - resource: limits.memory - - name: MEMORY_REQUESTS - valueFrom: - resourceFieldRef: - containerName: machine-learning - resource: requests.memory - - name: MY_POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - ports: - - containerPort: 80 lifecycle: preStop: exec: @@ -182,6 +105,12 @@ spec: requests: cpu: "500m" memory: "256Mi" + initContainers: + # the env var $RAY_IP is set by the operator if missing, with the value of the head service name + - name: init-myservice + image: busybox:1.28 + # Change the cluster postfix if you don't have a default setting + command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"] # use volumes # Refer to https://kubernetes.io/docs/concepts/storage/volumes/ volumes: