diff --git a/apiserver/pkg/util/cluster.go b/apiserver/pkg/util/cluster.go index 56c32c1612..a0f45a56ba 100644 --- a/apiserver/pkg/util/cluster.go +++ b/apiserver/pkg/util/cluster.go @@ -223,7 +223,7 @@ func buildWorkerPodTemplate(imageVersion string, envs map[string]string, spec *a Command: []string{ "sh", "-c", - "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for K8s Service $RAY_IP; sleep 2; done", + "until nslookup $FQ_RAY_IP; do echo waiting for K8s Service $FQ_RAY_IP; sleep 2; done", }, }, }, diff --git a/helm-chart/ray-cluster/templates/raycluster-cluster.yaml b/helm-chart/ray-cluster/templates/raycluster-cluster.yaml index 700ace391d..665560a7c2 100644 --- a/helm-chart/ray-cluster/templates/raycluster-cluster.yaml +++ b/helm-chart/ray-cluster/templates/raycluster-cluster.yaml @@ -97,7 +97,7 @@ spec: initContainers: - name: init image: {{ $values.initContainerImage | default "busybox:1.28" }} - command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for K8s Service $RAY_IP; sleep 2; done"] + command: ['sh', '-c', "until nslookup $FQ_RAY_IP; do echo waiting for K8s Service $FQ_RAY_IP; sleep 2; done"] securityContext: {{- toYaml $values.initContainerSecurityContext | nindent 14 }} containers: @@ -163,7 +163,7 @@ spec: initContainers: - name: init image: {{ .Values.worker.initContainerImage | default "busybox:1.28" }} - command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for K8s Service $RAY_IP; sleep 2; done"] + command: ['sh', '-c', "until nslookup $FQ_RAY_IP; do echo waiting for K8s Service $FQ_RAY_IP; sleep 2; done"] securityContext: {{- toYaml .Values.worker.initContainerSecurityContext | nindent 14 }} containers: diff --git a/ray-operator/config/samples/ray-cluster.autoscaler.large.yaml 
b/ray-operator/config/samples/ray-cluster.autoscaler.large.yaml index 7df0f69312..820ddc3164 100644 --- a/ray-operator/config/samples/ray-cluster.autoscaler.large.yaml +++ b/ray-operator/config/samples/ray-cluster.autoscaler.large.yaml @@ -128,7 +128,7 @@ spec: key: value spec: initContainers: - # the env var $RAY_IP is set by the operator if missing, with the value of the head service name + # the env var $FQ_RAY_IP is set by the operator if missing, with the value of the head service name - name: init image: busybox:1.28 command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for K8s Service $RAY_IP; sleep 2; done"] diff --git a/ray-operator/config/samples/ray-cluster.autoscaler.yaml b/ray-operator/config/samples/ray-cluster.autoscaler.yaml index dd1dc263ef..9524cdb2f3 100644 --- a/ray-operator/config/samples/ray-cluster.autoscaler.yaml +++ b/ray-operator/config/samples/ray-cluster.autoscaler.yaml @@ -122,7 +122,7 @@ spec: template: spec: initContainers: - # the env var $RAY_IP is set by the operator if missing, with the value of the head service name + # the env var $FQ_RAY_IP is set by the operator if missing, with the value of the head service name - name: init image: busybox:1.28 command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for K8s Service $RAY_IP; sleep 2; done"] diff --git a/ray-operator/config/samples/ray-cluster.complete.large.yaml b/ray-operator/config/samples/ray-cluster.complete.large.yaml index fe6a6d3ff0..688992efdf 100644 --- a/ray-operator/config/samples/ray-cluster.complete.large.yaml +++ b/ray-operator/config/samples/ray-cluster.complete.large.yaml @@ -111,7 +111,7 @@ spec: - mountPath: /tmp/ray name: ray-logs initContainers: - # the env var $RAY_IP is set by the operator if missing, with the value of the head service name + # the env var $FQ_RAY_IP is set by the 
operator if missing, with the value of the head service name - name: init image: busybox:1.28 # Change the cluster postfix if you don't have a default setting diff --git a/ray-operator/config/samples/ray-cluster.complete.yaml b/ray-operator/config/samples/ray-cluster.complete.yaml index 1ff1beb8d2..f6b9588034 100644 --- a/ray-operator/config/samples/ray-cluster.complete.yaml +++ b/ray-operator/config/samples/ray-cluster.complete.yaml @@ -122,7 +122,7 @@ spec: # For production use-cases, we recommend allocating at least 8Gb memory for each Ray container. memory: "1G" initContainers: - # the env var $RAY_IP is set by the operator if missing, with the value of the head service name + # the env var $FQ_RAY_IP is set by the operator if missing, with the value of the head service name - name: init image: busybox:1.28 # Change the cluster postfix if you don't have a default setting diff --git a/ray-operator/config/samples/ray-cluster.external-redis.yaml b/ray-operator/config/samples/ray-cluster.external-redis.yaml index a9f19e47ff..936821ba15 100644 --- a/ray-operator/config/samples/ray-cluster.external-redis.yaml +++ b/ray-operator/config/samples/ray-cluster.external-redis.yaml @@ -124,7 +124,7 @@ spec: template: spec: initContainers: # to avoid worker crashing before head service is created - # the env var $RAY_IP is set by the operator if missing, with the value of the head service name + # the env var $FQ_RAY_IP is set by the operator if missing, with the value of the head service name - name: init image: busybox:1.28 # Change the cluster postfix if you don't have a default setting diff --git a/ray-operator/config/samples/ray-cluster.heterogeneous.yaml b/ray-operator/config/samples/ray-cluster.heterogeneous.yaml index 7b1047752d..c5a3bd4705 100644 --- a/ray-operator/config/samples/ray-cluster.heterogeneous.yaml +++ b/ray-operator/config/samples/ray-cluster.heterogeneous.yaml @@ -81,7 +81,7 @@ spec: template: spec: initContainers: # to avoid worker crashing before head 
service is created - # the env var $RAY_IP is set by the operator if missing, with the value of the head service name + # the env var $FQ_RAY_IP is set by the operator if missing, with the value of the head service name - name: init image: busybox:1.28 # Change the cluster postfix if you don't have a default setting @@ -125,7 +125,7 @@ spec: - name: init image: busybox:1.28 # Change the cluster postfix if you don't have a default setting - command: ['sh', '-c', "until nslookup raycluster-heterogeneous-head-svc.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for K8s Service $RAY_IP; sleep 2; done"] + command: ['sh', '-c', "until nslookup $FQ_RAY_IP; do echo waiting for K8s Service $FQ_RAY_IP; sleep 2; done"] containers: - name: ray-worker # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc' image: rayproject/ray:2.2.0 diff --git a/ray-operator/config/samples/ray_v1alpha1_rayjob.yaml b/ray-operator/config/samples/ray_v1alpha1_rayjob.yaml index 0d4114228e..122c738a45 100644 --- a/ray-operator/config/samples/ray_v1alpha1_rayjob.yaml +++ b/ray-operator/config/samples/ray_v1alpha1_rayjob.yaml @@ -4,9 +4,9 @@ metadata: name: rayjob-sample spec: entrypoint: python /home/ray/samples/sample_code.py - # runtimeEnv decoded to '{ - # "pip": [ - # "requests==2.26.0", + # runtimeEnv decoded to '{ + # "pip": [ + # "requests==2.26.0", # "pendulum==2.1.2" # ], # "env_vars": { @@ -65,7 +65,7 @@ spec: template: spec: initContainers: - # the env var $RAY_IP is set by the operator if missing, with the value of the head service name + # the env var $FQ_RAY_IP is set by the operator if missing, with the value of the head service name - name: init image: busybox:1.28 command: [ 'sh', '-c', "until nslookup $RAY_IP.$(cat 
/var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for K8s Service $RAY_IP; sleep 2; done" ] diff --git a/ray-operator/config/samples/ray_v1alpha1_rayservice.yaml b/ray-operator/config/samples/ray_v1alpha1_rayservice.yaml index 1ed488ca3d..2b0b6c7641 100644 --- a/ray-operator/config/samples/ray_v1alpha1_rayservice.yaml +++ b/ray-operator/config/samples/ray_v1alpha1_rayservice.yaml @@ -89,7 +89,7 @@ spec: template: spec: initContainers: - # the env var $RAY_IP is set by the operator if missing, with the value of the head service name + # the env var $FQ_RAY_IP is set by the operator if missing, with the value of the head service name - name: init image: busybox:1.28 command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for K8s Service $RAY_IP; sleep 2; done"] diff --git a/ray-operator/config/security/ray-cluster.pod-security.yaml b/ray-operator/config/security/ray-cluster.pod-security.yaml index fc8f776455..947c8dfe30 100644 --- a/ray-operator/config/security/ray-cluster.pod-security.yaml +++ b/ray-operator/config/security/ray-cluster.pod-security.yaml @@ -118,11 +118,11 @@ spec: seccompProfile: type: RuntimeDefault initContainers: - # the env var $RAY_IP is set by the operator if missing, with the value of the head service name + # the env var $FQ_RAY_IP is set by the operator if missing, with the value of the head service name - name: init-myservice image: busybox:1.28 # Change the cluster postfix if you don't have a default setting - command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"] + command: ['sh', '-c', "until nslookup $FQ_RAY_IP; do echo waiting for K8s Service $FQ_RAY_IP; sleep 2; done"] securityContext: runAsUser: 1000 
allowPrivilegeEscalation: false diff --git a/ray-operator/controllers/ray/common/constant.go b/ray-operator/controllers/ray/common/constant.go index 179626c52d..d7431b6b10 100644 --- a/ray-operator/controllers/ray/common/constant.go +++ b/ray-operator/controllers/ray/common/constant.go @@ -74,6 +74,7 @@ const ( NAMESPACE = "NAMESPACE" CLUSTER_NAME = "CLUSTER_NAME" RAY_IP = "RAY_IP" + FQ_RAY_IP = "FQ_RAY_IP" RAY_PORT = "RAY_PORT" RAY_ADDRESS = "RAY_ADDRESS" REDIS_PASSWORD = "REDIS_PASSWORD" diff --git a/ray-operator/controllers/ray/common/ingress.go b/ray-operator/controllers/ray/common/ingress.go index 32ed980d5f..1f5c7a8734 100644 --- a/ray-operator/controllers/ray/common/ingress.go +++ b/ray-operator/controllers/ray/common/ingress.go @@ -50,7 +50,7 @@ func BuildIngressForHeadService(cluster rayiov1alpha1.RayCluster) (*networkingv1 PathType: &pathType, Backend: networkingv1.IngressBackend{ Service: &networkingv1.IngressServiceBackend{ - Name: utils.CheckName(utils.GenerateServiceName(cluster.Name)), + Name: utils.GenerateServiceName(cluster.Name), Port: networkingv1.ServiceBackendPort{ Number: dashboardPort, }, diff --git a/ray-operator/controllers/ray/common/pod.go b/ray-operator/controllers/ray/common/pod.go index 2bc0e1f271..88185a2a0f 100644 --- a/ray-operator/controllers/ray/common/pod.go +++ b/ray-operator/controllers/ray/common/pod.go @@ -85,7 +85,7 @@ func initTemplateAnnotations(instance rayiov1alpha1.RayCluster, podTemplate *v1. } // DefaultHeadPodTemplate sets the config values -func DefaultHeadPodTemplate(instance rayiov1alpha1.RayCluster, headSpec rayiov1alpha1.HeadGroupSpec, podName string, svcName string, headPort string) v1.PodTemplateSpec { +func DefaultHeadPodTemplate(instance rayiov1alpha1.RayCluster, headSpec rayiov1alpha1.HeadGroupSpec, podName string, headPort string) v1.PodTemplateSpec { // TODO (Dmitri) The argument headPort is essentially unused; // headPort is passed into setMissingRayStartParams but unused there for the head pod. 
// To mitigate this awkwardness and reduce code redundancy, unify head and worker pod configuration logic. @@ -100,7 +100,7 @@ func DefaultHeadPodTemplate(instance rayiov1alpha1.RayCluster, headSpec rayiov1a podTemplate.Labels = make(map[string]string) } podTemplate.Labels = labelPod(rayiov1alpha1.HeadNode, instance.Name, "headgroup", instance.Spec.HeadGroupSpec.Template.ObjectMeta.Labels) - headSpec.RayStartParams = setMissingRayStartParams(headSpec.RayStartParams, rayiov1alpha1.HeadNode, svcName, headPort) + headSpec.RayStartParams = setMissingRayStartParams(headSpec.RayStartParams, rayiov1alpha1.HeadNode, headPort, "") headSpec.RayStartParams = setAgentListPortStartParams(instance, headSpec.RayStartParams) initTemplateAnnotations(instance, &podTemplate) @@ -183,7 +183,7 @@ func autoscalerSupportIsStable(rayVersion string) bool { } // DefaultWorkerPodTemplate sets the config values -func DefaultWorkerPodTemplate(instance rayiov1alpha1.RayCluster, workerSpec rayiov1alpha1.WorkerGroupSpec, podName string, svcName string, headPort string) v1.PodTemplateSpec { +func DefaultWorkerPodTemplate(instance rayiov1alpha1.RayCluster, workerSpec rayiov1alpha1.WorkerGroupSpec, podName string, fqdnRayIP string, headPort string) v1.PodTemplateSpec { podTemplate := workerSpec.Template podTemplate.GenerateName = podName if podTemplate.ObjectMeta.Namespace == "" { @@ -198,7 +198,7 @@ func DefaultWorkerPodTemplate(instance rayiov1alpha1.RayCluster, workerSpec rayi podTemplate.Labels = make(map[string]string) } podTemplate.Labels = labelPod(rayiov1alpha1.WorkerNode, instance.Name, workerSpec.GroupName, workerSpec.Template.ObjectMeta.Labels) - workerSpec.RayStartParams = setMissingRayStartParams(workerSpec.RayStartParams, rayiov1alpha1.WorkerNode, svcName, headPort) + workerSpec.RayStartParams = setMissingRayStartParams(workerSpec.RayStartParams, rayiov1alpha1.WorkerNode, headPort, fqdnRayIP) workerSpec.RayStartParams = setAgentListPortStartParams(instance, workerSpec.RayStartParams) 
initTemplateAnnotations(instance, &podTemplate) @@ -271,7 +271,7 @@ func initReadinessProbeHandler(probe *v1.Probe, rayNodeType rayiov1alpha1.RayNod } // BuildPod a pod config -func BuildPod(podTemplateSpec v1.PodTemplateSpec, rayNodeType rayiov1alpha1.RayNodeType, rayStartParams map[string]string, svcName string, headPort string, enableRayAutoscaler *bool, creator string) (aPod v1.Pod) { +func BuildPod(podTemplateSpec v1.PodTemplateSpec, rayNodeType rayiov1alpha1.RayNodeType, rayStartParams map[string]string, headPort string, enableRayAutoscaler *bool, creator string, fqdnRayIP string) (aPod v1.Pod) { pod := v1.Pod{ TypeMeta: metav1.TypeMeta{ APIVersion: "v1", @@ -326,10 +326,10 @@ func BuildPod(podTemplateSpec v1.PodTemplateSpec, rayNodeType rayiov1alpha1.RayN } for index := range pod.Spec.InitContainers { - setInitContainerEnvVars(&pod.Spec.InitContainers[index], svcName) + setInitContainerEnvVars(&pod.Spec.InitContainers[index], fqdnRayIP) } - setContainerEnvVars(&pod, rayContainerIndex, rayNodeType, rayStartParams, svcName, headPort, creator) + setContainerEnvVars(&pod, rayContainerIndex, rayNodeType, rayStartParams, fqdnRayIP, headPort, creator) // health check only if FT enabled if podTemplateSpec.Annotations != nil { @@ -535,20 +535,20 @@ func labelPod(rayNodeType rayiov1alpha1.RayNodeType, rayClusterName string, grou return labels } -func setInitContainerEnvVars(container *v1.Container, svcName string) { - // RAY_IP can be used in the DNS lookup +func setInitContainerEnvVars(container *v1.Container, fqdnRayIP string) { if container.Env == nil || len(container.Env) == 0 { container.Env = []v1.EnvVar{} } - if !envVarExists("RAY_IP", container.Env) { - ip := v1.EnvVar{Name: "RAY_IP"} - ip.Value = svcName - container.Env = append(container.Env, ip) + if len(fqdnRayIP) != 0 { // Worker Pod + container.Env = append(container.Env, + v1.EnvVar{Name: FQ_RAY_IP, Value: fqdnRayIP}, + // RAY_IP is deprecated and should be kept for backward compatibility purposes only. 
+ v1.EnvVar{Name: RAY_IP, Value: utils.ExtractRayIPFromFQDN(fqdnRayIP)}, + ) } } -func setContainerEnvVars(pod *v1.Pod, rayContainerIndex int, rayNodeType rayiov1alpha1.RayNodeType, rayStartParams map[string]string, svcName string, headPort string, creator string) { - // set IP to local host if head, or the the svc otherwise RAY_IP +func setContainerEnvVars(pod *v1.Pod, rayContainerIndex int, rayNodeType rayiov1alpha1.RayNodeType, rayStartParams map[string]string, fqdnRayIP string, headPort string, creator string) { // set the port RAY_PORT // set the password? container := &pod.Spec.Containers[rayContainerIndex] @@ -556,19 +556,18 @@ func setContainerEnvVars(pod *v1.Pod, rayContainerIndex int, rayNodeType rayiov1 container.Env = []v1.EnvVar{} } - var rayIP string - if rayNodeType == rayiov1alpha1.HeadNode { - // if head, use localhost - rayIP = LOCAL_HOST - } else { - // if worker, use the service name of the head - rayIP = svcName + // case 1: head => Use LOCAL_HOST + // case 2: worker => Use fqdnRayIP (fully qualified domain name) + ip := LOCAL_HOST + if rayNodeType == rayiov1alpha1.WorkerNode { + ip = fqdnRayIP + container.Env = append(container.Env, + v1.EnvVar{Name: FQ_RAY_IP, Value: ip}, + // RAY_IP is deprecated and should be kept for backward compatibility purposes only. + v1.EnvVar{Name: RAY_IP, Value: utils.ExtractRayIPFromFQDN(ip)}, + ) } - if !envVarExists(RAY_IP, container.Env) { - ipEnv := v1.EnvVar{Name: RAY_IP, Value: rayIP} - container.Env = append(container.Env, ipEnv) - } if !envVarExists(RAY_PORT, container.Env) { portEnv := v1.EnvVar{Name: RAY_PORT, Value: headPort} container.Env = append(container.Env, portEnv) @@ -595,7 +594,7 @@ func setContainerEnvVars(pod *v1.Pod, rayContainerIndex int, rayNodeType rayiov1 // Setting the RAY_ADDRESS env allows connecting to Ray using ray.init() when connecting // from within the cluster. 
if !envVarExists(RAY_ADDRESS, container.Env) { - rayAddress := fmt.Sprintf("%s:%s", rayIP, headPort) + rayAddress := fmt.Sprintf("%s:%s", ip, headPort) addressEnv := v1.EnvVar{Name: RAY_ADDRESS, Value: rayAddress} container.Env = append(container.Env, addressEnv) } @@ -636,11 +635,11 @@ func envVarExists(envName string, envVars []v1.EnvVar) bool { } // TODO auto complete params -func setMissingRayStartParams(rayStartParams map[string]string, nodeType rayiov1alpha1.RayNodeType, svcName string, headPort string) (completeStartParams map[string]string) { +func setMissingRayStartParams(rayStartParams map[string]string, nodeType rayiov1alpha1.RayNodeType, headPort string, fqdnRayIP string) (completeStartParams map[string]string) { // Note: The argument headPort is unused for nodeType == rayiov1alpha1.HeadNode. if nodeType == rayiov1alpha1.WorkerNode { if _, ok := rayStartParams["address"]; !ok { - address := fmt.Sprintf("%s:%s", svcName, headPort) + address := fmt.Sprintf("%s:%s", fqdnRayIP, headPort) rayStartParams["address"] = address } } diff --git a/ray-operator/controllers/ray/common/pod_test.go b/ray-operator/controllers/ray/common/pod_test.go index 0e2ba9c2c6..6e853f521e 100644 --- a/ray-operator/controllers/ray/common/pod_test.go +++ b/ray-operator/controllers/ray/common/pod_test.go @@ -343,9 +343,8 @@ func TestBuildPod(t *testing.T) { // Test head pod podName := strings.ToLower(cluster.Name + DashSymbol + string(rayiov1alpha1.HeadNode) + DashSymbol + utils.FormatInt32(0)) - svcName := utils.GenerateServiceName(cluster.Name) - podTemplateSpec := DefaultHeadPodTemplate(*cluster, cluster.Spec.HeadGroupSpec, podName, svcName, "6379") - pod := BuildPod(podTemplateSpec, rayiov1alpha1.HeadNode, cluster.Spec.HeadGroupSpec.RayStartParams, svcName, "6379", nil, "") + podTemplateSpec := DefaultHeadPodTemplate(*cluster, cluster.Spec.HeadGroupSpec, podName, "6379") + pod := BuildPod(podTemplateSpec, rayiov1alpha1.HeadNode, cluster.Spec.HeadGroupSpec.RayStartParams, "6379", 
nil, "", "") // Check RAY_ADDRESS env. checkPodEnv(t, pod, RAY_ADDRESS, "127.0.0.1:6379") @@ -386,21 +385,24 @@ func TestBuildPod(t *testing.T) { // testing worker pod worker := cluster.Spec.WorkerGroupSpecs[0] podName = cluster.Name + DashSymbol + string(rayiov1alpha1.WorkerNode) + DashSymbol + worker.GroupName + DashSymbol + utils.FormatInt32(0) - podTemplateSpec = DefaultWorkerPodTemplate(*cluster, worker, podName, svcName, "6379") - pod = BuildPod(podTemplateSpec, rayiov1alpha1.WorkerNode, worker.RayStartParams, svcName, "6379", nil, "") + fqdnRayIP := utils.GenerateFQDNServiceName(cluster.Name, cluster.Namespace) + podTemplateSpec = DefaultWorkerPodTemplate(*cluster, worker, podName, fqdnRayIP, "6379") + pod = BuildPod(podTemplateSpec, rayiov1alpha1.WorkerNode, worker.RayStartParams, "6379", nil, "", fqdnRayIP) - // Check RAY_ADDRESS env - checkPodEnv(t, pod, RAY_ADDRESS, "raycluster-sample-head-svc:6379") + // Check environment variables + checkPodEnv(t, pod, RAY_ADDRESS, "raycluster-sample-head-svc.default.svc.cluster.local:6379") + checkPodEnv(t, pod, FQ_RAY_IP, "raycluster-sample-head-svc.default.svc.cluster.local") + checkPodEnv(t, pod, RAY_IP, "raycluster-sample-head-svc") // Check RayStartParams - expectedResult = fmt.Sprintf("%s:6379", svcName) + expectedResult = fmt.Sprintf("%s:6379", fqdnRayIP) actualResult = cluster.Spec.WorkerGroupSpecs[0].RayStartParams["address"] if !reflect.DeepEqual(expectedResult, actualResult) { t.Fatalf("Expected `%v` but got `%v`", expectedResult, actualResult) } - expectedCommandArg := splitAndSort("ulimit -n 65536; ray start --block --memory=1073741824 --num-cpus=1 --num-gpus=3 --address=raycluster-sample-head-svc:6379 --port=6379 --metrics-export-port=8080") + expectedCommandArg := splitAndSort("ulimit -n 65536; ray start --block --memory=1073741824 --num-cpus=1 --num-gpus=3 --address=raycluster-sample-head-svc.default.svc.cluster.local:6379 --port=6379 --metrics-export-port=8080") actualCommandArg := 
splitAndSort(pod.Spec.Containers[0].Args[0]) if !reflect.DeepEqual(expectedCommandArg, actualCommandArg) { t.Fatalf("Expected `%v` but got `%v`", expectedCommandArg, actualCommandArg) @@ -414,9 +416,8 @@ func TestBuildPod_WithAutoscalerEnabled(t *testing.T) { cluster := instance.DeepCopy() cluster.Spec.EnableInTreeAutoscaling = &trueFlag podName := strings.ToLower(cluster.Name + DashSymbol + string(rayiov1alpha1.HeadNode) + DashSymbol + utils.FormatInt32(0)) - svcName := utils.GenerateServiceName(cluster.Name) - podTemplateSpec := DefaultHeadPodTemplate(*cluster, cluster.Spec.HeadGroupSpec, podName, svcName, "6379") - pod := BuildPod(podTemplateSpec, rayiov1alpha1.HeadNode, cluster.Spec.HeadGroupSpec.RayStartParams, svcName, "6379", &trueFlag, "") + podTemplateSpec := DefaultHeadPodTemplate(*cluster, cluster.Spec.HeadGroupSpec, podName, "6379") + pod := BuildPod(podTemplateSpec, rayiov1alpha1.HeadNode, cluster.Spec.HeadGroupSpec.RayStartParams, "6379", &trueFlag, "", "") actualResult := pod.Labels[RayClusterLabelKey] expectedResult := cluster.Name @@ -470,9 +471,8 @@ func TestBuildPod_WithCreatedByRayService(t *testing.T) { cluster := instance.DeepCopy() cluster.Spec.EnableInTreeAutoscaling = &trueFlag podName := strings.ToLower(cluster.Name + DashSymbol + string(rayiov1alpha1.HeadNode) + DashSymbol + utils.FormatInt32(0)) - svcName := utils.GenerateServiceName(cluster.Name) - podTemplateSpec := DefaultHeadPodTemplate(*cluster, cluster.Spec.HeadGroupSpec, podName, svcName, "6379") - pod := BuildPod(podTemplateSpec, rayiov1alpha1.HeadNode, cluster.Spec.HeadGroupSpec.RayStartParams, svcName, "6379", &trueFlag, RayServiceCreatorLabelValue) + podTemplateSpec := DefaultHeadPodTemplate(*cluster, cluster.Spec.HeadGroupSpec, podName, "6379") + pod := BuildPod(podTemplateSpec, rayiov1alpha1.HeadNode, cluster.Spec.HeadGroupSpec.RayStartParams, "6379", &trueFlag, RayServiceCreatorLabelValue, "") hasCorrectDeathEnv := false for _, container := range pod.Spec.Containers { @@ 
-498,7 +498,6 @@ func TestBuildPodWithAutoscalerOptions(t *testing.T) { cluster := instance.DeepCopy() cluster.Spec.EnableInTreeAutoscaling = &trueFlag podName := strings.ToLower(cluster.Name + DashSymbol + string(rayiov1alpha1.HeadNode) + DashSymbol + utils.FormatInt32(0)) - svcName := utils.GenerateServiceName(cluster.Name) customAutoscalerImage := "custom-autoscaler-xxx" customPullPolicy := v1.PullIfNotPresent @@ -543,8 +542,8 @@ func TestBuildPodWithAutoscalerOptions(t *testing.T) { EnvFrom: customEnvFrom, SecurityContext: &customSecurityContext, } - podTemplateSpec := DefaultHeadPodTemplate(*cluster, cluster.Spec.HeadGroupSpec, podName, svcName, "6379") - pod := BuildPod(podTemplateSpec, rayiov1alpha1.HeadNode, cluster.Spec.HeadGroupSpec.RayStartParams, svcName, "6379", &trueFlag, "") + podTemplateSpec := DefaultHeadPodTemplate(*cluster, cluster.Spec.HeadGroupSpec, podName, "6379") + pod := BuildPod(podTemplateSpec, rayiov1alpha1.HeadNode, cluster.Spec.HeadGroupSpec.RayStartParams, "6379", &trueFlag, "", "") expectedContainer := *autoscalerContainer.DeepCopy() expectedContainer.Image = customAutoscalerImage expectedContainer.ImagePullPolicy = customPullPolicy @@ -563,8 +562,7 @@ func TestHeadPodTemplate_WithAutoscalingEnabled(t *testing.T) { cluster := instance.DeepCopy() cluster.Spec.EnableInTreeAutoscaling = &trueFlag podName := strings.ToLower(cluster.Name + DashSymbol + string(rayiov1alpha1.HeadNode) + DashSymbol + utils.FormatInt32(0)) - svcName := utils.GenerateServiceName(cluster.Name) - podTemplateSpec := DefaultHeadPodTemplate(*cluster, cluster.Spec.HeadGroupSpec, podName, svcName, "6379") + podTemplateSpec := DefaultHeadPodTemplate(*cluster, cluster.Spec.HeadGroupSpec, podName, "6379") // autoscaler container is injected into head pod actualContainerCount := len(podTemplateSpec.Spec.Containers) @@ -582,7 +580,7 @@ func TestHeadPodTemplate_WithAutoscalingEnabled(t *testing.T) { // Repeat ServiceAccountName check with long cluster name. 
cluster.Name = longString(t) // 200 chars long - podTemplateSpec = DefaultHeadPodTemplate(*cluster, cluster.Spec.HeadGroupSpec, podName, svcName, "6379") + podTemplateSpec = DefaultHeadPodTemplate(*cluster, cluster.Spec.HeadGroupSpec, podName, "6379") actualResult = podTemplateSpec.Spec.ServiceAccountName expectedResult = shortString(t) // 50 chars long, truncated by utils.CheckName if !reflect.DeepEqual(expectedResult, actualResult) { @@ -595,8 +593,7 @@ func TestHeadPodTemplate_WithAutoscalingEnabled(t *testing.T) { func TestHeadPodTemplate_WithNoServiceAccount(t *testing.T) { cluster := instance.DeepCopy() podName := strings.ToLower(cluster.Name + DashSymbol + string(rayiov1alpha1.HeadNode) + DashSymbol + utils.FormatInt32(0)) - svcName := utils.GenerateServiceName(cluster.Name) - pod := DefaultHeadPodTemplate(*cluster, cluster.Spec.HeadGroupSpec, podName, svcName, "6379") + pod := DefaultHeadPodTemplate(*cluster, cluster.Spec.HeadGroupSpec, podName, "6379") actualResult := pod.Spec.ServiceAccountName expectedResult := "" @@ -612,8 +609,7 @@ func TestHeadPodTemplate_WithServiceAccountNoAutoscaling(t *testing.T) { serviceAccount := "head-service-account" cluster.Spec.HeadGroupSpec.Template.Spec.ServiceAccountName = serviceAccount podName := strings.ToLower(cluster.Name + DashSymbol + string(rayiov1alpha1.HeadNode) + DashSymbol + utils.FormatInt32(0)) - svcName := utils.GenerateServiceName(cluster.Name) - pod := DefaultHeadPodTemplate(*cluster, cluster.Spec.HeadGroupSpec, podName, svcName, "6379") + pod := DefaultHeadPodTemplate(*cluster, cluster.Spec.HeadGroupSpec, podName, "6379") actualResult := pod.Spec.ServiceAccountName expectedResult := serviceAccount @@ -630,8 +626,7 @@ func TestHeadPodTemplate_WithServiceAccount(t *testing.T) { cluster.Spec.HeadGroupSpec.Template.Spec.ServiceAccountName = serviceAccount cluster.Spec.EnableInTreeAutoscaling = &trueFlag podName := strings.ToLower(cluster.Name + DashSymbol + string(rayiov1alpha1.HeadNode) + DashSymbol + 
utils.FormatInt32(0)) - svcName := utils.GenerateServiceName(cluster.Name) - pod := DefaultHeadPodTemplate(*cluster, cluster.Spec.HeadGroupSpec, podName, svcName, "6379") + pod := DefaultHeadPodTemplate(*cluster, cluster.Spec.HeadGroupSpec, podName, "6379") actualResult := pod.Spec.ServiceAccountName expectedResult := serviceAccount @@ -687,9 +682,8 @@ func TestCleanupInvalidVolumeMounts(t *testing.T) { // Test head pod podName := strings.ToLower(cluster.Name + DashSymbol + string(rayiov1alpha1.HeadNode) + DashSymbol + utils.FormatInt32(0)) - svcName := utils.GenerateServiceName(cluster.Name) - podTemplateSpec := DefaultHeadPodTemplate(*cluster, cluster.Spec.HeadGroupSpec, podName, svcName, "6379") - pod := BuildPod(podTemplateSpec, rayiov1alpha1.HeadNode, cluster.Spec.HeadGroupSpec.RayStartParams, svcName, "6379", nil, "") + podTemplateSpec := DefaultHeadPodTemplate(*cluster, cluster.Spec.HeadGroupSpec, podName, "6379") + pod := BuildPod(podTemplateSpec, rayiov1alpha1.HeadNode, cluster.Spec.HeadGroupSpec.RayStartParams, "6379", nil, "", "") pod.Spec.Containers[0].VolumeMounts = append(pod.Spec.Containers[0].VolumeMounts, []v1.VolumeMount{ { @@ -710,14 +704,14 @@ func TestCleanupInvalidVolumeMounts(t *testing.T) { func TestDefaultWorkerPodTemplateWithName(t *testing.T) { cluster := instance.DeepCopy() - svcName := utils.GenerateServiceName(cluster.Name) + fqdnRayIP := utils.GenerateFQDNServiceName(cluster.Name, cluster.Namespace) worker := cluster.Spec.WorkerGroupSpecs[0] worker.Template.ObjectMeta.Name = "ray-worker-test" podName := cluster.Name + DashSymbol + string(rayiov1alpha1.WorkerNode) + DashSymbol + worker.GroupName + DashSymbol + utils.FormatInt32(0) expectedWorker := *worker.DeepCopy() // Pass a deep copy of worker (*worker.DeepCopy()) to prevent "worker" from updating. 
- podTemplateSpec := DefaultWorkerPodTemplate(*cluster, *worker.DeepCopy(), podName, svcName, "6379") + podTemplateSpec := DefaultWorkerPodTemplate(*cluster, *worker.DeepCopy(), podName, fqdnRayIP, "6379") assert.Equal(t, podTemplateSpec.ObjectMeta.Name, "") assert.Equal(t, worker, expectedWorker) } diff --git a/ray-operator/controllers/ray/raycluster_controller.go b/ray-operator/controllers/ray/raycluster_controller.go index 4dd74c6b5d..e3fe1d1721 100644 --- a/ray-operator/controllers/ray/raycluster_controller.go +++ b/ray-operator/controllers/ray/raycluster_controller.go @@ -758,15 +758,13 @@ func (r *RayClusterReconciler) createWorkerPod(instance rayiov1alpha1.RayCluster func (r *RayClusterReconciler) buildHeadPod(instance rayiov1alpha1.RayCluster) corev1.Pod { podName := strings.ToLower(instance.Name + common.DashSymbol + string(rayiov1alpha1.HeadNode) + common.DashSymbol) podName = utils.CheckName(podName) // making sure the name is valid - svcName := utils.GenerateServiceName(instance.Name) - svcName = utils.CheckName(svcName) // The Ray head port used by workers to connect to the cluster (GCS server port for Ray >= 1.11.0, Redis port for older Ray.) 
headPort := common.GetHeadPort(instance.Spec.HeadGroupSpec.RayStartParams) autoscalingEnabled := instance.Spec.EnableInTreeAutoscaling - podConf := common.DefaultHeadPodTemplate(instance, instance.Spec.HeadGroupSpec, podName, svcName, headPort) + podConf := common.DefaultHeadPodTemplate(instance, instance.Spec.HeadGroupSpec, podName, headPort) r.Log.Info("head pod labels", "labels", podConf.Labels) creatorName := getCreator(instance) - pod := common.BuildPod(podConf, rayiov1alpha1.HeadNode, instance.Spec.HeadGroupSpec.RayStartParams, svcName, headPort, autoscalingEnabled, creatorName) + pod := common.BuildPod(podConf, rayiov1alpha1.HeadNode, instance.Spec.HeadGroupSpec.RayStartParams, headPort, autoscalingEnabled, creatorName, "") // Set raycluster instance as the owner and controller if err := controllerutil.SetControllerReference(&instance, &pod, r.Scheme); err != nil { r.Log.Error(err, "Failed to set controller reference for raycluster pod") @@ -791,15 +789,14 @@ func getCreator(instance rayiov1alpha1.RayCluster) string { // Build worker instance pods. func (r *RayClusterReconciler) buildWorkerPod(instance rayiov1alpha1.RayCluster, worker rayiov1alpha1.WorkerGroupSpec) corev1.Pod { podName := strings.ToLower(instance.Name + common.DashSymbol + string(rayiov1alpha1.WorkerNode) + common.DashSymbol + worker.GroupName + common.DashSymbol) - podName = utils.CheckName(podName) // making sure the name is valid - svcName := utils.GenerateServiceName(instance.Name) - svcName = utils.CheckName(svcName) + podName = utils.CheckName(podName) // making sure the name is valid + fqdnRayIP := utils.GenerateFQDNServiceName(instance.Name, instance.Namespace) // Fully Qualified Domain Name // The Ray head port used by workers to connect to the cluster (GCS server port for Ray >= 1.11.0, Redis port for older Ray.) 
headPort := common.GetHeadPort(instance.Spec.HeadGroupSpec.RayStartParams) autoscalingEnabled := instance.Spec.EnableInTreeAutoscaling - podTemplateSpec := common.DefaultWorkerPodTemplate(instance, worker, podName, svcName, headPort) + podTemplateSpec := common.DefaultWorkerPodTemplate(instance, worker, podName, fqdnRayIP, headPort) creatorName := getCreator(instance) - pod := common.BuildPod(podTemplateSpec, rayiov1alpha1.WorkerNode, worker.RayStartParams, svcName, headPort, autoscalingEnabled, creatorName) + pod := common.BuildPod(podTemplateSpec, rayiov1alpha1.WorkerNode, worker.RayStartParams, headPort, autoscalingEnabled, creatorName, fqdnRayIP) // Set raycluster instance as the owner and controller if err := controllerutil.SetControllerReference(&instance, &pod, r.Scheme); err != nil { r.Log.Error(err, "Failed to set controller reference for raycluster pod") diff --git a/ray-operator/controllers/ray/utils/dashboard_httpclient.go b/ray-operator/controllers/ray/utils/dashboard_httpclient.go index f043c93d47..a2503d3611 100644 --- a/ray-operator/controllers/ray/utils/dashboard_httpclient.go +++ b/ray-operator/controllers/ray/utils/dashboard_httpclient.go @@ -130,7 +130,7 @@ func FetchDashboardAgentURL(ctx context.Context, log *logr.Logger, cli client.Cl func FetchDashboardURL(ctx context.Context, log *logr.Logger, cli client.Client, rayCluster *rayv1alpha1.RayCluster) (string, error) { headSvc := &corev1.Service{} - headSvcName := CheckName(GenerateServiceName(rayCluster.Name)) + headSvcName := GenerateServiceName(rayCluster.Name) if err := cli.Get(ctx, client.ObjectKey{Name: headSvcName, Namespace: rayCluster.Namespace}, headSvc); err != nil { return "", err } diff --git a/ray-operator/controllers/ray/utils/util.go b/ray-operator/controllers/ray/utils/util.go index 2a2a30d15f..9e8c0e1121 100644 --- a/ray-operator/controllers/ray/utils/util.go +++ b/ray-operator/controllers/ray/utils/util.go @@ -118,7 +118,18 @@ func GetNamespace(metaData metav1.ObjectMeta) 
string { // GenerateServiceName generates a ray head service name from cluster name func GenerateServiceName(clusterName string) string { - return fmt.Sprintf("%s-%s-%s", clusterName, rayiov1alpha1.HeadNode, "svc") + return CheckName(fmt.Sprintf("%s-%s-%s", clusterName, rayiov1alpha1.HeadNode, "svc")) +} + +// GenerateFQDNServiceName generates a Fully Qualified Domain Name. +func GenerateFQDNServiceName(clusterName string, namespace string) string { + return fmt.Sprintf("%s.%s.svc.cluster.local", GenerateServiceName(clusterName), namespace) +} + +// ExtractRayIPFromFQDN extracts the head service name (i.e., RAY_IP, deprecated) from a fully qualified +// domain name (FQDN). This function is provided for backward compatibility purposes only. +func ExtractRayIPFromFQDN(fqdnRayIP string) string { + return strings.Split(fqdnRayIP, ".")[0] } // GenerateDashboardServiceName generates a ray head service name from cluster name diff --git a/tests/config/ray-cluster.mini.yaml.template b/tests/config/ray-cluster.mini.yaml.template index 6f45612d53..d915145b35 100644 --- a/tests/config/ray-cluster.mini.yaml.template +++ b/tests/config/ray-cluster.mini.yaml.template @@ -60,7 +60,7 @@ spec: - name: init image: busybox:1.28 # Change the cluster postfix if you don't have a default setting - command: ['sh', '-c', "until nslookup $$RAY_IP.$$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for K8s Service $$RAY_IP; sleep 2; done"] + command: ['sh', '-c', "until nslookup $$FQ_RAY_IP; do echo waiting for K8s Service $$FQ_RAY_IP; sleep 2; done"] containers: - name: ray-worker image: $ray_image diff --git a/tests/config/ray-cluster.ray-ft.yaml.template b/tests/config/ray-cluster.ray-ft.yaml.template index 34fd95a57a..6a7179ddf4 100644 --- a/tests/config/ray-cluster.ray-ft.yaml.template +++ b/tests/config/ray-cluster.ray-ft.yaml.template @@ -142,7 +142,7 @@ spec: initContainers: # to avoid worker crashing before head service is created - 
name: init image: busybox:1.28 - command: ['sh', '-c', "until nslookup $$RAY_IP.$$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for K8s Service $$RAY_IP; sleep 2; done"] + command: ['sh', '-c', "until nslookup $$FQ_RAY_IP; do echo waiting for K8s Service $$FQ_RAY_IP; sleep 2; done"] containers: - name: ray-worker # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc' image: $ray_image diff --git a/tests/config/ray-service.yaml.template b/tests/config/ray-service.yaml.template index 6896bc0a2e..50e21ebb50 100644 --- a/tests/config/ray-service.yaml.template +++ b/tests/config/ray-service.yaml.template @@ -100,7 +100,7 @@ spec: initContainers: - name: init image: busybox:1.28 - command: ['sh', '-c', "until nslookup $$RAY_IP.$$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for K8s Service $$RAY_IP; sleep 2; done"] + command: ['sh', '-c', "until nslookup $$FQ_RAY_IP; do echo waiting for K8s Service $$FQ_RAY_IP; sleep 2; done"] containers: - name: ray-worker image: $ray_image