From 139bc5bbd3726382e0ab0666cf9796f141dbe989 Mon Sep 17 00:00:00 2001 From: kaihsun Date: Fri, 17 Mar 2023 01:37:00 +0000 Subject: [PATCH 1/2] update --- ray-operator/controllers/ray/common/pod.go | 12 ++++++++++++ ray-operator/controllers/ray/common/pod_test.go | 14 ++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/ray-operator/controllers/ray/common/pod.go b/ray-operator/controllers/ray/common/pod.go index 1940b67764..433ea245dc 100644 --- a/ray-operator/controllers/ray/common/pod.go +++ b/ray-operator/controllers/ray/common/pod.go @@ -189,6 +189,18 @@ func DefaultWorkerPodTemplate(instance rayiov1alpha1.RayCluster, workerSpec rayi log.Info("Setting pod namespaces", "namespace", instance.Namespace) } + // The Ray worker should only start once the GCS server is ready. + initContainer := v1.Container{ + Name: "wait-gcs-ready", + Image: podTemplate.Spec.Containers[0].Image, + ImagePullPolicy: v1.PullIfNotPresent, + Command: []string{"/bin/bash", "-lc", "--"}, + Args: []string{ + fmt.Sprintf("until ray health-check --address %s:%s > /dev/null 2>&1; do echo wait for GCS to be ready; sleep 5; done", fqdnRayIP, headPort), + }, + } + podTemplate.Spec.InitContainers = append(podTemplate.Spec.InitContainers, initContainer) + // If the replica of workers is more than 1, `ObjectMeta.Name` may cause name conflict errors. // Hence, we set `ObjectMeta.Name` to an empty string, and use GenerateName to prevent name conflicts. podTemplate.ObjectMeta.Name = "" diff --git a/ray-operator/controllers/ray/common/pod_test.go b/ray-operator/controllers/ray/common/pod_test.go index a5d95a3d38..68e0a1e897 100644 --- a/ray-operator/controllers/ray/common/pod_test.go +++ b/ray-operator/controllers/ray/common/pod_test.go @@ -800,3 +800,17 @@ func TestDefaultWorkerPodTemplateWithConfigurablePorts(t *testing.T) { t.Fatal(err) } } + +func TestDefaultInitContainer(t *testing.T) { + // A default init container to check the health of GCS is expected to be added. + cluster := instance.DeepCopy() + fqdnRayIP := utils.GenerateFQDNServiceName(cluster.Name, cluster.Namespace) + worker := cluster.Spec.WorkerGroupSpecs[0] + podName := cluster.Name + DashSymbol + string(rayiov1alpha1.WorkerNode) + DashSymbol + worker.GroupName + DashSymbol + utils.FormatInt32(0) + expectedResult := len(cluster.Spec.WorkerGroupSpecs[0].Template.Spec.InitContainers) + 1 + + // Pass a deep copy of worker (*worker.DeepCopy()) to prevent "worker" from updating. + podTemplateSpec := DefaultWorkerPodTemplate(*cluster, *worker.DeepCopy(), podName, fqdnRayIP, "6379") + actualResult := len(podTemplateSpec.Spec.InitContainers) + assert.Equal(t, expectedResult, actualResult, "A default init container is expected to be added.") +} From a45997665dcbf96cb42e24e89941cd383e1ba75e Mon Sep 17 00:00:00 2001 From: kaihsun Date: Sat, 18 Mar 2023 18:14:20 +0000 Subject: [PATCH 2/2] update --- ray-operator/controllers/ray/common/pod.go | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/ray-operator/controllers/ray/common/pod.go b/ray-operator/controllers/ray/common/pod.go index 433ea245dc..36cb625748 100644 --- a/ray-operator/controllers/ray/common/pod.go +++ b/ray-operator/controllers/ray/common/pod.go @@ -190,14 +190,16 @@ func DefaultWorkerPodTemplate(instance rayiov1alpha1.RayCluster, workerSpec rayi } // The Ray worker should only start once the GCS server is ready. + rayContainerIndex := getRayContainerIndex(podTemplate.Spec) initContainer := v1.Container{ Name: "wait-gcs-ready", - Image: podTemplate.Spec.Containers[0].Image, + Image: podTemplate.Spec.Containers[rayContainerIndex].Image, ImagePullPolicy: v1.PullIfNotPresent, Command: []string{"/bin/bash", "-lc", "--"}, Args: []string{ fmt.Sprintf("until ray health-check --address %s:%s > /dev/null 2>&1; do echo wait for GCS to be ready; sleep 5; done", fqdnRayIP, headPort), }, + SecurityContext: podTemplate.Spec.Containers[rayContainerIndex].SecurityContext.DeepCopy(), } podTemplate.Spec.InitContainers = append(podTemplate.Spec.InitContainers, initContainer) @@ -214,7 +216,7 @@ func DefaultWorkerPodTemplate(instance rayiov1alpha1.RayCluster, workerSpec rayi initTemplateAnnotations(instance, &podTemplate) isMetricsPortExists := false - for _, port := range podTemplate.Spec.Containers[0].Ports { + for _, port := range podTemplate.Spec.Containers[rayContainerIndex].Ports { if port.Name == DefaultMetricsName { isMetricsPortExists = true break @@ -226,7 +228,7 @@ func DefaultWorkerPodTemplate(instance rayiov1alpha1.RayCluster, workerSpec rayi Name: DefaultMetricsName, ContainerPort: int32(DefaultMetricsPort), } - podTemplate.Spec.Containers[0].Ports = append(podTemplate.Spec.Containers[0].Ports, metricsPort) + podTemplate.Spec.Containers[rayContainerIndex].Ports = append(podTemplate.Spec.Containers[rayContainerIndex].Ports, metricsPort) } return podTemplate