diff --git a/ray-operator/controllers/ray/common/pod.go b/ray-operator/controllers/ray/common/pod.go index 6cce6a2a9d..24bfccc254 100644 --- a/ray-operator/controllers/ray/common/pod.go +++ b/ray-operator/controllers/ray/common/pod.go @@ -18,6 +18,7 @@ import ( corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" ctrl "sigs.k8s.io/controller-runtime" ) @@ -248,72 +249,40 @@ func DefaultWorkerPodTemplate(ctx context.Context, instance rayv1.RayCluster, wo } func initLivenessAndReadinessProbe(rayContainer *corev1.Container, rayNodeType rayv1.RayNodeType, creatorCRDType utils.CRDType) { - rayAgentRayletHealthCommand := fmt.Sprintf( - utils.BaseWgetHealthCommand, - utils.DefaultReadinessProbeTimeoutSeconds, - utils.DefaultDashboardAgentListenPort, - utils.RayAgentRayletHealthPath, - ) - rayDashboardGCSHealthCommand := fmt.Sprintf( - utils.BaseWgetHealthCommand, - utils.DefaultReadinessProbeFailureThreshold, - utils.DefaultDashboardPort, - utils.RayDashboardGCSHealthPath, - ) - - // Generally, the liveness and readiness probes perform the same checks. - // For head node => Check GCS and Raylet status. - // For worker node => Check Raylet status. - commands := []string{} + healthCheckPath := utils.RayAgentRayletHealthPath + healthCheckPort := intstr.FromInt(utils.DefaultDashboardAgentListenPort) if rayNodeType == rayv1.HeadNode { - commands = append(commands, rayAgentRayletHealthCommand, rayDashboardGCSHealthCommand) - } else { - commands = append(commands, rayAgentRayletHealthCommand) + healthCheckPath = utils.RayDashboardGCSHealthPath + healthCheckPort = intstr.FromInt(utils.DefaultDashboardPort) } if rayContainer.LivenessProbe == nil { - probeTimeout := int32(utils.DefaultLivenessProbeTimeoutSeconds) - if rayNodeType == rayv1.HeadNode { - probeTimeout = int32(utils.DefaultHeadLivenessProbeTimeoutSeconds) - } - rayContainer.LivenessProbe = &corev1.Probe{ InitialDelaySeconds: utils.DefaultLivenessProbeInitialDelaySeconds, - TimeoutSeconds: probeTimeout, + TimeoutSeconds: utils.DefaultLivenessProbeTimeoutSeconds, PeriodSeconds: utils.DefaultLivenessProbePeriodSeconds, SuccessThreshold: utils.DefaultLivenessProbeSuccessThreshold, FailureThreshold: utils.DefaultLivenessProbeFailureThreshold, } - rayContainer.LivenessProbe.Exec = &corev1.ExecAction{Command: []string{"bash", "-c", strings.Join(commands, " && ")}} + rayContainer.LivenessProbe.HTTPGet = &corev1.HTTPGetAction{Path: healthCheckPath, Port: healthCheckPort} } if rayContainer.ReadinessProbe == nil { - probeTimeout := int32(utils.DefaultReadinessProbeTimeoutSeconds) - if rayNodeType == rayv1.HeadNode { - probeTimeout = int32(utils.DefaultHeadReadinessProbeTimeoutSeconds) - } rayContainer.ReadinessProbe = &corev1.Probe{ InitialDelaySeconds: utils.DefaultReadinessProbeInitialDelaySeconds, - TimeoutSeconds: probeTimeout, + TimeoutSeconds: utils.DefaultReadinessProbeTimeoutSeconds, PeriodSeconds: utils.DefaultReadinessProbePeriodSeconds, SuccessThreshold: utils.DefaultReadinessProbeSuccessThreshold, FailureThreshold: utils.DefaultReadinessProbeFailureThreshold, } - rayContainer.ReadinessProbe.Exec = &corev1.ExecAction{Command: []string{"bash", "-c", strings.Join(commands, " && ")}} + rayContainer.ReadinessProbe.HTTPGet = &corev1.HTTPGetAction{Path: healthCheckPath, Port: healthCheckPort} // For worker Pods serving traffic, we need to add an additional HTTP proxy health check for the readiness probe. // Note: head Pod checks the HTTP proxy's health at every rayservice controller reconcile instaed of using readiness probe. // See https://github.com/ray-project/kuberay/pull/1808 for reasons. if creatorCRDType == utils.RayServiceCRD && rayNodeType == rayv1.WorkerNode { rayContainer.ReadinessProbe.FailureThreshold = utils.ServeReadinessProbeFailureThreshold - rayServeProxyHealthCommand := fmt.Sprintf( - utils.BaseWgetHealthCommand, - utils.DefaultReadinessProbeInitialDelaySeconds, - utils.FindContainerPort(rayContainer, utils.ServingPortName, utils.DefaultServingPort), - utils.RayServeProxyHealthPath, - ) - commands = append(commands, rayServeProxyHealthCommand) - rayContainer.ReadinessProbe.Exec = &corev1.ExecAction{Command: []string{"bash", "-c", strings.Join(commands, " && ")}} + rayContainer.ReadinessProbe.HTTPGet = &corev1.HTTPGetAction{Path: utils.RayServeProxyHealthPath, Port: intstr.FromInt(utils.DefaultServingPort)} } } } diff --git a/ray-operator/controllers/ray/common/pod_test.go b/ray-operator/controllers/ray/common/pod_test.go index b9890f28b6..3b86897460 100644 --- a/ray-operator/controllers/ray/common/pod_test.go +++ b/ray-operator/controllers/ray/common/pod_test.go @@ -1167,35 +1167,33 @@ func TestInitLivenessAndReadinessProbe(t *testing.T) { podTemplateSpec := DefaultHeadPodTemplate(context.Background(), *cluster, cluster.Spec.HeadGroupSpec, podName, "6379") rayContainer := &podTemplateSpec.Spec.Containers[utils.RayContainerIndex] - // Test 1: User defines a custom HTTPGet probe. - httpGetProbe := corev1.Probe{ + // Test 1: User defines a custom Exec probe to override default HTTP probe. + execProbe := corev1.Probe{ ProbeHandler: corev1.ProbeHandler{ - HTTPGet: &corev1.HTTPGetAction{ - // Check Raylet status - Path: fmt.Sprintf("/%s", utils.RayAgentRayletHealthPath), - Port: intstr.FromInt(utils.DefaultDashboardAgentListenPort), + Exec: &corev1.ExecAction{ + Command: []string{"foo", "bar"}, }, }, } - rayContainer.LivenessProbe = &httpGetProbe - rayContainer.ReadinessProbe = &httpGetProbe + rayContainer.LivenessProbe = &execProbe + rayContainer.ReadinessProbe = &execProbe initLivenessAndReadinessProbe(rayContainer, rayv1.HeadNode, "") - assert.NotNil(t, rayContainer.LivenessProbe.HTTPGet) - assert.NotNil(t, rayContainer.ReadinessProbe.HTTPGet) - assert.Nil(t, rayContainer.LivenessProbe.Exec) - assert.Nil(t, rayContainer.ReadinessProbe.Exec) + assert.NotNil(t, rayContainer.LivenessProbe.Exec) + assert.NotNil(t, rayContainer.ReadinessProbe.Exec) + assert.Nil(t, rayContainer.LivenessProbe.HTTPGet) + assert.Nil(t, rayContainer.ReadinessProbe.HTTPGet) - // Test 2: User does not define a custom probe. KubeRay will inject Exec probe for worker pod. + // Test 2: User does not define a custom probe. KubeRay will inject HTTP probe for worker pod. // Here we test the case where the Ray Pod originates from RayServiceCRD, // implying that an additional serve health check will be added to the readiness probe. rayContainer.LivenessProbe = nil rayContainer.ReadinessProbe = nil initLivenessAndReadinessProbe(rayContainer, rayv1.WorkerNode, utils.RayServiceCRD) - assert.NotNil(t, rayContainer.LivenessProbe.Exec) - assert.NotNil(t, rayContainer.ReadinessProbe.Exec) - assert.False(t, strings.Contains(strings.Join(rayContainer.LivenessProbe.Exec.Command, " "), utils.RayServeProxyHealthPath)) - assert.True(t, strings.Contains(strings.Join(rayContainer.ReadinessProbe.Exec.Command, " "), utils.RayServeProxyHealthPath)) + assert.NotNil(t, rayContainer.LivenessProbe.HTTPGet) + assert.NotNil(t, rayContainer.ReadinessProbe.HTTPGet) + assert.Equal(t, rayContainer.ReadinessProbe.HTTPGet.Path, utils.RayServeProxyHealthPath) + assert.Equal(t, rayContainer.ReadinessProbe.HTTPGet.Port, intstr.FromInt(utils.DefaultServingPort)) assert.Equal(t, int32(2), rayContainer.LivenessProbe.TimeoutSeconds) assert.Equal(t, int32(2), rayContainer.ReadinessProbe.TimeoutSeconds) @@ -1205,13 +1203,34 @@ func TestInitLivenessAndReadinessProbe(t *testing.T) { rayContainer.LivenessProbe = nil rayContainer.ReadinessProbe = nil initLivenessAndReadinessProbe(rayContainer, rayv1.HeadNode, utils.RayServiceCRD) - assert.NotNil(t, rayContainer.LivenessProbe.Exec) - assert.NotNil(t, rayContainer.ReadinessProbe.Exec) - // head pod should not have Ray Serve proxy health probes - assert.False(t, strings.Contains(strings.Join(rayContainer.LivenessProbe.Exec.Command, " "), utils.RayServeProxyHealthPath)) - assert.False(t, strings.Contains(strings.Join(rayContainer.ReadinessProbe.Exec.Command, " "), utils.RayServeProxyHealthPath)) - assert.Equal(t, int32(5), rayContainer.LivenessProbe.TimeoutSeconds) - assert.Equal(t, int32(5), rayContainer.ReadinessProbe.TimeoutSeconds) + assert.NotNil(t, rayContainer.LivenessProbe.HTTPGet) + assert.NotNil(t, rayContainer.ReadinessProbe.HTTPGet) + assert.Equal(t, rayContainer.ReadinessProbe.HTTPGet.Path, utils.RayDashboardGCSHealthPath) + assert.Equal(t, rayContainer.ReadinessProbe.HTTPGet.Port, intstr.FromInt(utils.DefaultDashboardPort)) + assert.Equal(t, int32(2), rayContainer.LivenessProbe.TimeoutSeconds) + assert.Equal(t, int32(2), rayContainer.ReadinessProbe.TimeoutSeconds) + + // Test 4: User does not define custom probe. Pod is a worker Pod for a RayJob + rayContainer.LivenessProbe = nil + rayContainer.ReadinessProbe = nil + initLivenessAndReadinessProbe(rayContainer, rayv1.WorkerNode, utils.RayJobCRD) + assert.NotNil(t, rayContainer.LivenessProbe.HTTPGet) + assert.NotNil(t, rayContainer.ReadinessProbe.HTTPGet) + assert.Equal(t, rayContainer.ReadinessProbe.HTTPGet.Path, utils.RayAgentRayletHealthPath) + assert.Equal(t, rayContainer.ReadinessProbe.HTTPGet.Port, intstr.FromInt(utils.DefaultDashboardAgentListenPort)) + assert.Equal(t, int32(2), rayContainer.LivenessProbe.TimeoutSeconds) + assert.Equal(t, int32(2), rayContainer.ReadinessProbe.TimeoutSeconds) + + // Test 5: User does not define custom probe. Pod is a head Pod for a RayJob + rayContainer.LivenessProbe = nil + rayContainer.ReadinessProbe = nil + initLivenessAndReadinessProbe(rayContainer, rayv1.HeadNode, utils.RayJobCRD) + assert.NotNil(t, rayContainer.LivenessProbe.HTTPGet) + assert.NotNil(t, rayContainer.ReadinessProbe.HTTPGet) + assert.Equal(t, rayContainer.ReadinessProbe.HTTPGet.Path, utils.RayDashboardGCSHealthPath) + assert.Equal(t, rayContainer.ReadinessProbe.HTTPGet.Port, intstr.FromInt(utils.DefaultDashboardPort)) + assert.Equal(t, int32(2), rayContainer.LivenessProbe.TimeoutSeconds) + assert.Equal(t, int32(2), rayContainer.ReadinessProbe.TimeoutSeconds) } func TestGenerateRayStartCommand(t *testing.T) { diff --git a/ray-operator/controllers/ray/utils/constant.go b/ray-operator/controllers/ray/utils/constant.go index ab4df1d280..2ca52dfc89 100644 --- a/ray-operator/controllers/ray/utils/constant.go +++ b/ray-operator/controllers/ray/utils/constant.go @@ -151,21 +151,17 @@ const ( // Ray FT default readiness probe values DefaultReadinessProbeInitialDelaySeconds = 10 DefaultReadinessProbeTimeoutSeconds = 2 - // Probe timeout for Head pod needs to be longer as it queries two endpoints (api/local_raylet_healthz & api/gcs_healthz) - DefaultHeadReadinessProbeTimeoutSeconds = 5 - DefaultReadinessProbePeriodSeconds = 5 - DefaultReadinessProbeSuccessThreshold = 1 - DefaultReadinessProbeFailureThreshold = 10 - ServeReadinessProbeFailureThreshold = 1 + DefaultReadinessProbePeriodSeconds = 5 + DefaultReadinessProbeSuccessThreshold = 1 + DefaultReadinessProbeFailureThreshold = 10 + ServeReadinessProbeFailureThreshold = 1 // Ray FT default liveness probe values DefaultLivenessProbeInitialDelaySeconds = 30 DefaultLivenessProbeTimeoutSeconds = 2 - // Probe timeout for Head pod needs to be longer as it queries two endpoints (api/local_raylet_healthz & api/gcs_healthz) - DefaultHeadLivenessProbeTimeoutSeconds = 5 - DefaultLivenessProbePeriodSeconds = 5 - DefaultLivenessProbeSuccessThreshold = 1 - DefaultLivenessProbeFailureThreshold = 120 + DefaultLivenessProbePeriodSeconds = 5 + DefaultLivenessProbeSuccessThreshold = 1 + DefaultLivenessProbeFailureThreshold = 120 // Ray health check related configurations // Note: Since the Raylet process and the dashboard agent process are fate-sharing,