From c54c3d9c470337e04a653fcb93b3bf8098bd3f2b Mon Sep 17 00:00:00 2001 From: Kai-Hsun Chen Date: Wed, 29 Nov 2023 11:13:27 -0800 Subject: [PATCH] [RayService][Health-Check][8/n] Add readiness / liveness probes (#1674) --- .github/workflows/test-job.yaml | 20 ------ helm-chart/kuberay-operator/values.yaml | 5 ++ .../controllers/ray/common/constant.go | 6 ++ ray-operator/controllers/ray/common/pod.go | 64 +++++++++++-------- .../controllers/ray/common/pod_test.go | 26 ++++++++ tests/compatibility-test.py | 19 ++++++ tests/config/ray-cluster.mini.yaml.template | 4 -- 7 files changed, 92 insertions(+), 52 deletions(-) diff --git a/.github/workflows/test-job.yaml b/.github/workflows/test-job.yaml index 0d82d1f80a..e58f612f8c 100644 --- a/.github/workflows/test-job.yaml +++ b/.github/workflows/test-job.yaml @@ -287,26 +287,6 @@ jobs: docker push quay.io/kuberay/operator:nightly if: contains(fromJson('["refs/heads/master"]'), github.ref) - test-compatibility-1_13_0: - needs: - - build_operator - - build_apiserver - - lint - runs-on: ubuntu-latest - name: Compatibility Test - 1.13.0 - steps: - - name: Check out code into the Go module directory - uses: actions/checkout@v2 - with: - # When checking out the repository that - # triggered a workflow, this defaults to the reference or SHA for that event. - # Default value should work for both pull_request and merge(push) event. - ref: ${{github.event.pull_request.head.sha}} - - - uses: ./.github/workflows/actions/compatibility - with: - ray_version: 1.13.0 - test-compatibility-2_5_0: needs: - build_operator diff --git a/helm-chart/kuberay-operator/values.yaml b/helm-chart/kuberay-operator/values.yaml index 87255fb3c7..e668ade001 100644 --- a/helm-chart/kuberay-operator/values.yaml +++ b/helm-chart/kuberay-operator/values.yaml @@ -105,3 +105,8 @@ env: # Therefore, KubeRay offers ENABLE_ZERO_DOWNTIME as a feature flag for zero-downtime upgrades. # - name: ENABLE_ZERO_DOWNTIME # value: "true" +# This environment variable for the KubeRay operator is used to determine whether to enable +# the injection of readiness and liveness probes into Ray head and worker containers. +# Enabling this feature contributes to the robustness of Ray clusters. +# - name: ENABLE_PROBES_INJECTION +# value: "true" diff --git a/ray-operator/controllers/ray/common/constant.go b/ray-operator/controllers/ray/common/constant.go index 21df7783e3..1ce771aec0 100644 --- a/ray-operator/controllers/ray/common/constant.go +++ b/ray-operator/controllers/ray/common/constant.go @@ -99,6 +99,12 @@ const ( // cleanup Job should be enabled. This is a feature flag for v1.0.0. ENABLE_GCS_FT_REDIS_CLEANUP = "ENABLE_GCS_FT_REDIS_CLEANUP" + // This environment variable for the KubeRay operator is used to determine whether to enable + // the injection of readiness and liveness probes into Ray head and worker containers. + // Enabling this feature contributes to the robustness of Ray clusters. It is currently a feature + // flag for v1.1.0 and will be removed if the behavior proves to be stable enough. + ENABLE_PROBES_INJECTION = "ENABLE_PROBES_INJECTION" + // Ray core default configurations DefaultWorkerRayGcsReconnectTimeoutS = "600" diff --git a/ray-operator/controllers/ray/common/pod.go b/ray-operator/controllers/ray/common/pod.go index 99e7fb86d5..8d9adba6fb 100644 --- a/ray-operator/controllers/ray/common/pod.go +++ b/ray-operator/controllers/ray/common/pod.go @@ -137,6 +137,13 @@ func getEnableInitContainerInjection() bool { return true } +func getEnableProbesInjection() bool { + if s := os.Getenv(ENABLE_PROBES_INJECTION); strings.ToLower(s) == "false" { + return false + } + return true +} + // DefaultWorkerPodTemplate sets the config values func DefaultWorkerPodTemplate(instance rayv1.RayCluster, workerSpec rayv1.WorkerGroupSpec, podName string, fqdnRayIP string, headPort string) v1.PodTemplateSpec { podTemplate := workerSpec.Template @@ -323,36 +330,37 @@ func BuildPod(podTemplateSpec v1.PodTemplateSpec, rayNodeType rayv1.RayNodeType, setContainerEnvVars(&pod, rayNodeType, rayStartParams, fqdnRayIP, headPort, creator) - // health check only if FT enabled - if podTemplateSpec.Annotations != nil { - if enabledString, ok := podTemplateSpec.Annotations[RayFTEnabledAnnotationKey]; ok { - if strings.ToLower(enabledString) == "true" { - // If users do not specify probes, we will set the default probes. - if pod.Spec.Containers[RayContainerIndex].ReadinessProbe == nil { - probe := &v1.Probe{ - InitialDelaySeconds: DefaultReadinessProbeInitialDelaySeconds, - TimeoutSeconds: DefaultReadinessProbeTimeoutSeconds, - PeriodSeconds: DefaultReadinessProbePeriodSeconds, - SuccessThreshold: DefaultReadinessProbeSuccessThreshold, - FailureThreshold: DefaultReadinessProbeFailureThreshold, - } - pod.Spec.Containers[RayContainerIndex].ReadinessProbe = probe - } - initHealthProbe(pod.Spec.Containers[RayContainerIndex].ReadinessProbe, rayNodeType) - - if pod.Spec.Containers[RayContainerIndex].LivenessProbe == nil { - probe := &v1.Probe{ - InitialDelaySeconds: DefaultLivenessProbeInitialDelaySeconds, - TimeoutSeconds: DefaultLivenessProbeTimeoutSeconds, - PeriodSeconds: DefaultLivenessProbePeriodSeconds, - SuccessThreshold: DefaultLivenessProbeSuccessThreshold, - FailureThreshold: DefaultLivenessProbeFailureThreshold, - } - pod.Spec.Containers[RayContainerIndex].LivenessProbe = probe - } - initHealthProbe(pod.Spec.Containers[RayContainerIndex].LivenessProbe, rayNodeType) + // Inject probes into the Ray containers if the user has not explicitly disabled them. + // The feature flag `ENABLE_PROBES_INJECTION` will be removed if this feature is stable enough. + enableProbesInjection := getEnableProbesInjection() + log.Info("Probes injection feature flag", "enabled", enableProbesInjection) + if enableProbesInjection { + // Configure the readiness and liveness probes for the Ray container. These probes + // play a crucial role in KubeRay health checks. Without them, certain failures, + // such as the Raylet process crashing, may go undetected. + if pod.Spec.Containers[RayContainerIndex].ReadinessProbe == nil { + probe := &v1.Probe{ + InitialDelaySeconds: DefaultReadinessProbeInitialDelaySeconds, + TimeoutSeconds: DefaultReadinessProbeTimeoutSeconds, + PeriodSeconds: DefaultReadinessProbePeriodSeconds, + SuccessThreshold: DefaultReadinessProbeSuccessThreshold, + FailureThreshold: DefaultReadinessProbeFailureThreshold, + } + pod.Spec.Containers[RayContainerIndex].ReadinessProbe = probe + } + initHealthProbe(pod.Spec.Containers[RayContainerIndex].ReadinessProbe, rayNodeType) + + if pod.Spec.Containers[RayContainerIndex].LivenessProbe == nil { + probe := &v1.Probe{ + InitialDelaySeconds: DefaultLivenessProbeInitialDelaySeconds, + TimeoutSeconds: DefaultLivenessProbeTimeoutSeconds, + PeriodSeconds: DefaultLivenessProbePeriodSeconds, + SuccessThreshold: DefaultLivenessProbeSuccessThreshold, + FailureThreshold: DefaultLivenessProbeFailureThreshold, } + pod.Spec.Containers[RayContainerIndex].LivenessProbe = probe } + initHealthProbe(pod.Spec.Containers[RayContainerIndex].LivenessProbe, rayNodeType) } return pod diff --git a/ray-operator/controllers/ray/common/pod_test.go b/ray-operator/controllers/ray/common/pod_test.go index c871524dcc..5c049d7dd2 100644 --- a/ray-operator/controllers/ray/common/pod_test.go +++ b/ray-operator/controllers/ray/common/pod_test.go @@ -1137,6 +1137,32 @@ func TestGetCustomWorkerInitImage(t *testing.T) { assert.False(t, b) } +func TestGetEnableProbesInjection(t *testing.T) { + // cleanup + defer os.Unsetenv(ENABLE_PROBES_INJECTION) + + // not set the env + os.Unsetenv(ENABLE_PROBES_INJECTION) + b := getEnableProbesInjection() + assert.True(t, b) + // set the env with "true" + os.Setenv(ENABLE_PROBES_INJECTION, "true") + b = getEnableProbesInjection() + assert.True(t, b) + // set the env with "True" + os.Setenv(ENABLE_PROBES_INJECTION, "True") + b = getEnableProbesInjection() + assert.True(t, b) + // set the env with "false" + os.Setenv(ENABLE_PROBES_INJECTION, "false") + b = getEnableProbesInjection() + assert.False(t, b) + // set the env with "False" + os.Setenv(ENABLE_PROBES_INJECTION, "False") + b = getEnableProbesInjection() + assert.False(t, b) +} + func TestInitHealthProbe(t *testing.T) { // Test 1: User defines a custom HTTPGet probe. httpGetProbe := v1.Probe{ diff --git a/tests/compatibility-test.py b/tests/compatibility-test.py index df90eb237c..8c8590141d 100755 --- a/tests/compatibility-test.py +++ b/tests/compatibility-test.py @@ -13,6 +13,7 @@ from framework.utils import ( get_head_pod, + get_pod, pod_exec_command, shell_subprocess_run, CONST, @@ -58,6 +59,24 @@ def test_cluster_info(self): """Execute "print(ray.cluster_resources())" in the head Pod.""" EasyJobRule().assert_rule() + def test_probe_injection(self): + """ + Check whether the readiness and liveness probes are injected into the Ray container. + """ + def is_probe_injected(pod): + probes = [ + pod.spec.containers[0].readiness_probe, + pod.spec.containers[0].liveness_probe + ] + for probe in probes: + if probe is None: + return False + return True + headpod = get_head_pod(BasicRayTestCase.ray_cluster_ns) + assert is_probe_injected(headpod) + # TODO (kevin85421): We only check 1 worker Pod here. + worker_pod = get_pod(BasicRayTestCase.ray_cluster_ns, "ray.io/node-type=worker") + assert is_probe_injected(worker_pod) class RayFTTestCase(unittest.TestCase): """Test Ray GCS Fault Tolerance""" diff --git a/tests/config/ray-cluster.mini.yaml.template b/tests/config/ray-cluster.mini.yaml.template index 79b0f0a982..31507921a1 100644 --- a/tests/config/ray-cluster.mini.yaml.template +++ b/tests/config/ray-cluster.mini.yaml.template @@ -1,9 +1,6 @@ apiVersion: ray.io/v1 kind: RayCluster metadata: - labels: - controller-tools.k8s.io: "1.0" - # An unique identifier for the head node and workers of this cluster. name: raycluster-mini spec: rayVersion: '$ray_version' # should match the Ray version in the image of the containers @@ -12,7 +9,6 @@ spec: headGroupSpec: # the following params are used to complete the ray start: ray start --head --block ... rayStartParams: - dashboard-host: '0.0.0.0' num-cpus: '1' #pod template template: