Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[RayService][Health-Check][8/n] Add readiness / liveness probes #1674

Merged
merged 4 commits into from
Nov 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 0 additions & 20 deletions .github/workflows/test-job.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -287,26 +287,6 @@ jobs:
docker push quay.io/kuberay/operator:nightly
if: contains(fromJson('["refs/heads/master"]'), github.ref)

test-compatibility-1_13_0:
needs:
- build_operator
- build_apiserver
- lint
runs-on: ubuntu-latest
name: Compatibility Test - 1.13.0
steps:
- name: Check out code into the Go module directory
uses: actions/checkout@v2
with:
# When checking out the repository that
# triggered a workflow, this defaults to the reference or SHA for that event.
# Default value should work for both pull_request and merge(push) event.
ref: ${{github.event.pull_request.head.sha}}

- uses: ./.github/workflows/actions/compatibility
with:
ray_version: 1.13.0

test-compatibility-2_5_0:
needs:
- build_operator
Expand Down
5 changes: 5 additions & 0 deletions helm-chart/kuberay-operator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -105,3 +105,8 @@ env:
# Therefore, KubeRay offers ENABLE_ZERO_DOWNTIME as a feature flag for zero-downtime upgrades.
# - name: ENABLE_ZERO_DOWNTIME
# value: "true"
# This environment variable for the KubeRay operator is used to determine whether to enable
# the injection of readiness and liveness probes into Ray head and worker containers.
# Enabling this feature contributes to the robustness of Ray clusters.
# - name: ENABLE_PROBES_INJECTION
# value: "true"
6 changes: 6 additions & 0 deletions ray-operator/controllers/ray/common/constant.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,12 @@ const (
// cleanup Job should be enabled. This is a feature flag for v1.0.0.
ENABLE_GCS_FT_REDIS_CLEANUP = "ENABLE_GCS_FT_REDIS_CLEANUP"

// This environment variable for the KubeRay operator is used to determine whether to enable
// the injection of readiness and liveness probes into Ray head and worker containers.
// Enabling this feature contributes to the robustness of Ray clusters. It is currently a feature
// flag for v1.1.0 and will be removed if the behavior proves to be stable enough.
ENABLE_PROBES_INJECTION = "ENABLE_PROBES_INJECTION"

// Ray core default configurations
DefaultWorkerRayGcsReconnectTimeoutS = "600"

Expand Down
64 changes: 36 additions & 28 deletions ray-operator/controllers/ray/common/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,13 @@ func getEnableInitContainerInjection() bool {
return true
}

// getEnableProbesInjection reports whether the operator should inject
// readiness/liveness probes into Ray containers. The feature is on by
// default and is disabled only when the ENABLE_PROBES_INJECTION
// environment variable is explicitly set to "false" (case-insensitive).
func getEnableProbesInjection() bool {
	return strings.ToLower(os.Getenv(ENABLE_PROBES_INJECTION)) != "false"
}

// DefaultWorkerPodTemplate sets the config values
func DefaultWorkerPodTemplate(instance rayv1.RayCluster, workerSpec rayv1.WorkerGroupSpec, podName string, fqdnRayIP string, headPort string) v1.PodTemplateSpec {
podTemplate := workerSpec.Template
Expand Down Expand Up @@ -323,36 +330,37 @@ func BuildPod(podTemplateSpec v1.PodTemplateSpec, rayNodeType rayv1.RayNodeType,

setContainerEnvVars(&pod, rayNodeType, rayStartParams, fqdnRayIP, headPort, creator)

// health check only if FT enabled
if podTemplateSpec.Annotations != nil {
if enabledString, ok := podTemplateSpec.Annotations[RayFTEnabledAnnotationKey]; ok {
if strings.ToLower(enabledString) == "true" {
// If users do not specify probes, we will set the default probes.
if pod.Spec.Containers[RayContainerIndex].ReadinessProbe == nil {
probe := &v1.Probe{
InitialDelaySeconds: DefaultReadinessProbeInitialDelaySeconds,
TimeoutSeconds: DefaultReadinessProbeTimeoutSeconds,
PeriodSeconds: DefaultReadinessProbePeriodSeconds,
SuccessThreshold: DefaultReadinessProbeSuccessThreshold,
FailureThreshold: DefaultReadinessProbeFailureThreshold,
}
pod.Spec.Containers[RayContainerIndex].ReadinessProbe = probe
}
initHealthProbe(pod.Spec.Containers[RayContainerIndex].ReadinessProbe, rayNodeType)

if pod.Spec.Containers[RayContainerIndex].LivenessProbe == nil {
probe := &v1.Probe{
InitialDelaySeconds: DefaultLivenessProbeInitialDelaySeconds,
TimeoutSeconds: DefaultLivenessProbeTimeoutSeconds,
PeriodSeconds: DefaultLivenessProbePeriodSeconds,
SuccessThreshold: DefaultLivenessProbeSuccessThreshold,
FailureThreshold: DefaultLivenessProbeFailureThreshold,
}
pod.Spec.Containers[RayContainerIndex].LivenessProbe = probe
}
initHealthProbe(pod.Spec.Containers[RayContainerIndex].LivenessProbe, rayNodeType)
// Inject probes into the Ray containers if the user has not explicitly disabled them.
// The feature flag `ENABLE_PROBES_INJECTION` will be removed if this feature is stable enough.
enableProbesInjection := getEnableProbesInjection()
log.Info("Probes injection feature flag", "enabled", enableProbesInjection)
if enableProbesInjection {
// Configure the readiness and liveness probes for the Ray container. These probes
// play a crucial role in KubeRay health checks. Without them, certain failures,
// such as the Raylet process crashing, may go undetected.
if pod.Spec.Containers[RayContainerIndex].ReadinessProbe == nil {
probe := &v1.Probe{
InitialDelaySeconds: DefaultReadinessProbeInitialDelaySeconds,
TimeoutSeconds: DefaultReadinessProbeTimeoutSeconds,
PeriodSeconds: DefaultReadinessProbePeriodSeconds,
SuccessThreshold: DefaultReadinessProbeSuccessThreshold,
FailureThreshold: DefaultReadinessProbeFailureThreshold,
}
pod.Spec.Containers[RayContainerIndex].ReadinessProbe = probe
}
initHealthProbe(pod.Spec.Containers[RayContainerIndex].ReadinessProbe, rayNodeType)

if pod.Spec.Containers[RayContainerIndex].LivenessProbe == nil {
probe := &v1.Probe{
InitialDelaySeconds: DefaultLivenessProbeInitialDelaySeconds,
TimeoutSeconds: DefaultLivenessProbeTimeoutSeconds,
PeriodSeconds: DefaultLivenessProbePeriodSeconds,
SuccessThreshold: DefaultLivenessProbeSuccessThreshold,
FailureThreshold: DefaultLivenessProbeFailureThreshold,
}
pod.Spec.Containers[RayContainerIndex].LivenessProbe = probe
}
initHealthProbe(pod.Spec.Containers[RayContainerIndex].LivenessProbe, rayNodeType)
}

return pod
Expand Down
26 changes: 26 additions & 0 deletions ray-operator/controllers/ray/common/pod_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1137,6 +1137,32 @@ func TestGetCustomWorkerInitImage(t *testing.T) {
assert.False(t, b)
}

// TestGetEnableProbesInjection verifies that probe injection defaults to
// enabled and is turned off only by an explicit, case-insensitive "false".
func TestGetEnableProbesInjection(t *testing.T) {
	// Ensure the env var does not leak into other tests.
	defer os.Unsetenv(ENABLE_PROBES_INJECTION)

	// Unset env: the feature flag defaults to enabled.
	os.Unsetenv(ENABLE_PROBES_INJECTION)
	assert.True(t, getEnableProbesInjection())

	// Table-driven check: only a case-insensitive "false" disables injection.
	tests := map[string]bool{
		"true":  true,
		"True":  true,
		"false": false,
		"False": false,
	}
	for value, expected := range tests {
		os.Setenv(ENABLE_PROBES_INJECTION, value)
		assert.Equal(t, expected, getEnableProbesInjection())
	}
}

func TestInitHealthProbe(t *testing.T) {
// Test 1: User defines a custom HTTPGet probe.
httpGetProbe := v1.Probe{
Expand Down
19 changes: 19 additions & 0 deletions tests/compatibility-test.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

from framework.utils import (
get_head_pod,
get_pod,
pod_exec_command,
shell_subprocess_run,
CONST,
Expand Down Expand Up @@ -58,6 +59,24 @@ def test_cluster_info(self):
"""Execute "print(ray.cluster_resources())" in the head Pod."""
EasyJobRule().assert_rule()

def test_probe_injection(self):
    """Check that readiness and liveness probes are injected into the Ray container.

    The KubeRay operator injects default probes into the Ray container of both
    head and worker Pods unless the feature flag ENABLE_PROBES_INJECTION is
    explicitly set to "false".
    """
    def is_probe_injected(pod):
        # Both probes must be present on the Ray container (index 0).
        container = pod.spec.containers[0]
        return (container.readiness_probe is not None
                and container.liveness_probe is not None)

    headpod = get_head_pod(BasicRayTestCase.ray_cluster_ns)
    assert is_probe_injected(headpod)
    # TODO (kevin85421): We only check 1 worker Pod here.
    worker_pod = get_pod(BasicRayTestCase.ray_cluster_ns, "ray.io/node-type=worker")
    assert is_probe_injected(worker_pod)

class RayFTTestCase(unittest.TestCase):
"""Test Ray GCS Fault Tolerance"""
Expand Down
4 changes: 0 additions & 4 deletions tests/config/ray-cluster.mini.yaml.template
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
apiVersion: ray.io/v1
kind: RayCluster
metadata:
labels:
controller-tools.k8s.io: "1.0"
# A unique identifier for the head node and workers of this cluster.
name: raycluster-mini
spec:
rayVersion: '$ray_version' # should match the Ray version in the image of the containers
Expand All @@ -12,7 +9,6 @@ spec:
headGroupSpec:
# the following params are used to complete the ray start: ray start --head --block ...
rayStartParams:
dashboard-host: '0.0.0.0'
num-cpus: '1'
#pod template
template:
Expand Down
Loading