Skip to content

Commit

Permalink
[RayService][Health-Check][8/n] Add readiness / liveness probes (ray-…
Browse files Browse the repository at this point in the history
  • Loading branch information
kevin85421 authored Nov 29, 2023
1 parent 1d2afa4 commit c54c3d9
Show file tree
Hide file tree
Showing 7 changed files with 92 additions and 52 deletions.
20 changes: 0 additions & 20 deletions .github/workflows/test-job.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -287,26 +287,6 @@ jobs:
docker push quay.io/kuberay/operator:nightly
if: contains(fromJson('["refs/heads/master"]'), github.ref)

test-compatibility-1_13_0:
needs:
- build_operator
- build_apiserver
- lint
runs-on: ubuntu-latest
name: Compatibility Test - 1.13.0
steps:
- name: Check out code into the Go module directory
uses: actions/checkout@v2
with:
# When checking out the repository that
# triggered a workflow, this defaults to the reference or SHA for that event.
# Default value should work for both pull_request and merge(push) event.
ref: ${{github.event.pull_request.head.sha}}

- uses: ./.github/workflows/actions/compatibility
with:
ray_version: 1.13.0

test-compatibility-2_5_0:
needs:
- build_operator
Expand Down
5 changes: 5 additions & 0 deletions helm-chart/kuberay-operator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -105,3 +105,8 @@ env:
# Therefore, KubeRay offers ENABLE_ZERO_DOWNTIME as a feature flag for zero-downtime upgrades.
# - name: ENABLE_ZERO_DOWNTIME
# value: "true"
# This environment variable for the KubeRay operator is used to determine whether to enable
# the injection of readiness and liveness probes into Ray head and worker containers.
# Enabling this feature contributes to the robustness of Ray clusters.
# - name: ENABLE_PROBES_INJECTION
# value: "true"
6 changes: 6 additions & 0 deletions ray-operator/controllers/ray/common/constant.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,12 @@ const (
// cleanup Job should be enabled. This is a feature flag for v1.0.0.
ENABLE_GCS_FT_REDIS_CLEANUP = "ENABLE_GCS_FT_REDIS_CLEANUP"

// This environment variable for the KubeRay operator is used to determine whether to enable
// the injection of readiness and liveness probes into Ray head and worker containers.
// Enabling this feature contributes to the robustness of Ray clusters. It is currently a feature
// flag for v1.1.0 and will be removed if the behavior proves to be stable enough.
ENABLE_PROBES_INJECTION = "ENABLE_PROBES_INJECTION"

// Ray core default configurations
DefaultWorkerRayGcsReconnectTimeoutS = "600"

Expand Down
64 changes: 36 additions & 28 deletions ray-operator/controllers/ray/common/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,13 @@ func getEnableInitContainerInjection() bool {
return true
}

func getEnableProbesInjection() bool {
if s := os.Getenv(ENABLE_PROBES_INJECTION); strings.ToLower(s) == "false" {
return false
}
return true
}

// DefaultWorkerPodTemplate sets the config values
func DefaultWorkerPodTemplate(instance rayv1.RayCluster, workerSpec rayv1.WorkerGroupSpec, podName string, fqdnRayIP string, headPort string) v1.PodTemplateSpec {
podTemplate := workerSpec.Template
Expand Down Expand Up @@ -323,36 +330,37 @@ func BuildPod(podTemplateSpec v1.PodTemplateSpec, rayNodeType rayv1.RayNodeType,

setContainerEnvVars(&pod, rayNodeType, rayStartParams, fqdnRayIP, headPort, creator)

// health check only if FT enabled
if podTemplateSpec.Annotations != nil {
if enabledString, ok := podTemplateSpec.Annotations[RayFTEnabledAnnotationKey]; ok {
if strings.ToLower(enabledString) == "true" {
// If users do not specify probes, we will set the default probes.
if pod.Spec.Containers[RayContainerIndex].ReadinessProbe == nil {
probe := &v1.Probe{
InitialDelaySeconds: DefaultReadinessProbeInitialDelaySeconds,
TimeoutSeconds: DefaultReadinessProbeTimeoutSeconds,
PeriodSeconds: DefaultReadinessProbePeriodSeconds,
SuccessThreshold: DefaultReadinessProbeSuccessThreshold,
FailureThreshold: DefaultReadinessProbeFailureThreshold,
}
pod.Spec.Containers[RayContainerIndex].ReadinessProbe = probe
}
initHealthProbe(pod.Spec.Containers[RayContainerIndex].ReadinessProbe, rayNodeType)

if pod.Spec.Containers[RayContainerIndex].LivenessProbe == nil {
probe := &v1.Probe{
InitialDelaySeconds: DefaultLivenessProbeInitialDelaySeconds,
TimeoutSeconds: DefaultLivenessProbeTimeoutSeconds,
PeriodSeconds: DefaultLivenessProbePeriodSeconds,
SuccessThreshold: DefaultLivenessProbeSuccessThreshold,
FailureThreshold: DefaultLivenessProbeFailureThreshold,
}
pod.Spec.Containers[RayContainerIndex].LivenessProbe = probe
}
initHealthProbe(pod.Spec.Containers[RayContainerIndex].LivenessProbe, rayNodeType)
// Inject probes into the Ray containers if the user has not explicitly disabled them.
// The feature flag `ENABLE_PROBES_INJECTION` will be removed if this feature is stable enough.
enableProbesInjection := getEnableProbesInjection()
log.Info("Probes injection feature flag", "enabled", enableProbesInjection)
if enableProbesInjection {
// Configure the readiness and liveness probes for the Ray container. These probes
// play a crucial role in KubeRay health checks. Without them, certain failures,
// such as the Raylet process crashing, may go undetected.
if pod.Spec.Containers[RayContainerIndex].ReadinessProbe == nil {
probe := &v1.Probe{
InitialDelaySeconds: DefaultReadinessProbeInitialDelaySeconds,
TimeoutSeconds: DefaultReadinessProbeTimeoutSeconds,
PeriodSeconds: DefaultReadinessProbePeriodSeconds,
SuccessThreshold: DefaultReadinessProbeSuccessThreshold,
FailureThreshold: DefaultReadinessProbeFailureThreshold,
}
pod.Spec.Containers[RayContainerIndex].ReadinessProbe = probe
}
initHealthProbe(pod.Spec.Containers[RayContainerIndex].ReadinessProbe, rayNodeType)

if pod.Spec.Containers[RayContainerIndex].LivenessProbe == nil {
probe := &v1.Probe{
InitialDelaySeconds: DefaultLivenessProbeInitialDelaySeconds,
TimeoutSeconds: DefaultLivenessProbeTimeoutSeconds,
PeriodSeconds: DefaultLivenessProbePeriodSeconds,
SuccessThreshold: DefaultLivenessProbeSuccessThreshold,
FailureThreshold: DefaultLivenessProbeFailureThreshold,
}
pod.Spec.Containers[RayContainerIndex].LivenessProbe = probe
}
initHealthProbe(pod.Spec.Containers[RayContainerIndex].LivenessProbe, rayNodeType)
}

return pod
Expand Down
26 changes: 26 additions & 0 deletions ray-operator/controllers/ray/common/pod_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1137,6 +1137,32 @@ func TestGetCustomWorkerInitImage(t *testing.T) {
assert.False(t, b)
}

func TestGetEnableProbesInjection(t *testing.T) {
// cleanup
defer os.Unsetenv(ENABLE_PROBES_INJECTION)

// not set the env
os.Unsetenv(ENABLE_PROBES_INJECTION)
b := getEnableProbesInjection()
assert.True(t, b)
// set the env with "true"
os.Setenv(ENABLE_PROBES_INJECTION, "true")
b = getEnableProbesInjection()
assert.True(t, b)
// set the env with "True"
os.Setenv(ENABLE_PROBES_INJECTION, "True")
b = getEnableProbesInjection()
assert.True(t, b)
// set the env with "false"
os.Setenv(ENABLE_PROBES_INJECTION, "false")
b = getEnableProbesInjection()
assert.False(t, b)
// set the env with "False"
os.Setenv(ENABLE_PROBES_INJECTION, "False")
b = getEnableProbesInjection()
assert.False(t, b)
}

func TestInitHealthProbe(t *testing.T) {
// Test 1: User defines a custom HTTPGet probe.
httpGetProbe := v1.Probe{
Expand Down
19 changes: 19 additions & 0 deletions tests/compatibility-test.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

from framework.utils import (
get_head_pod,
get_pod,
pod_exec_command,
shell_subprocess_run,
CONST,
Expand Down Expand Up @@ -58,6 +59,24 @@ def test_cluster_info(self):
"""Execute "print(ray.cluster_resources())" in the head Pod."""
EasyJobRule().assert_rule()

def test_probe_injection(self):
"""
Check whether the readiness and liveness probes are injected into the Ray container.
"""
def is_probe_injected(pod):
probes = [
pod.spec.containers[0].readiness_probe,
pod.spec.containers[0].liveness_probe
]
for probe in probes:
if probe is None:
return False
return True
headpod = get_head_pod(BasicRayTestCase.ray_cluster_ns)
assert is_probe_injected(headpod)
# TODO (kevin85421): We only check 1 worker Pod here.
worker_pod = get_pod(BasicRayTestCase.ray_cluster_ns, "ray.io/node-type=worker")
assert is_probe_injected(worker_pod)

class RayFTTestCase(unittest.TestCase):
"""Test Ray GCS Fault Tolerance"""
Expand Down
4 changes: 0 additions & 4 deletions tests/config/ray-cluster.mini.yaml.template
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
apiVersion: ray.io/v1
kind: RayCluster
metadata:
labels:
controller-tools.k8s.io: "1.0"
# An unique identifier for the head node and workers of this cluster.
name: raycluster-mini
spec:
rayVersion: '$ray_version' # should match the Ray version in the image of the containers
Expand All @@ -12,7 +9,6 @@ spec:
headGroupSpec:
# the following params are used to complete the ray start: ray start --head --block ...
rayStartParams:
dashboard-host: '0.0.0.0'
num-cpus: '1'
#pod template
template:
Expand Down

0 comments on commit c54c3d9

Please sign in to comment.