diff --git a/ray-operator/controllers/ray/batchscheduler/schedulermanager.go b/ray-operator/controllers/ray/batchscheduler/schedulermanager.go index 221c10798d..bdc58b9df1 100644 --- a/ray-operator/controllers/ray/batchscheduler/schedulermanager.go +++ b/ray-operator/controllers/ray/batchscheduler/schedulermanager.go @@ -11,7 +11,7 @@ import ( rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" schedulerinterface "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/interface" "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/volcano" - "github.com/ray-project/kuberay/ray-operator/controllers/ray/common" + "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils" ) var schedulerContainers = map[string]schedulerinterface.BatchSchedulerFactory{ @@ -55,7 +55,7 @@ func NewSchedulerManager(config *rest.Config) *SchedulerManager { } func (batch *SchedulerManager) GetSchedulerForCluster(app *rayv1.RayCluster) (schedulerinterface.BatchScheduler, error) { - if schedulerName, ok := app.ObjectMeta.Labels[common.RaySchedulerName]; ok { + if schedulerName, ok := app.ObjectMeta.Labels[utils.RaySchedulerName]; ok { return batch.GetScheduler(schedulerName) } diff --git a/ray-operator/controllers/ray/batchscheduler/volcano/volcano_scheduler.go b/ray-operator/controllers/ray/batchscheduler/volcano/volcano_scheduler.go index cb8c3616be..954c984db2 100644 --- a/ray-operator/controllers/ray/batchscheduler/volcano/volcano_scheduler.go +++ b/ray-operator/controllers/ray/batchscheduler/volcano/volcano_scheduler.go @@ -18,7 +18,6 @@ import ( volcanoclient "volcano.sh/apis/pkg/client/clientset/versioned" schedulerinterface "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/interface" - "github.com/ray-project/kuberay/ray-operator/controllers/ray/common" "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils" utilruntime "k8s.io/apimachinery/pkg/util/runtime" quotav1 "k8s.io/apiserver/pkg/quota/v1" @@ -128,7 +127,7 @@ func createPodGroup( podGroup.Spec.Queue = queue } - if priorityClassName, ok := app.ObjectMeta.Labels[common.RayPriorityClassName]; ok { + if priorityClassName, ok := app.ObjectMeta.Labels[utils.RayPriorityClassName]; ok { podGroup.Spec.PriorityClassName = priorityClassName } @@ -140,7 +139,7 @@ func (v *VolcanoBatchScheduler) AddMetadataToPod(app *rayv1.RayCluster, pod *cor if queue, ok := app.ObjectMeta.Labels[QueueNameLabelKey]; ok { pod.Labels[QueueNameLabelKey] = queue } - if priorityClassName, ok := app.ObjectMeta.Labels[common.RayPriorityClassName]; ok { + if priorityClassName, ok := app.ObjectMeta.Labels[utils.RayPriorityClassName]; ok { pod.Spec.PriorityClassName = priorityClassName } pod.Spec.SchedulerName = v.Name() diff --git a/ray-operator/controllers/ray/common/ingress.go b/ray-operator/controllers/ray/common/ingress.go index 09b64bac87..14fc0905a8 100644 --- a/ray-operator/controllers/ray/common/ingress.go +++ b/ray-operator/controllers/ray/common/ingress.go @@ -17,10 +17,10 @@ const IngressClassAnnotationKey = "kubernetes.io/ingress.class" // This is used to expose dashboard for external traffic. 
func BuildIngressForHeadService(cluster rayv1.RayCluster) (*networkingv1.Ingress, error) { labels := map[string]string{ - RayClusterLabelKey: cluster.Name, - RayIDLabelKey: utils.GenerateIdentifier(cluster.Name, rayv1.HeadNode), - KubernetesApplicationNameLabelKey: ApplicationName, - KubernetesCreatedByLabelKey: ComponentName, + utils.RayClusterLabelKey: cluster.Name, + utils.RayIDLabelKey: utils.GenerateIdentifier(cluster.Name, rayv1.HeadNode), + utils.KubernetesApplicationNameLabelKey: utils.ApplicationName, + utils.KubernetesCreatedByLabelKey: utils.ComponentName, } // Copy other ingress configurations from cluster annotations to provide a generic way @@ -40,7 +40,7 @@ func BuildIngressForHeadService(cluster rayv1.RayCluster) (*networkingv1.Ingress var paths []networkingv1.HTTPIngressPath pathType := networkingv1.PathTypeExact servicePorts := getServicePorts(cluster) - dashboardPort := int32(DefaultDashboardPort) + dashboardPort := int32(utils.DefaultDashboardPort) if port, ok := servicePorts["dashboard"]; ok { dashboardPort = port } @@ -113,8 +113,8 @@ func BuildIngressForRayService(service rayv1.RayService, cluster rayv1.RayCluste ingress.ObjectMeta.Name = headSvcName ingress.ObjectMeta.Namespace = service.Namespace ingress.ObjectMeta.Labels = map[string]string{ - RayServiceLabelKey: service.Name, - RayIDLabelKey: utils.CheckLabel(utils.GenerateIdentifier(service.Name, rayv1.HeadNode)), + utils.RayServiceLabelKey: service.Name, + utils.RayIDLabelKey: utils.CheckLabel(utils.GenerateIdentifier(service.Name, rayv1.HeadNode)), } return ingress, nil diff --git a/ray-operator/controllers/ray/common/ingress_test.go b/ray-operator/controllers/ray/common/ingress_test.go index 407e8f9ccb..fc9b8dbd24 100644 --- a/ray-operator/controllers/ray/common/ingress_test.go +++ b/ray-operator/controllers/ray/common/ingress_test.go @@ -85,7 +85,7 @@ func TestBuildIngressForHeadService(t *testing.T) { assert.Nil(t, err) // check ingress.class annotation - actualResult := ingress.Labels[RayClusterLabelKey] + actualResult := ingress.Labels[utils.RayClusterLabelKey] expectedResult := instanceWithIngressEnabled.Name if !reflect.DeepEqual(expectedResult, actualResult) { t.Fatalf("Expected `%v` but got `%v`", expectedResult, actualResult) diff --git a/ray-operator/controllers/ray/common/job.go b/ray-operator/controllers/ray/common/job.go index 7d7c33f672..e7600e8e99 100644 --- a/ray-operator/controllers/ray/common/job.go +++ b/ray-operator/controllers/ray/common/job.go @@ -9,6 +9,7 @@ import ( semver "github.com/Masterminds/semver/v3" "github.com/google/shlex" rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" + "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" "sigs.k8s.io/yaml" @@ -144,7 +145,7 @@ func GetDefaultSubmitterTemplate(rayClusterInstance *rayv1.RayCluster) corev1.Po { Name: "ray-job-submitter", // Use the image of the Ray head to be defensive against version mismatch issues - Image: rayClusterInstance.Spec.HeadGroupSpec.Template.Spec.Containers[RayContainerIndex].Image, + Image: rayClusterInstance.Spec.HeadGroupSpec.Template.Spec.Containers[utils.RayContainerIndex].Image, Resources: corev1.ResourceRequirements{ Limits: corev1.ResourceList{ corev1.ResourceCPU: resource.MustParse("1"), diff --git a/ray-operator/controllers/ray/common/job_test.go b/ray-operator/controllers/ray/common/job_test.go index ed33b90947..f17b8576ae 100644 --- a/ray-operator/controllers/ray/common/job_test.go +++ 
b/ray-operator/controllers/ray/common/job_test.go @@ -5,6 +5,7 @@ import ( "testing" rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" + "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils" "github.com/stretchr/testify/assert" corev1 "k8s.io/api/core/v1" ) @@ -197,5 +198,5 @@ func TestGetDefaultSubmitterTemplate(t *testing.T) { }, } template := GetDefaultSubmitterTemplate(rayCluster) - assert.Equal(t, template.Spec.Containers[0].Image, rayCluster.Spec.HeadGroupSpec.Template.Spec.Containers[RayContainerIndex].Image) + assert.Equal(t, template.Spec.Containers[0].Image, rayCluster.Spec.HeadGroupSpec.Template.Spec.Containers[utils.RayContainerIndex].Image) } diff --git a/ray-operator/controllers/ray/common/pod.go b/ray-operator/controllers/ray/common/pod.go index 9b98e35e86..206669ed75 100644 --- a/ray-operator/controllers/ray/common/pod.go +++ b/ray-operator/controllers/ray/common/pod.go @@ -43,7 +43,7 @@ func GetHeadPort(headStartParams map[string]string) string { var headPort string if value, ok := headStartParams["port"]; !ok { // using default port - headPort = strconv.Itoa(DefaultRedisPort) + headPort = strconv.Itoa(utils.DefaultRedisPort) } else { // setting port from the params headPort = value @@ -53,13 +53,13 @@ func GetHeadPort(headStartParams map[string]string) string { // Check if the RayCluster has GCS fault tolerance enabled. func IsGCSFaultToleranceEnabled(instance rayv1.RayCluster) bool { - v, ok := instance.Annotations[RayFTEnabledAnnotationKey] + v, ok := instance.Annotations[utils.RayFTEnabledAnnotationKey] return ok && strings.ToLower(v) == "true" } // Check if overwrites the container command. func isOverwriteRayContainerCmd(instance rayv1.RayCluster) bool { - v, ok := instance.Annotations[RayOverwriteContainerCmdAnnotationKey] + v, ok := instance.Annotations[utils.RayOverwriteContainerCmdAnnotationKey] return ok && strings.ToLower(v) == "true" } @@ -71,20 +71,20 @@ func initTemplateAnnotations(instance rayv1.RayCluster, podTemplate *corev1.PodT // For now, we just set ray external storage enabled/disabled by checking if FT is enabled/disabled. // This may need to be updated in the future. if IsGCSFaultToleranceEnabled(instance) { - podTemplate.Annotations[RayFTEnabledAnnotationKey] = "true" + podTemplate.Annotations[utils.RayFTEnabledAnnotationKey] = "true" // if we have FT enabled, we need to set up a default external storage namespace. - podTemplate.Annotations[RayExternalStorageNSAnnotationKey] = string(instance.UID) + podTemplate.Annotations[utils.RayExternalStorageNSAnnotationKey] = string(instance.UID) } else { - podTemplate.Annotations[RayFTEnabledAnnotationKey] = "false" + podTemplate.Annotations[utils.RayFTEnabledAnnotationKey] = "false" } if isOverwriteRayContainerCmd(instance) { - podTemplate.Annotations[RayOverwriteContainerCmdAnnotationKey] = "true" + podTemplate.Annotations[utils.RayOverwriteContainerCmdAnnotationKey] = "true" } // set ray external storage namespace if user specified one. 
if instance.Annotations != nil { - if v, ok := instance.Annotations[RayExternalStorageNSAnnotationKey]; ok { - podTemplate.Annotations[RayExternalStorageNSAnnotationKey] = v + if v, ok := instance.Annotations[utils.RayExternalStorageNSAnnotationKey]; ok { + podTemplate.Annotations[utils.RayExternalStorageNSAnnotationKey] = v } } } @@ -118,7 +118,7 @@ func DefaultHeadPodTemplate(instance rayv1.RayCluster, headSpec rayv1.HeadGroupS // utils.CheckName clips the name to match the behavior of reconcileAutoscalerServiceAccount podTemplate.Spec.ServiceAccountName = utils.CheckName(utils.GetHeadGroupServiceAccountName(&instance)) // Use the same image as Ray head container by default. - autoscalerImage := podTemplate.Spec.Containers[RayContainerIndex].Image + autoscalerImage := podTemplate.Spec.Containers[utils.RayContainerIndex].Image // inject autoscaler container into head pod autoscalerContainer := BuildAutoscalerContainer(autoscalerImage) // Merge the user overrides from autoscalerOptions into the autoscaler container config. @@ -127,13 +127,13 @@ func DefaultHeadPodTemplate(instance rayv1.RayCluster, headSpec rayv1.HeadGroupS } // If the metrics port does not exist in the Ray container, add a default one for Promethues. - isMetricsPortExists := utils.FindContainerPort(&podTemplate.Spec.Containers[RayContainerIndex], MetricsPortName, -1) != -1 + isMetricsPortExists := utils.FindContainerPort(&podTemplate.Spec.Containers[utils.RayContainerIndex], utils.MetricsPortName, -1) != -1 if !isMetricsPortExists { metricsPort := corev1.ContainerPort{ - Name: MetricsPortName, - ContainerPort: int32(DefaultMetricsPort), + Name: utils.MetricsPortName, + ContainerPort: int32(utils.DefaultMetricsPort), } - podTemplate.Spec.Containers[RayContainerIndex].Ports = append(podTemplate.Spec.Containers[RayContainerIndex].Ports, metricsPort) + podTemplate.Spec.Containers[utils.RayContainerIndex].Ports = append(podTemplate.Spec.Containers[utils.RayContainerIndex].Ports, metricsPort) } return podTemplate @@ -147,7 +147,7 @@ func getEnableInitContainerInjection() bool { } func getEnableProbesInjection() bool { - if s := os.Getenv(ENABLE_PROBES_INJECTION); strings.ToLower(s) == "false" { + if s := os.Getenv(utils.ENABLE_PROBES_INJECTION); strings.ToLower(s) == "false" { return false } return true @@ -168,11 +168,11 @@ func DefaultWorkerPodTemplate(instance rayv1.RayCluster, workerSpec rayv1.Worker if enableInitContainerInjection { // Do not modify `deepCopyRayContainer` anywhere. 
- deepCopyRayContainer := podTemplate.Spec.Containers[RayContainerIndex].DeepCopy() + deepCopyRayContainer := podTemplate.Spec.Containers[utils.RayContainerIndex].DeepCopy() initContainer := corev1.Container{ Name: "wait-gcs-ready", - Image: podTemplate.Spec.Containers[RayContainerIndex].Image, - ImagePullPolicy: podTemplate.Spec.Containers[RayContainerIndex].ImagePullPolicy, + Image: podTemplate.Spec.Containers[utils.RayContainerIndex].Image, + ImagePullPolicy: podTemplate.Spec.Containers[utils.RayContainerIndex].ImagePullPolicy, Command: []string{"/bin/bash", "-lc", "--"}, Args: []string{ fmt.Sprintf(` @@ -195,7 +195,7 @@ func DefaultWorkerPodTemplate(instance rayv1.RayCluster, workerSpec rayv1.Worker done `, fqdnRayIP, headPort, fqdnRayIP, headPort), }, - SecurityContext: podTemplate.Spec.Containers[RayContainerIndex].SecurityContext.DeepCopy(), + SecurityContext: podTemplate.Spec.Containers[utils.RayContainerIndex].SecurityContext.DeepCopy(), // This init container requires certain environment variables to establish a secure connection with the Ray head using TLS authentication. // Additionally, some of these environment variables may reference files stored in volumes, so we need to include both the `Env` and `VolumeMounts` fields here. // For more details, please refer to: https://docs.ray.io/en/latest/ray-core/configure.html#tls-authentication. @@ -231,13 +231,13 @@ func DefaultWorkerPodTemplate(instance rayv1.RayCluster, workerSpec rayv1.Worker initTemplateAnnotations(instance, &podTemplate) // If the metrics port does not exist in the Ray container, add a default one for Promethues. - isMetricsPortExists := utils.FindContainerPort(&podTemplate.Spec.Containers[RayContainerIndex], MetricsPortName, -1) != -1 + isMetricsPortExists := utils.FindContainerPort(&podTemplate.Spec.Containers[utils.RayContainerIndex], utils.MetricsPortName, -1) != -1 if !isMetricsPortExists { metricsPort := corev1.ContainerPort{ - Name: MetricsPortName, - ContainerPort: int32(DefaultMetricsPort), + Name: utils.MetricsPortName, + ContainerPort: int32(utils.DefaultMetricsPort), } - podTemplate.Spec.Containers[RayContainerIndex].Ports = append(podTemplate.Spec.Containers[RayContainerIndex].Ports, metricsPort) + podTemplate.Spec.Containers[utils.RayContainerIndex].Ports = append(podTemplate.Spec.Containers[utils.RayContainerIndex].Ports, metricsPort) } return podTemplate @@ -258,15 +258,15 @@ func initHealthProbe(probe *corev1.Probe, rayNodeType rayv1.RayNodeType) { if rayNodeType == rayv1.HeadNode { cmd := []string{ "bash", "-c", fmt.Sprintf("wget -T 2 -q -O- http://localhost:%d/%s | grep success", - DefaultDashboardAgentListenPort, RayAgentRayletHealthPath), + utils.DefaultDashboardAgentListenPort, utils.RayAgentRayletHealthPath), "&&", "bash", "-c", fmt.Sprintf("wget -T 2 -q -O- http://localhost:%d/%s | grep success", - DefaultDashboardPort, RayDashboardGCSHealthPath), + utils.DefaultDashboardPort, utils.RayDashboardGCSHealthPath), } probe.Exec = &corev1.ExecAction{Command: cmd} } else { cmd := []string{ "bash", "-c", fmt.Sprintf("wget -T 2 -q -O- http://localhost:%d/%s | grep success", - DefaultDashboardAgentListenPort, RayAgentRayletHealthPath), + utils.DefaultDashboardAgentListenPort, utils.RayAgentRayletHealthPath), } probe.Exec = &corev1.ExecAction{Command: cmd} } @@ -279,7 +279,7 @@ func BuildPod(podTemplateSpec corev1.PodTemplateSpec, rayNodeType rayv1.RayNodeT // TODO (kevin85421): In the current RayService implementation, we only add this label to a Pod after // it passes the health check. 
The other option is to use the readiness probe to control it. This // logic always add the label to the Pod no matter whether it is ready or not. - podTemplateSpec.Labels[RayClusterServingServiceLabelKey] = EnableRayClusterServingServiceTrue + podTemplateSpec.Labels[utils.RayClusterServingServiceLabelKey] = utils.EnableRayClusterServingServiceTrue } pod := corev1.Pod{ TypeMeta: metav1.TypeMeta{ @@ -291,37 +291,37 @@ func BuildPod(podTemplateSpec corev1.PodTemplateSpec, rayNodeType rayv1.RayNodeT } // Add /dev/shm volumeMount for the object store to avoid performance degradation. - addEmptyDir(&pod.Spec.Containers[RayContainerIndex], &pod, SharedMemoryVolumeName, SharedMemoryVolumeMountPath, corev1.StorageMediumMemory) + addEmptyDir(&pod.Spec.Containers[utils.RayContainerIndex], &pod, SharedMemoryVolumeName, SharedMemoryVolumeMountPath, corev1.StorageMediumMemory) if rayNodeType == rayv1.HeadNode && enableRayAutoscaler != nil && *enableRayAutoscaler { // The Ray autoscaler writes logs which are read by the Ray head. // We need a shared log volume to enable this information flow. // Specifically, this is required for the event-logging functionality // introduced in https://github.com/ray-project/ray/pull/13434. autoscalerContainerIndex := getAutoscalerContainerIndex(pod) - addEmptyDir(&pod.Spec.Containers[RayContainerIndex], &pod, RayLogVolumeName, RayLogVolumeMountPath, corev1.StorageMediumDefault) + addEmptyDir(&pod.Spec.Containers[utils.RayContainerIndex], &pod, RayLogVolumeName, RayLogVolumeMountPath, corev1.StorageMediumDefault) addEmptyDir(&pod.Spec.Containers[autoscalerContainerIndex], &pod, RayLogVolumeName, RayLogVolumeMountPath, corev1.StorageMediumDefault) } - cleanupInvalidVolumeMounts(&pod.Spec.Containers[RayContainerIndex], &pod) - if len(pod.Spec.InitContainers) > RayContainerIndex { - cleanupInvalidVolumeMounts(&pod.Spec.InitContainers[RayContainerIndex], &pod) + cleanupInvalidVolumeMounts(&pod.Spec.Containers[utils.RayContainerIndex], &pod) + if len(pod.Spec.InitContainers) > utils.RayContainerIndex { + cleanupInvalidVolumeMounts(&pod.Spec.InitContainers[utils.RayContainerIndex], &pod) } var cmd, args string - if len(pod.Spec.Containers[RayContainerIndex].Command) > 0 { - cmd = convertCmdToString(pod.Spec.Containers[RayContainerIndex].Command) + if len(pod.Spec.Containers[utils.RayContainerIndex].Command) > 0 { + cmd = convertCmdToString(pod.Spec.Containers[utils.RayContainerIndex].Command) } - if len(pod.Spec.Containers[RayContainerIndex].Args) > 0 { - cmd += convertCmdToString(pod.Spec.Containers[RayContainerIndex].Args) + if len(pod.Spec.Containers[utils.RayContainerIndex].Args) > 0 { + cmd += convertCmdToString(pod.Spec.Containers[utils.RayContainerIndex].Args) } // Increase the open file descriptor limit of the `ray start` process and its child processes to 65536. ulimitCmd := "ulimit -n 65536" // Generate the `ray start` command. - rayStartCmd := generateRayStartCommand(rayNodeType, rayStartParams, pod.Spec.Containers[RayContainerIndex].Resources) + rayStartCmd := generateRayStartCommand(rayNodeType, rayStartParams, pod.Spec.Containers[utils.RayContainerIndex].Resources) // Check if overwrites the generated container command or not. 
isOverwriteRayContainerCmd := false - if v, ok := podTemplateSpec.Annotations[RayOverwriteContainerCmdAnnotationKey]; ok { + if v, ok := podTemplateSpec.Annotations[utils.RayOverwriteContainerCmdAnnotationKey]; ok { isOverwriteRayContainerCmd = strings.ToLower(v) == "true" } @@ -330,7 +330,7 @@ func BuildPod(podTemplateSpec corev1.PodTemplateSpec, rayNodeType rayv1.RayNodeT generatedCmd := fmt.Sprintf("%s; %s", ulimitCmd, rayStartCmd) log.Info("BuildPod", "rayNodeType", rayNodeType, "generatedCmd", generatedCmd) // replacing the old command - pod.Spec.Containers[RayContainerIndex].Command = []string{"/bin/bash", "-lc", "--"} + pod.Spec.Containers[utils.RayContainerIndex].Command = []string{"/bin/bash", "-lc", "--"} if cmd != "" { // If 'ray start' has --block specified, commands after it will not get executed. // so we need to put cmd before cont. @@ -339,7 +339,7 @@ func BuildPod(podTemplateSpec corev1.PodTemplateSpec, rayNodeType rayv1.RayNodeT args = generatedCmd } - pod.Spec.Containers[RayContainerIndex].Args = []string{args} + pod.Spec.Containers[utils.RayContainerIndex].Args = []string{args} } for index := range pod.Spec.InitContainers { @@ -355,29 +355,29 @@ func BuildPod(podTemplateSpec corev1.PodTemplateSpec, rayNodeType rayv1.RayNodeT // Configure the readiness and liveness probes for the Ray container. These probes // play a crucial role in KubeRay health checks. Without them, certain failures, // such as the Raylet process crashing, may go undetected. - if pod.Spec.Containers[RayContainerIndex].ReadinessProbe == nil { + if pod.Spec.Containers[utils.RayContainerIndex].ReadinessProbe == nil { probe := &corev1.Probe{ - InitialDelaySeconds: DefaultReadinessProbeInitialDelaySeconds, - TimeoutSeconds: DefaultReadinessProbeTimeoutSeconds, - PeriodSeconds: DefaultReadinessProbePeriodSeconds, - SuccessThreshold: DefaultReadinessProbeSuccessThreshold, - FailureThreshold: DefaultReadinessProbeFailureThreshold, + InitialDelaySeconds: utils.DefaultReadinessProbeInitialDelaySeconds, + TimeoutSeconds: utils.DefaultReadinessProbeTimeoutSeconds, + PeriodSeconds: utils.DefaultReadinessProbePeriodSeconds, + SuccessThreshold: utils.DefaultReadinessProbeSuccessThreshold, + FailureThreshold: utils.DefaultReadinessProbeFailureThreshold, } - pod.Spec.Containers[RayContainerIndex].ReadinessProbe = probe + pod.Spec.Containers[utils.RayContainerIndex].ReadinessProbe = probe } - initHealthProbe(pod.Spec.Containers[RayContainerIndex].ReadinessProbe, rayNodeType) + initHealthProbe(pod.Spec.Containers[utils.RayContainerIndex].ReadinessProbe, rayNodeType) - if pod.Spec.Containers[RayContainerIndex].LivenessProbe == nil { + if pod.Spec.Containers[utils.RayContainerIndex].LivenessProbe == nil { probe := &corev1.Probe{ - InitialDelaySeconds: DefaultLivenessProbeInitialDelaySeconds, - TimeoutSeconds: DefaultLivenessProbeTimeoutSeconds, - PeriodSeconds: DefaultLivenessProbePeriodSeconds, - SuccessThreshold: DefaultLivenessProbeSuccessThreshold, - FailureThreshold: DefaultLivenessProbeFailureThreshold, + InitialDelaySeconds: utils.DefaultLivenessProbeInitialDelaySeconds, + TimeoutSeconds: utils.DefaultLivenessProbeTimeoutSeconds, + PeriodSeconds: utils.DefaultLivenessProbePeriodSeconds, + SuccessThreshold: utils.DefaultLivenessProbeSuccessThreshold, + FailureThreshold: utils.DefaultLivenessProbeFailureThreshold, } - pod.Spec.Containers[RayContainerIndex].LivenessProbe = probe + pod.Spec.Containers[utils.RayContainerIndex].LivenessProbe = probe } - 
initHealthProbe(pod.Spec.Containers[RayContainerIndex].LivenessProbe, rayNodeType) + initHealthProbe(pod.Spec.Containers[utils.RayContainerIndex].LivenessProbe, rayNodeType) } return pod @@ -391,10 +391,10 @@ func BuildAutoscalerContainer(autoscalerImage string) corev1.Container { ImagePullPolicy: corev1.PullIfNotPresent, Env: []corev1.EnvVar{ { - Name: RAY_CLUSTER_NAME, + Name: utils.RAY_CLUSTER_NAME, ValueFrom: &corev1.EnvVarSource{ FieldRef: &corev1.ObjectFieldSelector{ - FieldPath: fmt.Sprintf("metadata.labels['%s']", RayClusterLabelKey), + FieldPath: fmt.Sprintf("metadata.labels['%s']", utils.RayClusterLabelKey), }, }, }, @@ -498,13 +498,13 @@ func labelPod(rayNodeType rayv1.RayNodeType, rayClusterName string, groupName st } ret = map[string]string{ - RayNodeLabelKey: "yes", - RayClusterLabelKey: rayClusterName, - RayNodeTypeLabelKey: string(rayNodeType), - RayNodeGroupLabelKey: groupName, - RayIDLabelKey: utils.CheckLabel(utils.GenerateIdentifier(rayClusterName, rayNodeType)), - KubernetesApplicationNameLabelKey: ApplicationName, - KubernetesCreatedByLabelKey: ComponentName, + utils.RayNodeLabelKey: "yes", + utils.RayClusterLabelKey: rayClusterName, + utils.RayNodeTypeLabelKey: string(rayNodeType), + utils.RayNodeGroupLabelKey: groupName, + utils.RayIDLabelKey: utils.CheckLabel(utils.GenerateIdentifier(rayClusterName, rayNodeType)), + utils.KubernetesApplicationNameLabelKey: utils.ApplicationName, + utils.KubernetesCreatedByLabelKey: utils.ComponentName, } for k, v := range ret { @@ -514,7 +514,7 @@ func labelPod(rayNodeType rayv1.RayNodeType, rayClusterName string, groupName st labels[k] = v } } - if k == RayNodeGroupLabelKey { + if k == utils.RayNodeGroupLabelKey { // overriding invalid values for this label if v != groupName { labels[k] = v @@ -536,113 +536,113 @@ func setInitContainerEnvVars(container *corev1.Container, fqdnRayIP string) { // (1) The head needs FQ_RAY_IP to create a self-signed certificate for its TLS authenticate. // (2) The worker needs FQ_RAY_IP to establish a connection with the Ray head. container.Env = append(container.Env, - corev1.EnvVar{Name: FQ_RAY_IP, Value: fqdnRayIP}, + corev1.EnvVar{Name: utils.FQ_RAY_IP, Value: fqdnRayIP}, // RAY_IP is deprecated and should be kept for backward compatibility purposes only. - corev1.EnvVar{Name: RAY_IP, Value: utils.ExtractRayIPFromFQDN(fqdnRayIP)}, + corev1.EnvVar{Name: utils.RAY_IP, Value: utils.ExtractRayIPFromFQDN(fqdnRayIP)}, ) } func setContainerEnvVars(pod *corev1.Pod, rayNodeType rayv1.RayNodeType, rayStartParams map[string]string, fqdnRayIP string, headPort string, rayStartCmd string, creator string) { // TODO: Audit all environment variables to identify which should not be modified by users. - container := &pod.Spec.Containers[RayContainerIndex] + container := &pod.Spec.Containers[utils.RayContainerIndex] if container.Env == nil || len(container.Env) == 0 { container.Env = []corev1.EnvVar{} } // case 1: head => Use LOCAL_HOST // case 2: worker => Use fqdnRayIP (fully qualified domain name) - ip := LOCAL_HOST + ip := utils.LOCAL_HOST if rayNodeType == rayv1.WorkerNode { ip = fqdnRayIP container.Env = append(container.Env, - corev1.EnvVar{Name: FQ_RAY_IP, Value: ip}, + corev1.EnvVar{Name: utils.FQ_RAY_IP, Value: ip}, // RAY_IP is deprecated and should be kept for backward compatibility purposes only. 
- corev1.EnvVar{Name: RAY_IP, Value: utils.ExtractRayIPFromFQDN(ip)}, + corev1.EnvVar{Name: utils.RAY_IP, Value: utils.ExtractRayIPFromFQDN(ip)}, ) } // The RAY_CLUSTER_NAME environment variable is managed by KubeRay and should not be set by the user. clusterNameEnv := corev1.EnvVar{ - Name: RAY_CLUSTER_NAME, + Name: utils.RAY_CLUSTER_NAME, ValueFrom: &corev1.EnvVarSource{ FieldRef: &corev1.ObjectFieldSelector{ - FieldPath: fmt.Sprintf("metadata.labels['%s']", RayClusterLabelKey), + FieldPath: fmt.Sprintf("metadata.labels['%s']", utils.RayClusterLabelKey), }, }, } container.Env = append(container.Env, clusterNameEnv) - // KUBERAY_GEN_RAY_START_CMD stores the `ray start` command generated by KubeRay. + // utils.KUBERAY_GEN_RAY_START_CMD stores the `ray start` command generated by KubeRay. // See https://github.com/ray-project/kuberay/issues/1560 for more details. - generatedRayStartCmdEnv := corev1.EnvVar{Name: KUBERAY_GEN_RAY_START_CMD, Value: rayStartCmd} + generatedRayStartCmdEnv := corev1.EnvVar{Name: utils.KUBERAY_GEN_RAY_START_CMD, Value: rayStartCmd} container.Env = append(container.Env, generatedRayStartCmdEnv) - if !envVarExists(RAY_PORT, container.Env) { - portEnv := corev1.EnvVar{Name: RAY_PORT, Value: headPort} + if !envVarExists(utils.RAY_PORT, container.Env) { + portEnv := corev1.EnvVar{Name: utils.RAY_PORT, Value: headPort} container.Env = append(container.Env, portEnv) } - if strings.ToLower(creator) == RayServiceCreatorLabelValue { + if strings.ToLower(creator) == utils.RayServiceCreatorLabelValue { // Only add this env for Ray Service cluster to improve service SLA. - if !envVarExists(RAY_TIMEOUT_MS_TASK_WAIT_FOR_DEATH_INFO, container.Env) { - deathEnv := corev1.EnvVar{Name: RAY_TIMEOUT_MS_TASK_WAIT_FOR_DEATH_INFO, Value: "0"} + if !envVarExists(utils.RAY_TIMEOUT_MS_TASK_WAIT_FOR_DEATH_INFO, container.Env) { + deathEnv := corev1.EnvVar{Name: utils.RAY_TIMEOUT_MS_TASK_WAIT_FOR_DEATH_INFO, Value: "0"} container.Env = append(container.Env, deathEnv) } - if !envVarExists(RAY_GCS_SERVER_REQUEST_TIMEOUT_SECONDS, container.Env) { - gcsTimeoutEnv := corev1.EnvVar{Name: RAY_GCS_SERVER_REQUEST_TIMEOUT_SECONDS, Value: "5"} + if !envVarExists(utils.RAY_GCS_SERVER_REQUEST_TIMEOUT_SECONDS, container.Env) { + gcsTimeoutEnv := corev1.EnvVar{Name: utils.RAY_GCS_SERVER_REQUEST_TIMEOUT_SECONDS, Value: "5"} container.Env = append(container.Env, gcsTimeoutEnv) } - if !envVarExists(RAY_SERVE_KV_TIMEOUT_S, container.Env) { - serveKvTimeoutEnv := corev1.EnvVar{Name: RAY_SERVE_KV_TIMEOUT_S, Value: "5"} + if !envVarExists(utils.RAY_SERVE_KV_TIMEOUT_S, container.Env) { + serveKvTimeoutEnv := corev1.EnvVar{Name: utils.RAY_SERVE_KV_TIMEOUT_S, Value: "5"} container.Env = append(container.Env, serveKvTimeoutEnv) } - if !envVarExists(SERVE_CONTROLLER_PIN_ON_NODE, container.Env) { - servePinOnNode := corev1.EnvVar{Name: SERVE_CONTROLLER_PIN_ON_NODE, Value: "0"} + if !envVarExists(utils.SERVE_CONTROLLER_PIN_ON_NODE, container.Env) { + servePinOnNode := corev1.EnvVar{Name: utils.SERVE_CONTROLLER_PIN_ON_NODE, Value: "0"} container.Env = append(container.Env, servePinOnNode) } } // Setting the RAY_ADDRESS env allows connecting to Ray using ray.init() when connecting // from within the cluster. 
- if !envVarExists(RAY_ADDRESS, container.Env) { + if !envVarExists(utils.RAY_ADDRESS, container.Env) { rayAddress := fmt.Sprintf("%s:%s", ip, headPort) - addressEnv := corev1.EnvVar{Name: RAY_ADDRESS, Value: rayAddress} + addressEnv := corev1.EnvVar{Name: utils.RAY_ADDRESS, Value: rayAddress} container.Env = append(container.Env, addressEnv) } - if !envVarExists(RAY_USAGE_STATS_KUBERAY_IN_USE, container.Env) { - usageEnv := corev1.EnvVar{Name: RAY_USAGE_STATS_KUBERAY_IN_USE, Value: "1"} + if !envVarExists(utils.RAY_USAGE_STATS_KUBERAY_IN_USE, container.Env) { + usageEnv := corev1.EnvVar{Name: utils.RAY_USAGE_STATS_KUBERAY_IN_USE, Value: "1"} container.Env = append(container.Env, usageEnv) } - if !envVarExists(REDIS_PASSWORD, container.Env) { + if !envVarExists(utils.REDIS_PASSWORD, container.Env) { // setting the REDIS_PASSWORD env var from the params - redisPasswordEnv := corev1.EnvVar{Name: REDIS_PASSWORD} + redisPasswordEnv := corev1.EnvVar{Name: utils.REDIS_PASSWORD} if value, ok := rayStartParams["redis-password"]; ok { redisPasswordEnv.Value = value } container.Env = append(container.Env, redisPasswordEnv) } - if !envVarExists(RAY_EXTERNAL_STORAGE_NS, container.Env) { + if !envVarExists(utils.RAY_EXTERNAL_STORAGE_NS, container.Env) { // setting the RAY_EXTERNAL_STORAGE_NS env var from the params if pod.Annotations != nil { - if v, ok := pod.Annotations[RayExternalStorageNSAnnotationKey]; ok { - storageNS := corev1.EnvVar{Name: RAY_EXTERNAL_STORAGE_NS, Value: v} + if v, ok := pod.Annotations[utils.RayExternalStorageNSAnnotationKey]; ok { + storageNS := corev1.EnvVar{Name: utils.RAY_EXTERNAL_STORAGE_NS, Value: v} container.Env = append(container.Env, storageNS) } } } - if !envVarExists(RAY_GCS_RPC_SERVER_RECONNECT_TIMEOUT_S, container.Env) && rayNodeType == rayv1.WorkerNode { + if !envVarExists(utils.RAY_GCS_RPC_SERVER_RECONNECT_TIMEOUT_S, container.Env) && rayNodeType == rayv1.WorkerNode { // If GCS FT is enabled and RAY_GCS_RPC_SERVER_RECONNECT_TIMEOUT_S is not set, set the worker's // RAY_GCS_RPC_SERVER_RECONNECT_TIMEOUT_S to 600s. If the worker cannot reconnect to GCS within // 600s, the Raylet will exit the process. By default, the value is 60s, so the head node will // crash if the GCS server is down for more than 60s. Typically, the new GCS server will be available // in 120 seconds, so we set the timeout to 600s to avoid the worker nodes crashing. - if ftEnabled := pod.Annotations[RayFTEnabledAnnotationKey] == "true"; ftEnabled { - gcsTimeout := corev1.EnvVar{Name: RAY_GCS_RPC_SERVER_RECONNECT_TIMEOUT_S, Value: DefaultWorkerRayGcsReconnectTimeoutS} + if ftEnabled := pod.Annotations[utils.RayFTEnabledAnnotationKey] == "true"; ftEnabled { + gcsTimeout := corev1.EnvVar{Name: utils.RAY_GCS_RPC_SERVER_RECONNECT_TIMEOUT_S, Value: utils.DefaultWorkerRayGcsReconnectTimeoutS} container.Env = append(container.Env, gcsTimeout) } } - if !envVarExists(RAY_DASHBOARD_ENABLE_K8S_DISK_USAGE, container.Env) { + if !envVarExists(utils.RAY_DASHBOARD_ENABLE_K8S_DISK_USAGE, container.Env) { // This flag enables the display of disk usage. Without this flag, the dashboard will not show disk usage. - container.Env = append(container.Env, corev1.EnvVar{Name: RAY_DASHBOARD_ENABLE_K8S_DISK_USAGE, Value: "1"}) + container.Env = append(container.Env, corev1.EnvVar{Name: utils.RAY_DASHBOARD_ENABLE_K8S_DISK_USAGE, Value: "1"}) } } @@ -683,7 +683,7 @@ func setMissingRayStartParams(rayStartParams map[string]string, nodeType rayv1.R // Add a metrics port to expose the metrics to Prometheus. 
if _, ok := rayStartParams["metrics-export-port"]; !ok { - rayStartParams["metrics-export-port"] = fmt.Sprint(DefaultMetricsPort) + rayStartParams["metrics-export-port"] = fmt.Sprint(utils.DefaultMetricsPort) } // Add --block option. See https://github.com/ray-project/kuberay/pull/675 @@ -691,8 +691,8 @@ func setMissingRayStartParams(rayStartParams map[string]string, nodeType rayv1.R // Add dashboard listen port for RayService. if _, ok := rayStartParams["dashboard-agent-listen-port"]; !ok { - if value, ok := annotations[EnableServeServiceKey]; ok && value == EnableServeServiceTrue { - rayStartParams["dashboard-agent-listen-port"] = strconv.Itoa(DefaultDashboardAgentListenPort) + if value, ok := annotations[utils.EnableServeServiceKey]; ok && value == utils.EnableServeServiceTrue { + rayStartParams["dashboard-agent-listen-port"] = strconv.Itoa(utils.DefaultDashboardAgentListenPort) } } diff --git a/ray-operator/controllers/ray/common/pod_test.go b/ray-operator/controllers/ray/common/pod_test.go index bbe2f0d493..28cecdd1a5 100644 --- a/ray-operator/controllers/ray/common/pod_test.go +++ b/ray-operator/controllers/ray/common/pod_test.go @@ -189,10 +189,10 @@ var autoscalerContainer = corev1.Container{ ImagePullPolicy: corev1.PullIfNotPresent, Env: []corev1.EnvVar{ { - Name: RAY_CLUSTER_NAME, + Name: utils.RAY_CLUSTER_NAME, ValueFrom: &corev1.EnvVarSource{ FieldRef: &corev1.ObjectFieldSelector{ - FieldPath: fmt.Sprintf("metadata.labels['%s']", RayClusterLabelKey), + FieldPath: fmt.Sprintf("metadata.labels['%s']", utils.RayClusterLabelKey), }, }, }, @@ -331,37 +331,37 @@ func TestBuildPod(t *testing.T) { cluster := instance.DeepCopy() // Test head pod - podName := strings.ToLower(cluster.Name + DashSymbol + string(rayv1.HeadNode) + DashSymbol + utils.FormatInt32(0)) + podName := strings.ToLower(cluster.Name + utils.DashSymbol + string(rayv1.HeadNode) + utils.DashSymbol + utils.FormatInt32(0)) podTemplateSpec := DefaultHeadPodTemplate(*cluster, cluster.Spec.HeadGroupSpec, podName, "6379") pod := BuildPod(podTemplateSpec, rayv1.HeadNode, cluster.Spec.HeadGroupSpec.RayStartParams, "6379", nil, "", "", false) // Check environment variables - rayContainer := pod.Spec.Containers[RayContainerIndex] - checkContainerEnv(t, rayContainer, RAY_ADDRESS, "127.0.0.1:6379") - checkContainerEnv(t, rayContainer, RAY_USAGE_STATS_KUBERAY_IN_USE, "1") - checkContainerEnv(t, rayContainer, RAY_CLUSTER_NAME, fmt.Sprintf("metadata.labels['%s']", RayClusterLabelKey)) - checkContainerEnv(t, rayContainer, RAY_DASHBOARD_ENABLE_K8S_DISK_USAGE, "1") - headRayStartCommandEnv := getEnvVar(rayContainer, KUBERAY_GEN_RAY_START_CMD) + rayContainer := pod.Spec.Containers[utils.RayContainerIndex] + checkContainerEnv(t, rayContainer, utils.RAY_ADDRESS, "127.0.0.1:6379") + checkContainerEnv(t, rayContainer, utils.RAY_USAGE_STATS_KUBERAY_IN_USE, "1") + checkContainerEnv(t, rayContainer, utils.RAY_CLUSTER_NAME, fmt.Sprintf("metadata.labels['%s']", utils.RayClusterLabelKey)) + checkContainerEnv(t, rayContainer, utils.RAY_DASHBOARD_ENABLE_K8S_DISK_USAGE, "1") + headRayStartCommandEnv := getEnvVar(rayContainer, utils.KUBERAY_GEN_RAY_START_CMD) assert.True(t, strings.Contains(headRayStartCommandEnv.Value, "ray start")) // In head, init container needs FQ_RAY_IP to create a self-signed certificate for its TLS authenticate. 
for _, initContainer := range pod.Spec.InitContainers { - checkContainerEnv(t, initContainer, FQ_RAY_IP, "raycluster-sample-head-svc.default.svc.cluster.local") - checkContainerEnv(t, rayContainer, RAY_IP, "raycluster-sample-head-svc") + checkContainerEnv(t, initContainer, utils.FQ_RAY_IP, "raycluster-sample-head-svc.default.svc.cluster.local") + checkContainerEnv(t, rayContainer, utils.RAY_IP, "raycluster-sample-head-svc") } // Check labels. - actualResult := pod.Labels[RayClusterLabelKey] + actualResult := pod.Labels[utils.RayClusterLabelKey] expectedResult := cluster.Name if !reflect.DeepEqual(expectedResult, actualResult) { t.Fatalf("Expected `%v` but got `%v`", expectedResult, actualResult) } - actualResult = pod.Labels[RayNodeTypeLabelKey] + actualResult = pod.Labels[utils.RayNodeTypeLabelKey] expectedResult = string(rayv1.HeadNode) if !reflect.DeepEqual(expectedResult, actualResult) { t.Fatalf("Expected `%v` but got `%v`", expectedResult, actualResult) } - actualResult = pod.Labels[RayNodeGroupLabelKey] + actualResult = pod.Labels[utils.RayNodeGroupLabelKey] expectedResult = "headgroup" if !reflect.DeepEqual(expectedResult, actualResult) { t.Fatalf("Expected `%v` but got `%v`", expectedResult, actualResult) @@ -383,19 +383,19 @@ func TestBuildPod(t *testing.T) { // testing worker pod worker := cluster.Spec.WorkerGroupSpecs[0] - podName = cluster.Name + DashSymbol + string(rayv1.WorkerNode) + DashSymbol + worker.GroupName + DashSymbol + utils.FormatInt32(0) + podName = cluster.Name + utils.DashSymbol + string(rayv1.WorkerNode) + utils.DashSymbol + worker.GroupName + utils.DashSymbol + utils.FormatInt32(0) fqdnRayIP := utils.GenerateFQDNServiceName(*cluster, cluster.Namespace) podTemplateSpec = DefaultWorkerPodTemplate(*cluster, worker, podName, fqdnRayIP, "6379") pod = BuildPod(podTemplateSpec, rayv1.WorkerNode, worker.RayStartParams, "6379", nil, "", fqdnRayIP, false) // Check environment variables - rayContainer = pod.Spec.Containers[RayContainerIndex] - checkContainerEnv(t, rayContainer, RAY_ADDRESS, "raycluster-sample-head-svc.default.svc.cluster.local:6379") - checkContainerEnv(t, rayContainer, FQ_RAY_IP, "raycluster-sample-head-svc.default.svc.cluster.local") - checkContainerEnv(t, rayContainer, RAY_IP, "raycluster-sample-head-svc") - checkContainerEnv(t, rayContainer, RAY_CLUSTER_NAME, fmt.Sprintf("metadata.labels['%s']", RayClusterLabelKey)) - checkContainerEnv(t, rayContainer, RAY_DASHBOARD_ENABLE_K8S_DISK_USAGE, "1") - workerRayStartCommandEnv := getEnvVar(rayContainer, KUBERAY_GEN_RAY_START_CMD) + rayContainer = pod.Spec.Containers[utils.RayContainerIndex] + checkContainerEnv(t, rayContainer, utils.RAY_ADDRESS, "raycluster-sample-head-svc.default.svc.cluster.local:6379") + checkContainerEnv(t, rayContainer, utils.FQ_RAY_IP, "raycluster-sample-head-svc.default.svc.cluster.local") + checkContainerEnv(t, rayContainer, utils.RAY_IP, "raycluster-sample-head-svc") + checkContainerEnv(t, rayContainer, utils.RAY_CLUSTER_NAME, fmt.Sprintf("metadata.labels['%s']", utils.RayClusterLabelKey)) + checkContainerEnv(t, rayContainer, utils.RAY_DASHBOARD_ENABLE_K8S_DISK_USAGE, "1") + workerRayStartCommandEnv := getEnvVar(rayContainer, utils.KUBERAY_GEN_RAY_START_CMD) assert.True(t, strings.Contains(workerRayStartCommandEnv.Value, "ray start")) expectedCommandArg := splitAndSort("ulimit -n 65536; ray start --block --memory=1073741824 --num-cpus=1 --num-gpus=3 --address=raycluster-sample-head-svc.default.svc.cluster.local:6379 --port=6379 --metrics-export-port=8080") @@ -405,14 +405,14 @@ 
func TestBuildPod(t *testing.T) { } // Check Envs - rayContainer = pod.Spec.Containers[RayContainerIndex] + rayContainer = pod.Spec.Containers[utils.RayContainerIndex] checkContainerEnv(t, rayContainer, "TEST_ENV_NAME", "TEST_ENV_VALUE") // Try to build pod for serve pod = BuildPod(podTemplateSpec, rayv1.HeadNode, cluster.Spec.HeadGroupSpec.RayStartParams, "6379", nil, "", "", true) - val, ok := pod.Labels[RayClusterServingServiceLabelKey] + val, ok := pod.Labels[utils.RayClusterServingServiceLabelKey] assert.True(t, ok, "Expected serve label is not present") - assert.Equal(t, EnableRayClusterServingServiceTrue, val, "Wrong serve label value") + assert.Equal(t, utils.EnableRayClusterServingServiceTrue, val, "Wrong serve label value") } func TestBuildPod_WithOverwriteCommand(t *testing.T) { @@ -420,26 +420,26 @@ func TestBuildPod_WithOverwriteCommand(t *testing.T) { cluster.Annotations = map[string]string{ // When the value of the annotation is "true", KubeRay will not generate the command and args for the container. // Instead, it will use the command and args specified by the use. - RayOverwriteContainerCmdAnnotationKey: "true", + utils.RayOverwriteContainerCmdAnnotationKey: "true", } - cluster.Spec.HeadGroupSpec.Template.Spec.Containers[RayContainerIndex].Command = []string{"I am head"} - cluster.Spec.HeadGroupSpec.Template.Spec.Containers[RayContainerIndex].Args = []string{"I am head again"} - cluster.Spec.WorkerGroupSpecs[0].Template.Spec.Containers[RayContainerIndex].Command = []string{"I am worker"} - cluster.Spec.WorkerGroupSpecs[0].Template.Spec.Containers[RayContainerIndex].Args = []string{"I am worker again"} + cluster.Spec.HeadGroupSpec.Template.Spec.Containers[utils.RayContainerIndex].Command = []string{"I am head"} + cluster.Spec.HeadGroupSpec.Template.Spec.Containers[utils.RayContainerIndex].Args = []string{"I am head again"} + cluster.Spec.WorkerGroupSpecs[0].Template.Spec.Containers[utils.RayContainerIndex].Command = []string{"I am worker"} + cluster.Spec.WorkerGroupSpecs[0].Template.Spec.Containers[utils.RayContainerIndex].Args = []string{"I am worker again"} - podName := strings.ToLower(cluster.Name + DashSymbol + string(rayv1.HeadNode) + DashSymbol + utils.FormatInt32(0)) + podName := strings.ToLower(cluster.Name + utils.DashSymbol + string(rayv1.HeadNode) + utils.DashSymbol + utils.FormatInt32(0)) podTemplateSpec := DefaultHeadPodTemplate(*cluster, cluster.Spec.HeadGroupSpec, podName, "6379") headPod := BuildPod(podTemplateSpec, rayv1.HeadNode, cluster.Spec.HeadGroupSpec.RayStartParams, "6379", nil, "", "", false) - headContainer := headPod.Spec.Containers[RayContainerIndex] + headContainer := headPod.Spec.Containers[utils.RayContainerIndex] assert.Equal(t, headContainer.Command, []string{"I am head"}) assert.Equal(t, headContainer.Args, []string{"I am head again"}) worker := cluster.Spec.WorkerGroupSpecs[0] - podName = cluster.Name + DashSymbol + string(rayv1.WorkerNode) + DashSymbol + worker.GroupName + DashSymbol + utils.FormatInt32(0) + podName = cluster.Name + utils.DashSymbol + string(rayv1.WorkerNode) + utils.DashSymbol + worker.GroupName + utils.DashSymbol + utils.FormatInt32(0) fqdnRayIP := utils.GenerateFQDNServiceName(*cluster, cluster.Namespace) podTemplateSpec = DefaultWorkerPodTemplate(*cluster, worker, podName, fqdnRayIP, "6379") workerPod := BuildPod(podTemplateSpec, rayv1.WorkerNode, worker.RayStartParams, "6379", nil, "", fqdnRayIP, false) - workerContainer := workerPod.Spec.Containers[RayContainerIndex] + workerContainer := 
workerPod.Spec.Containers[utils.RayContainerIndex] assert.Equal(t, workerContainer.Command, []string{"I am worker"}) assert.Equal(t, workerContainer.Args, []string{"I am worker again"}) } @@ -447,21 +447,21 @@ func TestBuildPod_WithOverwriteCommand(t *testing.T) { func TestBuildPod_WithAutoscalerEnabled(t *testing.T) { cluster := instance.DeepCopy() cluster.Spec.EnableInTreeAutoscaling = &trueFlag - podName := strings.ToLower(cluster.Name + DashSymbol + string(rayv1.HeadNode) + DashSymbol + utils.FormatInt32(0)) + podName := strings.ToLower(cluster.Name + utils.DashSymbol + string(rayv1.HeadNode) + utils.DashSymbol + utils.FormatInt32(0)) podTemplateSpec := DefaultHeadPodTemplate(*cluster, cluster.Spec.HeadGroupSpec, podName, "6379") pod := BuildPod(podTemplateSpec, rayv1.HeadNode, cluster.Spec.HeadGroupSpec.RayStartParams, "6379", &trueFlag, "", "", false) - actualResult := pod.Labels[RayClusterLabelKey] + actualResult := pod.Labels[utils.RayClusterLabelKey] expectedResult := cluster.Name if !reflect.DeepEqual(expectedResult, actualResult) { t.Fatalf("Expected `%v` but got `%v`", expectedResult, actualResult) } - actualResult = pod.Labels[RayNodeTypeLabelKey] + actualResult = pod.Labels[utils.RayNodeTypeLabelKey] expectedResult = string(rayv1.HeadNode) if !reflect.DeepEqual(expectedResult, actualResult) { t.Fatalf("Expected `%v` but got `%v`", expectedResult, actualResult) } - actualResult = pod.Labels[RayNodeGroupLabelKey] + actualResult = pod.Labels[utils.RayNodeGroupLabelKey] expectedResult = "headgroup" if !reflect.DeepEqual(expectedResult, actualResult) { t.Fatalf("Expected `%v` but got `%v`", expectedResult, actualResult) @@ -502,9 +502,9 @@ func TestBuildPod_WithAutoscalerEnabled(t *testing.T) { func TestBuildPod_WithCreatedByRayService(t *testing.T) { cluster := instance.DeepCopy() cluster.Spec.EnableInTreeAutoscaling = &trueFlag - podName := strings.ToLower(cluster.Name + DashSymbol + string(rayv1.HeadNode) + DashSymbol + utils.FormatInt32(0)) + podName := strings.ToLower(cluster.Name + utils.DashSymbol + string(rayv1.HeadNode) + utils.DashSymbol + utils.FormatInt32(0)) podTemplateSpec := DefaultHeadPodTemplate(*cluster, cluster.Spec.HeadGroupSpec, podName, "6379") - pod := BuildPod(podTemplateSpec, rayv1.HeadNode, cluster.Spec.HeadGroupSpec.RayStartParams, "6379", &trueFlag, RayServiceCreatorLabelValue, "", false) + pod := BuildPod(podTemplateSpec, rayv1.HeadNode, cluster.Spec.HeadGroupSpec.RayStartParams, "6379", &trueFlag, utils.RayServiceCreatorLabelValue, "", false) hasCorrectDeathEnv := false for _, container := range pod.Spec.Containers { @@ -515,7 +515,7 @@ func TestBuildPod_WithCreatedByRayService(t *testing.T) { t.Fatalf("Expected death env `%v`", container) } for _, env := range container.Env { - if env.Name == RAY_TIMEOUT_MS_TASK_WAIT_FOR_DEATH_INFO { + if env.Name == utils.RAY_TIMEOUT_MS_TASK_WAIT_FOR_DEATH_INFO { assert.Equal(t, "0", env.Value) hasCorrectDeathEnv = true break @@ -529,76 +529,76 @@ func TestBuildPod_WithGcsFtEnabled(t *testing.T) { // Test 1 cluster := instance.DeepCopy() cluster.Annotations = map[string]string{ - RayFTEnabledAnnotationKey: "true", + utils.RayFTEnabledAnnotationKey: "true", } // Build a head Pod. 
- podName := strings.ToLower(cluster.Name + DashSymbol + string(rayv1.HeadNode) + DashSymbol + utils.FormatInt32(0)) + podName := strings.ToLower(cluster.Name + utils.DashSymbol + string(rayv1.HeadNode) + utils.DashSymbol + utils.FormatInt32(0)) podTemplateSpec := DefaultHeadPodTemplate(*cluster, cluster.Spec.HeadGroupSpec, podName, "6379") pod := BuildPod(podTemplateSpec, rayv1.HeadNode, cluster.Spec.HeadGroupSpec.RayStartParams, "6379", nil, "", "", false) // Check environment variable "RAY_GCS_RPC_SERVER_RECONNECT_TIMEOUT_S" - rayContainer := pod.Spec.Containers[RayContainerIndex] + rayContainer := pod.Spec.Containers[utils.RayContainerIndex] // "RAY_GCS_RPC_SERVER_RECONNECT_TIMEOUT_S" should not be set on the head Pod by default - assert.True(t, !envVarExists(RAY_GCS_RPC_SERVER_RECONNECT_TIMEOUT_S, rayContainer.Env)) + assert.True(t, !envVarExists(utils.RAY_GCS_RPC_SERVER_RECONNECT_TIMEOUT_S, rayContainer.Env)) // Test 2 cluster = instance.DeepCopy() cluster.Annotations = map[string]string{ - RayFTEnabledAnnotationKey: "true", + utils.RayFTEnabledAnnotationKey: "true", } // Add "RAY_GCS_RPC_SERVER_RECONNECT_TIMEOUT_S" env var in the head group spec. - cluster.Spec.HeadGroupSpec.Template.Spec.Containers[RayContainerIndex].Env = append(cluster.Spec.HeadGroupSpec.Template.Spec.Containers[RayContainerIndex].Env, - corev1.EnvVar{Name: RAY_GCS_RPC_SERVER_RECONNECT_TIMEOUT_S, Value: "60"}) + cluster.Spec.HeadGroupSpec.Template.Spec.Containers[utils.RayContainerIndex].Env = append(cluster.Spec.HeadGroupSpec.Template.Spec.Containers[utils.RayContainerIndex].Env, + corev1.EnvVar{Name: utils.RAY_GCS_RPC_SERVER_RECONNECT_TIMEOUT_S, Value: "60"}) podTemplateSpec = DefaultHeadPodTemplate(*cluster, cluster.Spec.HeadGroupSpec, podName, "6379") pod = BuildPod(podTemplateSpec, rayv1.HeadNode, cluster.Spec.HeadGroupSpec.RayStartParams, "6379", nil, "", "", false) - rayContainer = pod.Spec.Containers[RayContainerIndex] + rayContainer = pod.Spec.Containers[utils.RayContainerIndex] // Check environment variable "RAY_GCS_RPC_SERVER_RECONNECT_TIMEOUT_S" - checkContainerEnv(t, rayContainer, RAY_GCS_RPC_SERVER_RECONNECT_TIMEOUT_S, "60") + checkContainerEnv(t, rayContainer, utils.RAY_GCS_RPC_SERVER_RECONNECT_TIMEOUT_S, "60") // Test 3 cluster = instance.DeepCopy() cluster.Annotations = map[string]string{ - RayFTEnabledAnnotationKey: "true", + utils.RayFTEnabledAnnotationKey: "true", } // Build a worker pod worker := cluster.Spec.WorkerGroupSpecs[0] - podName = cluster.Name + DashSymbol + string(rayv1.WorkerNode) + DashSymbol + worker.GroupName + DashSymbol + utils.FormatInt32(0) + podName = cluster.Name + utils.DashSymbol + string(rayv1.WorkerNode) + utils.DashSymbol + worker.GroupName + utils.DashSymbol + utils.FormatInt32(0) fqdnRayIP := utils.GenerateFQDNServiceName(*cluster, cluster.Namespace) podTemplateSpec = DefaultWorkerPodTemplate(*cluster, worker, podName, fqdnRayIP, "6379") pod = BuildPod(podTemplateSpec, rayv1.WorkerNode, worker.RayStartParams, "6379", nil, "", fqdnRayIP, false) // Check the default value of "RAY_GCS_RPC_SERVER_RECONNECT_TIMEOUT_S" - rayContainer = pod.Spec.Containers[RayContainerIndex] - checkContainerEnv(t, rayContainer, RAY_GCS_RPC_SERVER_RECONNECT_TIMEOUT_S, DefaultWorkerRayGcsReconnectTimeoutS) + rayContainer = pod.Spec.Containers[utils.RayContainerIndex] + checkContainerEnv(t, rayContainer, utils.RAY_GCS_RPC_SERVER_RECONNECT_TIMEOUT_S, utils.DefaultWorkerRayGcsReconnectTimeoutS) // Test 4 cluster = instance.DeepCopy() cluster.Annotations = map[string]string{ - 
RayFTEnabledAnnotationKey: "true", + utils.RayFTEnabledAnnotationKey: "true", } // Add "RAY_GCS_RPC_SERVER_RECONNECT_TIMEOUT_S" env var in the worker group spec. - cluster.Spec.WorkerGroupSpecs[0].Template.Spec.Containers[RayContainerIndex].Env = append(cluster.Spec.WorkerGroupSpecs[0].Template.Spec.Containers[RayContainerIndex].Env, - corev1.EnvVar{Name: RAY_GCS_RPC_SERVER_RECONNECT_TIMEOUT_S, Value: "120"}) + cluster.Spec.WorkerGroupSpecs[0].Template.Spec.Containers[utils.RayContainerIndex].Env = append(cluster.Spec.WorkerGroupSpecs[0].Template.Spec.Containers[utils.RayContainerIndex].Env, + corev1.EnvVar{Name: utils.RAY_GCS_RPC_SERVER_RECONNECT_TIMEOUT_S, Value: "120"}) worker = cluster.Spec.WorkerGroupSpecs[0] podTemplateSpec = DefaultWorkerPodTemplate(*cluster, worker, podName, fqdnRayIP, "6379") pod = BuildPod(podTemplateSpec, rayv1.WorkerNode, worker.RayStartParams, "6379", nil, "", fqdnRayIP, false) // Check the default value of "RAY_GCS_RPC_SERVER_RECONNECT_TIMEOUT_S" - rayContainer = pod.Spec.Containers[RayContainerIndex] - checkContainerEnv(t, rayContainer, RAY_GCS_RPC_SERVER_RECONNECT_TIMEOUT_S, "120") + rayContainer = pod.Spec.Containers[utils.RayContainerIndex] + checkContainerEnv(t, rayContainer, utils.RAY_GCS_RPC_SERVER_RECONNECT_TIMEOUT_S, "120") } // Check that autoscaler container overrides work as expected. func TestBuildPodWithAutoscalerOptions(t *testing.T) { cluster := instance.DeepCopy() cluster.Spec.EnableInTreeAutoscaling = &trueFlag - podName := strings.ToLower(cluster.Name + DashSymbol + string(rayv1.HeadNode) + DashSymbol + utils.FormatInt32(0)) + podName := strings.ToLower(cluster.Name + utils.DashSymbol + string(rayv1.HeadNode) + utils.DashSymbol + utils.FormatInt32(0)) customAutoscalerImage := "custom-autoscaler-xxx" customPullPolicy := corev1.PullIfNotPresent @@ -675,7 +675,7 @@ func TestBuildPodWithAutoscalerOptions(t *testing.T) { func TestHeadPodTemplate_WithAutoscalingEnabled(t *testing.T) { cluster := instance.DeepCopy() cluster.Spec.EnableInTreeAutoscaling = &trueFlag - podName := strings.ToLower(cluster.Name + DashSymbol + string(rayv1.HeadNode) + DashSymbol + utils.FormatInt32(0)) + podName := strings.ToLower(cluster.Name + utils.DashSymbol + string(rayv1.HeadNode) + utils.DashSymbol + utils.FormatInt32(0)) podTemplateSpec := DefaultHeadPodTemplate(*cluster, cluster.Spec.HeadGroupSpec, podName, "6379") // autoscaler container is injected into head pod @@ -706,10 +706,10 @@ func TestHeadPodTemplate_AutoscalerImage(t *testing.T) { cluster := instance.DeepCopy() cluster.Spec.EnableInTreeAutoscaling = &trueFlag cluster.Spec.AutoscalerOptions = nil - podName := strings.ToLower(cluster.Name + DashSymbol + string(rayv1.HeadNode) + DashSymbol + utils.FormatInt32(0)) + podName := strings.ToLower(cluster.Name + utils.DashSymbol + string(rayv1.HeadNode) + utils.DashSymbol + utils.FormatInt32(0)) // Case 1: If `AutoscalerOptions.Image` is not set, the Autoscaler container should use the Ray head container's image by default. - expectedAutoscalerImage := cluster.Spec.HeadGroupSpec.Template.Spec.Containers[RayContainerIndex].Image + expectedAutoscalerImage := cluster.Spec.HeadGroupSpec.Template.Spec.Containers[utils.RayContainerIndex].Image podTemplateSpec := DefaultHeadPodTemplate(*cluster, cluster.Spec.HeadGroupSpec, podName, "6379") pod := corev1.Pod{ Spec: podTemplateSpec.Spec, @@ -734,7 +734,7 @@ func TestHeadPodTemplate_AutoscalerImage(t *testing.T) { // the head pod's service account should be an empty string. 
func TestHeadPodTemplate_WithNoServiceAccount(t *testing.T) { cluster := instance.DeepCopy() - podName := strings.ToLower(cluster.Name + DashSymbol + string(rayv1.HeadNode) + DashSymbol + utils.FormatInt32(0)) + podName := strings.ToLower(cluster.Name + utils.DashSymbol + string(rayv1.HeadNode) + utils.DashSymbol + utils.FormatInt32(0)) pod := DefaultHeadPodTemplate(*cluster, cluster.Spec.HeadGroupSpec, podName, "6379") actualResult := pod.Spec.ServiceAccountName @@ -750,7 +750,7 @@ func TestHeadPodTemplate_WithServiceAccountNoAutoscaling(t *testing.T) { cluster := instance.DeepCopy() serviceAccount := "head-service-account" cluster.Spec.HeadGroupSpec.Template.Spec.ServiceAccountName = serviceAccount - podName := strings.ToLower(cluster.Name + DashSymbol + string(rayv1.HeadNode) + DashSymbol + utils.FormatInt32(0)) + podName := strings.ToLower(cluster.Name + utils.DashSymbol + string(rayv1.HeadNode) + utils.DashSymbol + utils.FormatInt32(0)) pod := DefaultHeadPodTemplate(*cluster, cluster.Spec.HeadGroupSpec, podName, "6379") actualResult := pod.Spec.ServiceAccountName @@ -767,7 +767,7 @@ func TestHeadPodTemplate_WithServiceAccount(t *testing.T) { serviceAccount := "head-service-account" cluster.Spec.HeadGroupSpec.Template.Spec.ServiceAccountName = serviceAccount cluster.Spec.EnableInTreeAutoscaling = &trueFlag - podName := strings.ToLower(cluster.Name + DashSymbol + string(rayv1.HeadNode) + DashSymbol + utils.FormatInt32(0)) + podName := strings.ToLower(cluster.Name + utils.DashSymbol + string(rayv1.HeadNode) + utils.DashSymbol + utils.FormatInt32(0)) pod := DefaultHeadPodTemplate(*cluster, cluster.Spec.HeadGroupSpec, podName, "6379") actualResult := pod.Spec.ServiceAccountName @@ -823,7 +823,7 @@ func TestCleanupInvalidVolumeMounts(t *testing.T) { cluster := instance.DeepCopy() // Test head pod - podName := strings.ToLower(cluster.Name + DashSymbol + string(rayv1.HeadNode) + DashSymbol + utils.FormatInt32(0)) + podName := strings.ToLower(cluster.Name + utils.DashSymbol + string(rayv1.HeadNode) + utils.DashSymbol + utils.FormatInt32(0)) podTemplateSpec := DefaultHeadPodTemplate(*cluster, cluster.Spec.HeadGroupSpec, podName, "6379") pod := BuildPod(podTemplateSpec, rayv1.HeadNode, cluster.Spec.HeadGroupSpec.RayStartParams, "6379", nil, "", "", false) @@ -849,7 +849,7 @@ func TestDefaultWorkerPodTemplateWithName(t *testing.T) { fqdnRayIP := utils.GenerateFQDNServiceName(*cluster, cluster.Namespace) worker := cluster.Spec.WorkerGroupSpecs[0] worker.Template.ObjectMeta.Name = "ray-worker-test" - podName := cluster.Name + DashSymbol + string(rayv1.WorkerNode) + DashSymbol + worker.GroupName + DashSymbol + utils.FormatInt32(0) + podName := cluster.Name + utils.DashSymbol + string(rayv1.WorkerNode) + utils.DashSymbol + worker.GroupName + utils.DashSymbol + utils.FormatInt32(0) expectedWorker := *worker.DeepCopy() // Pass a deep copy of worker (*worker.DeepCopy()) to prevent "worker" from updating. 
@@ -873,22 +873,22 @@ func containerPortExists(ports []corev1.ContainerPort, name string, containerPor func TestDefaultHeadPodTemplateWithConfigurablePorts(t *testing.T) { cluster := instance.DeepCopy() cluster.Spec.HeadGroupSpec.Template.Spec.Containers[0].Ports = []corev1.ContainerPort{} - podName := strings.ToLower(cluster.Name + DashSymbol + string(rayv1.HeadNode) + DashSymbol + utils.FormatInt32(0)) + podName := strings.ToLower(cluster.Name + utils.DashSymbol + string(rayv1.HeadNode) + utils.DashSymbol + utils.FormatInt32(0)) podTemplateSpec := DefaultHeadPodTemplate(*cluster, cluster.Spec.HeadGroupSpec, podName, "6379") // DefaultHeadPodTemplate will add the default metrics port if user doesn't specify it. // Verify the default metrics port exists. - if err := containerPortExists(podTemplateSpec.Spec.Containers[0].Ports, MetricsPortName, int32(DefaultMetricsPort)); err != nil { + if err := containerPortExists(podTemplateSpec.Spec.Containers[0].Ports, utils.MetricsPortName, int32(utils.DefaultMetricsPort)); err != nil { t.Fatal(err) } - customMetricsPort := int32(DefaultMetricsPort) + 1 + customMetricsPort := int32(utils.DefaultMetricsPort) + 1 metricsPort := corev1.ContainerPort{ - Name: MetricsPortName, + Name: utils.MetricsPortName, ContainerPort: customMetricsPort, } cluster.Spec.HeadGroupSpec.Template.Spec.Containers[0].Ports = []corev1.ContainerPort{metricsPort} podTemplateSpec = DefaultHeadPodTemplate(*cluster, cluster.Spec.HeadGroupSpec, podName, "6379") // Verify the custom metrics port exists. - if err := containerPortExists(podTemplateSpec.Spec.Containers[0].Ports, MetricsPortName, customMetricsPort); err != nil { + if err := containerPortExists(podTemplateSpec.Spec.Containers[0].Ports, utils.MetricsPortName, customMetricsPort); err != nil { t.Fatal(err) } } @@ -897,23 +897,23 @@ func TestDefaultWorkerPodTemplateWithConfigurablePorts(t *testing.T) { cluster := instance.DeepCopy() cluster.Spec.WorkerGroupSpecs[0].Template.Spec.Containers[0].Ports = []corev1.ContainerPort{} worker := cluster.Spec.WorkerGroupSpecs[0] - podName := cluster.Name + DashSymbol + string(rayv1.WorkerNode) + DashSymbol + worker.GroupName + DashSymbol + utils.FormatInt32(0) + podName := cluster.Name + utils.DashSymbol + string(rayv1.WorkerNode) + utils.DashSymbol + worker.GroupName + utils.DashSymbol + utils.FormatInt32(0) fqdnRayIP := utils.GenerateFQDNServiceName(*cluster, cluster.Namespace) podTemplateSpec := DefaultWorkerPodTemplate(*cluster, worker, podName, fqdnRayIP, "6379") // DefaultWorkerPodTemplate will add the default metrics port if user doesn't specify it. // Verify the default metrics port exists. - if err := containerPortExists(podTemplateSpec.Spec.Containers[0].Ports, MetricsPortName, int32(DefaultMetricsPort)); err != nil { + if err := containerPortExists(podTemplateSpec.Spec.Containers[0].Ports, utils.MetricsPortName, int32(utils.DefaultMetricsPort)); err != nil { t.Fatal(err) } - customMetricsPort := int32(DefaultMetricsPort) + 1 + customMetricsPort := int32(utils.DefaultMetricsPort) + 1 metricsPort := corev1.ContainerPort{ - Name: MetricsPortName, + Name: utils.MetricsPortName, ContainerPort: customMetricsPort, } cluster.Spec.WorkerGroupSpecs[0].Template.Spec.Containers[0].Ports = []corev1.ContainerPort{metricsPort} podTemplateSpec = DefaultWorkerPodTemplate(*cluster, worker, podName, fqdnRayIP, "6379") // Verify the custom metrics port exists. 
- if err := containerPortExists(podTemplateSpec.Spec.Containers[0].Ports, MetricsPortName, customMetricsPort); err != nil { + if err := containerPortExists(podTemplateSpec.Spec.Containers[0].Ports, utils.MetricsPortName, customMetricsPort); err != nil { t.Fatal(err) } } @@ -923,7 +923,7 @@ func TestDefaultInitContainer(t *testing.T) { cluster := instance.DeepCopy() fqdnRayIP := utils.GenerateFQDNServiceName(*cluster, cluster.Namespace) worker := cluster.Spec.WorkerGroupSpecs[0] - podName := cluster.Name + DashSymbol + string(rayv1.WorkerNode) + DashSymbol + worker.GroupName + DashSymbol + utils.FormatInt32(0) + podName := cluster.Name + utils.DashSymbol + string(rayv1.WorkerNode) + utils.DashSymbol + worker.GroupName + utils.DashSymbol + utils.FormatInt32(0) expectedResult := len(cluster.Spec.WorkerGroupSpecs[0].Template.Spec.InitContainers) + 1 // Pass a deep copy of worker (*worker.DeepCopy()) to prevent "worker" from updating. @@ -935,7 +935,7 @@ func TestDefaultInitContainer(t *testing.T) { // with the Ray head using TLS authentication. Currently, we simply copied all environment variables from // Ray container to the init container. This may be changed in the future. healthCheckContainer := podTemplateSpec.Spec.InitContainers[numInitContainers-1] - rayContainer := worker.Template.Spec.Containers[RayContainerIndex] + rayContainer := worker.Template.Spec.Containers[utils.RayContainerIndex] assert.NotEqual(t, len(rayContainer.Env), 0, "The test only makes sense if the Ray container has environment variables.") assert.Equal(t, len(rayContainer.Env), len(healthCheckContainer.Env)) @@ -955,7 +955,7 @@ func TestDefaultInitContainerImagePullPolicy(t *testing.T) { cluster := instance.DeepCopy() fqdnRayIP := utils.GenerateFQDNServiceName(*cluster, cluster.Namespace) worker := cluster.Spec.WorkerGroupSpecs[0] - podName := cluster.Name + DashSymbol + string(rayv1.WorkerNode) + DashSymbol + worker.GroupName + DashSymbol + utils.FormatInt32(0) + podName := cluster.Name + utils.DashSymbol + string(rayv1.WorkerNode) + utils.DashSymbol + worker.GroupName + utils.DashSymbol + utils.FormatInt32(0) cases := []struct { name string @@ -982,7 +982,7 @@ func TestDefaultInitContainerImagePullPolicy(t *testing.T) { for _, tc := range cases { t.Run(tc.name, func(t *testing.T) { // set ray container imagePullPolicy - worker.Template.Spec.Containers[RayContainerIndex].ImagePullPolicy = tc.imagePullPolicy + worker.Template.Spec.Containers[utils.RayContainerIndex].ImagePullPolicy = tc.imagePullPolicy podTemplateSpec := DefaultWorkerPodTemplate(*cluster, *worker.DeepCopy(), podName, fqdnRayIP, "6379") @@ -1030,12 +1030,12 @@ func TestSetMissingRayStartParamsMetricsExportPort(t *testing.T) { headPort := "6379" fqdnRayIP := "raycluster-kuberay-head-svc.default.svc.cluster.local" - customMetricsPort := DefaultMetricsPort + 1 + customMetricsPort := utils.DefaultMetricsPort + 1 // Case 1: Head node with no metrics-export-port option set. 
rayStartParams := map[string]string{} rayStartParams = setMissingRayStartParams(rayStartParams, rayv1.HeadNode, headPort, "", nil) - assert.Equal(t, fmt.Sprint(DefaultMetricsPort), rayStartParams["metrics-export-port"], fmt.Sprintf("Expected `%v` but got `%v`", fmt.Sprint(DefaultMetricsPort), rayStartParams["metrics-export-port"])) + assert.Equal(t, fmt.Sprint(utils.DefaultMetricsPort), rayStartParams["metrics-export-port"], fmt.Sprintf("Expected `%v` but got `%v`", fmt.Sprint(utils.DefaultMetricsPort), rayStartParams["metrics-export-port"])) // Case 2: Head node with custom metrics-export-port option set. rayStartParams = map[string]string{"metrics-export-port": fmt.Sprint(customMetricsPort)} @@ -1045,7 +1045,7 @@ func TestSetMissingRayStartParamsMetricsExportPort(t *testing.T) { // Case 3: Worker node with no metrics-export-port option set. rayStartParams = map[string]string{} rayStartParams = setMissingRayStartParams(rayStartParams, rayv1.WorkerNode, headPort, fqdnRayIP, nil) - assert.Equal(t, fmt.Sprint(DefaultMetricsPort), rayStartParams["metrics-export-port"], fmt.Sprintf("Expected `%v` but got `%v`", fmt.Sprint(DefaultMetricsPort), rayStartParams["metrics-export-port"])) + assert.Equal(t, fmt.Sprint(utils.DefaultMetricsPort), rayStartParams["metrics-export-port"], fmt.Sprintf("Expected `%v` but got `%v`", fmt.Sprint(utils.DefaultMetricsPort), rayStartParams["metrics-export-port"])) // Case 4: Worker node with custom metrics-export-port option set. rayStartParams = map[string]string{"metrics-export-port": fmt.Sprint(customMetricsPort)} @@ -1128,16 +1128,16 @@ func TestSetMissingRayStartParamsAgentListenPort(t *testing.T) { assert.NotContains(t, rayStartParams, "dashboard-agent-listen-port", "Worker Pod should not have a dashboard-agent-listen-port option set by default.") // Case 3: Head node with "ray.io/enable-serve-service=true" and users do not provide "dashboard-agent-listen-port". - annotaions = map[string]string{EnableServeServiceKey: EnableServeServiceTrue} + annotaions = map[string]string{utils.EnableServeServiceKey: utils.EnableServeServiceTrue} rayStartParams = setMissingRayStartParams(rayStartParams, rayv1.HeadNode, headPort, "", annotaions) - assert.Equal(t, fmt.Sprint(DefaultDashboardAgentListenPort), rayStartParams["dashboard-agent-listen-port"], fmt.Sprintf("Expected `%v` but got `%v`", fmt.Sprint(DefaultDashboardAgentListenPort), rayStartParams["dashboard-agent-listen-port"])) + assert.Equal(t, fmt.Sprint(utils.DefaultDashboardAgentListenPort), rayStartParams["dashboard-agent-listen-port"], fmt.Sprintf("Expected `%v` but got `%v`", fmt.Sprint(utils.DefaultDashboardAgentListenPort), rayStartParams["dashboard-agent-listen-port"])) // Case 4: Worker node with "ray.io/enable-serve-service=true" and users do not provide "dashboard-agent-listen-port". rayStartParams = setMissingRayStartParams(rayStartParams, rayv1.WorkerNode, headPort, fqdnRayIP, annotaions) - assert.Equal(t, fmt.Sprint(DefaultDashboardAgentListenPort), rayStartParams["dashboard-agent-listen-port"], fmt.Sprintf("Expected `%v` but got `%v`", fmt.Sprint(DefaultDashboardAgentListenPort), rayStartParams["dashboard-agent-listen-port"])) + assert.Equal(t, fmt.Sprint(utils.DefaultDashboardAgentListenPort), rayStartParams["dashboard-agent-listen-port"], fmt.Sprintf("Expected `%v` but got `%v`", fmt.Sprint(utils.DefaultDashboardAgentListenPort), rayStartParams["dashboard-agent-listen-port"])) // Case 5: Head node with "ray.io/enable-serve-service=true" and users provide "dashboard-agent-listen-port". 
- customDashboardAgentListenPort := DefaultDashboardAgentListenPort + 1 + customDashboardAgentListenPort := utils.DefaultDashboardAgentListenPort + 1 rayStartParams = map[string]string{"dashboard-agent-listen-port": fmt.Sprint(customDashboardAgentListenPort)} rayStartParams = setMissingRayStartParams(rayStartParams, rayv1.HeadNode, headPort, "", annotaions) assert.Equal(t, fmt.Sprint(customDashboardAgentListenPort), rayStartParams["dashboard-agent-listen-port"], fmt.Sprintf("Expected `%v` but got `%v`", fmt.Sprint(customDashboardAgentListenPort), rayStartParams["dashboard-agent-listen-port"])) @@ -1175,26 +1175,26 @@ func TestGetCustomWorkerInitImage(t *testing.T) { func TestGetEnableProbesInjection(t *testing.T) { // cleanup - defer os.Unsetenv(ENABLE_PROBES_INJECTION) + defer os.Unsetenv(utils.ENABLE_PROBES_INJECTION) // not set the env - os.Unsetenv(ENABLE_PROBES_INJECTION) + os.Unsetenv(utils.ENABLE_PROBES_INJECTION) b := getEnableProbesInjection() assert.True(t, b) // set the env with "true" - os.Setenv(ENABLE_PROBES_INJECTION, "true") + os.Setenv(utils.ENABLE_PROBES_INJECTION, "true") b = getEnableProbesInjection() assert.True(t, b) // set the env with "True" - os.Setenv(ENABLE_PROBES_INJECTION, "True") + os.Setenv(utils.ENABLE_PROBES_INJECTION, "True") b = getEnableProbesInjection() assert.True(t, b) // set the env with "false" - os.Setenv(ENABLE_PROBES_INJECTION, "false") + os.Setenv(utils.ENABLE_PROBES_INJECTION, "false") b = getEnableProbesInjection() assert.False(t, b) // set the env with "False" - os.Setenv(ENABLE_PROBES_INJECTION, "False") + os.Setenv(utils.ENABLE_PROBES_INJECTION, "False") b = getEnableProbesInjection() assert.False(t, b) } @@ -1205,8 +1205,8 @@ func TestInitHealthProbe(t *testing.T) { ProbeHandler: corev1.ProbeHandler{ HTTPGet: &corev1.HTTPGetAction{ // Check Raylet status - Path: fmt.Sprintf("/%s", RayAgentRayletHealthPath), - Port: intstr.FromInt(DefaultDashboardAgentListenPort), + Path: fmt.Sprintf("/%s", utils.RayAgentRayletHealthPath), + Port: intstr.FromInt(utils.DefaultDashboardAgentListenPort), }, }, } diff --git a/ray-operator/controllers/ray/common/rbac.go b/ray-operator/controllers/ray/common/rbac.go index 5c07197d01..4545764186 100644 --- a/ray-operator/controllers/ray/common/rbac.go +++ b/ray-operator/controllers/ray/common/rbac.go @@ -15,9 +15,9 @@ func BuildServiceAccount(cluster *rayv1.RayCluster) (*corev1.ServiceAccount, err Name: utils.GetHeadGroupServiceAccountName(cluster), Namespace: cluster.Namespace, Labels: map[string]string{ - RayClusterLabelKey: cluster.Name, - KubernetesApplicationNameLabelKey: ApplicationName, - KubernetesCreatedByLabelKey: ComponentName, + utils.RayClusterLabelKey: cluster.Name, + utils.KubernetesApplicationNameLabelKey: utils.ApplicationName, + utils.KubernetesCreatedByLabelKey: utils.ComponentName, }, }, } @@ -32,9 +32,9 @@ func BuildRole(cluster *rayv1.RayCluster) (*rbacv1.Role, error) { Name: cluster.Name, Namespace: cluster.Namespace, Labels: map[string]string{ - RayClusterLabelKey: cluster.Name, - KubernetesApplicationNameLabelKey: ApplicationName, - KubernetesCreatedByLabelKey: ComponentName, + utils.RayClusterLabelKey: cluster.Name, + utils.KubernetesApplicationNameLabelKey: utils.ApplicationName, + utils.KubernetesCreatedByLabelKey: utils.ComponentName, }, }, Rules: []rbacv1.PolicyRule{ @@ -62,9 +62,9 @@ func BuildRoleBinding(cluster *rayv1.RayCluster) (*rbacv1.RoleBinding, error) { Name: cluster.Name, Namespace: cluster.Namespace, Labels: map[string]string{ - RayClusterLabelKey: cluster.Name, - 
KubernetesApplicationNameLabelKey: ApplicationName, - KubernetesCreatedByLabelKey: ComponentName, + utils.RayClusterLabelKey: cluster.Name, + utils.KubernetesApplicationNameLabelKey: utils.ApplicationName, + utils.KubernetesCreatedByLabelKey: utils.ComponentName, }, }, Subjects: []rbacv1.Subject{ diff --git a/ray-operator/controllers/ray/common/route.go b/ray-operator/controllers/ray/common/route.go index 39c854db98..b0142bcc48 100644 --- a/ray-operator/controllers/ray/common/route.go +++ b/ray-operator/controllers/ray/common/route.go @@ -13,10 +13,10 @@ import ( // This is used to expose dashboard and remote submit service apis or external traffic. func BuildRouteForHeadService(cluster rayv1.RayCluster) (*routev1.Route, error) { labels := map[string]string{ - RayClusterLabelKey: cluster.Name, - RayIDLabelKey: utils.GenerateIdentifier(cluster.Name, rayv1.HeadNode), - KubernetesApplicationNameLabelKey: ApplicationName, - KubernetesCreatedByLabelKey: ComponentName, + utils.RayClusterLabelKey: cluster.Name, + utils.RayIDLabelKey: utils.GenerateIdentifier(cluster.Name, rayv1.HeadNode), + utils.KubernetesApplicationNameLabelKey: utils.ApplicationName, + utils.KubernetesCreatedByLabelKey: utils.ComponentName, } // Copy other configurations from cluster annotations to provide a generic way @@ -27,7 +27,7 @@ func BuildRouteForHeadService(cluster rayv1.RayCluster) (*routev1.Route, error) } servicePorts := getServicePorts(cluster) - dashboardPort := DefaultDashboardPort + dashboardPort := utils.DefaultDashboardPort if port, ok := servicePorts["dashboard"]; ok { dashboardPort = int(port) } @@ -77,8 +77,8 @@ func BuildRouteForRayService(service rayv1.RayService, cluster rayv1.RayCluster) } route.ObjectMeta.Name = serviceName route.ObjectMeta.Namespace = service.Namespace - route.ObjectMeta.Labels[RayServiceLabelKey] = service.Name - route.ObjectMeta.Labels[RayIDLabelKey] = utils.CheckLabel(utils.GenerateIdentifier(service.Name, rayv1.HeadNode)) + route.ObjectMeta.Labels[utils.RayServiceLabelKey] = service.Name + route.ObjectMeta.Labels[utils.RayIDLabelKey] = utils.CheckLabel(utils.GenerateIdentifier(service.Name, rayv1.HeadNode)) return route, nil } diff --git a/ray-operator/controllers/ray/common/service.go b/ray-operator/controllers/ray/common/service.go index 4088e4ab38..50c4241554 100644 --- a/ray-operator/controllers/ray/common/service.go +++ b/ray-operator/controllers/ray/common/service.go @@ -14,11 +14,11 @@ import ( // HeadServiceLabels returns the default labels for a cluster's head service. 
func HeadServiceLabels(cluster rayv1.RayCluster) map[string]string { return map[string]string{ - RayClusterLabelKey: cluster.Name, - RayNodeTypeLabelKey: string(rayv1.HeadNode), - RayIDLabelKey: utils.CheckLabel(utils.GenerateIdentifier(cluster.Name, rayv1.HeadNode)), - KubernetesApplicationNameLabelKey: ApplicationName, - KubernetesCreatedByLabelKey: ComponentName, + utils.RayClusterLabelKey: cluster.Name, + utils.RayNodeTypeLabelKey: string(rayv1.HeadNode), + utils.RayIDLabelKey: utils.CheckLabel(utils.GenerateIdentifier(cluster.Name, rayv1.HeadNode)), + utils.KubernetesApplicationNameLabelKey: utils.ApplicationName, + utils.KubernetesCreatedByLabelKey: utils.ComponentName, } } @@ -58,7 +58,7 @@ func BuildServiceForHeadPod(cluster rayv1.RayCluster, labels map[string]string, default_namespace := cluster.Namespace default_type := cluster.Spec.HeadGroupSpec.ServiceType - defaultAppProtocol := DefaultServiceAppProtocol + defaultAppProtocol := utils.DefaultServiceAppProtocol // `ports_int` is a map of port names to port numbers, while `ports` is a list of ServicePort objects ports_int := getServicePorts(cluster) ports := []corev1.ServicePort{} @@ -143,9 +143,9 @@ func BuildHeadServiceForRayService(rayService rayv1.RayService, rayCluster rayv1 service.ObjectMeta.Name = headSvcName service.ObjectMeta.Namespace = rayService.Namespace service.ObjectMeta.Labels = map[string]string{ - RayServiceLabelKey: rayService.Name, - RayNodeTypeLabelKey: string(rayv1.HeadNode), - RayIDLabelKey: utils.CheckLabel(utils.GenerateIdentifier(rayService.Name, rayv1.HeadNode)), + utils.RayServiceLabelKey: rayService.Name, + utils.RayNodeTypeLabelKey: string(rayv1.HeadNode), + utils.RayIDLabelKey: utils.CheckLabel(utils.GenerateIdentifier(rayService.Name, rayv1.HeadNode)), } return service, nil @@ -171,12 +171,12 @@ func BuildServeService(rayService rayv1.RayService, rayCluster rayv1.RayCluster, } labels := map[string]string{ - RayServiceLabelKey: name, - RayClusterServingServiceLabelKey: utils.GenerateServeServiceLabel(name), + utils.RayServiceLabelKey: name, + utils.RayClusterServingServiceLabelKey: utils.GenerateServeServiceLabel(name), } selectorLabels := map[string]string{ - RayClusterLabelKey: rayCluster.Name, - RayClusterServingServiceLabelKey: EnableRayClusterServingServiceTrue, + utils.RayClusterLabelKey: rayCluster.Name, + utils.RayClusterServingServiceLabelKey: utils.EnableRayClusterServingServiceTrue, } default_name := utils.GenerateServeServiceName(name) @@ -190,7 +190,7 @@ func BuildServeService(rayService rayv1.RayService, rayCluster rayv1.RayCluster, ports_int := getServicePorts(rayCluster) ports := []corev1.ServicePort{} for name, port := range ports_int { - if name == ServingPortName { + if name == utils.ServingPortName { svcPort := corev1.ServicePort{Name: name, Port: port} ports = append(ports, svcPort) break @@ -224,7 +224,7 @@ func BuildServeService(rayService rayv1.RayService, rayCluster rayv1.RayCluster, } else { ports := []corev1.ServicePort{} for _, port := range serveService.Spec.Ports { - if port.Name == ServingPortName { + if port.Name == utils.ServingPortName { svcPort := corev1.ServicePort{Name: port.Name, Port: port.Port} ports = append(ports, svcPort) break @@ -324,17 +324,17 @@ func getServicePorts(cluster rayv1.RayCluster) map[string]int32 { } // Check if agent port is defined. If not, check if enable agent service. 
- if _, agentDefined := ports[DashboardAgentListenPortName]; !agentDefined { - enableServeServiceValue, exist := cluster.Annotations[EnableServeServiceKey] - if exist && enableServeServiceValue == EnableServeServiceTrue { + if _, agentDefined := ports[utils.DashboardAgentListenPortName]; !agentDefined { + enableServeServiceValue, exist := cluster.Annotations[utils.EnableServeServiceKey] + if exist && enableServeServiceValue == utils.EnableServeServiceTrue { // If agent port is not in the config, add default value for it. - ports[DashboardAgentListenPortName] = DefaultDashboardAgentListenPort + ports[utils.DashboardAgentListenPortName] = utils.DefaultDashboardAgentListenPort } } // check if metrics port is defined. If not, add default value for it. - if _, metricsDefined := ports[MetricsPortName]; !metricsDefined { - ports[MetricsPortName] = DefaultMetricsPort + if _, metricsDefined := ports[utils.MetricsPortName]; !metricsDefined { + ports[utils.MetricsPortName] = utils.DefaultMetricsPort } return ports @@ -346,7 +346,7 @@ func getServicePorts(cluster rayv1.RayCluster) map[string]int32 { func getPortsFromCluster(cluster rayv1.RayCluster) (map[string]int32, error) { svcPorts := map[string]int32{} - cPorts := cluster.Spec.HeadGroupSpec.Template.Spec.Containers[RayContainerIndex].Ports + cPorts := cluster.Spec.HeadGroupSpec.Template.Spec.Containers[utils.RayContainerIndex].Ports for _, port := range cPorts { if port.Name == "" { port.Name = fmt.Sprint(port.ContainerPort) + "-port" @@ -359,10 +359,10 @@ func getPortsFromCluster(cluster rayv1.RayCluster) (map[string]int32, error) { func getDefaultPorts() map[string]int32 { return map[string]int32{ - ClientPortName: DefaultClientPort, - RedisPortName: DefaultRedisPort, - DashboardPortName: DefaultDashboardPort, - MetricsPortName: DefaultMetricsPort, - ServingPortName: DefaultServingPort, + utils.ClientPortName: utils.DefaultClientPort, + utils.RedisPortName: utils.DefaultRedisPort, + utils.DashboardPortName: utils.DefaultDashboardPort, + utils.MetricsPortName: utils.DefaultMetricsPort, + utils.ServingPortName: utils.DefaultServingPort, } } diff --git a/ray-operator/controllers/ray/common/service_test.go b/ray-operator/controllers/ray/common/service_test.go index 49ca7ef4e5..44fdb531b8 100644 --- a/ray-operator/controllers/ray/common/service_test.go +++ b/ray-operator/controllers/ray/common/service_test.go @@ -7,6 +7,7 @@ import ( "testing" rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" + "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils" "github.com/stretchr/testify/assert" @@ -127,28 +128,28 @@ func TestBuildServiceForHeadPod(t *testing.T) { svc, err := BuildServiceForHeadPod(*instanceWithWrongSvc, nil, nil) assert.Nil(t, err) - actualResult := svc.Spec.Selector[RayClusterLabelKey] + actualResult := svc.Spec.Selector[utils.RayClusterLabelKey] expectedResult := string(instanceWithWrongSvc.Name) if !reflect.DeepEqual(expectedResult, actualResult) { t.Fatalf("Expected `%v` but got `%v`", expectedResult, actualResult) } - actualResult = svc.Spec.Selector[RayNodeTypeLabelKey] + actualResult = svc.Spec.Selector[utils.RayNodeTypeLabelKey] expectedResult = string(rayv1.HeadNode) if !reflect.DeepEqual(expectedResult, actualResult) { t.Fatalf("Expected `%v` but got `%v`", expectedResult, actualResult) } - actualResult = svc.Spec.Selector[KubernetesApplicationNameLabelKey] - expectedResult = ApplicationName + actualResult = svc.Spec.Selector[utils.KubernetesApplicationNameLabelKey] + expectedResult = utils.ApplicationName 
if !reflect.DeepEqual(expectedResult, actualResult) { t.Fatalf("Expected `%v` but got `%v`", expectedResult, actualResult) } ports := svc.Spec.Ports - expectedResult = DefaultServiceAppProtocol + expectedResult = utils.DefaultServiceAppProtocol for _, port := range ports { - if *port.AppProtocol != DefaultServiceAppProtocol { + if *port.AppProtocol != utils.DefaultServiceAppProtocol { t.Fatalf("Expected `%v` but got `%v`", expectedResult, *port.AppProtocol) } } @@ -156,12 +157,12 @@ func TestBuildServiceForHeadPod(t *testing.T) { func TestBuildServiceForHeadPodWithAppNameLabel(t *testing.T) { labels := make(map[string]string) - labels[KubernetesApplicationNameLabelKey] = "testname" + labels[utils.KubernetesApplicationNameLabelKey] = "testname" svc, err := BuildServiceForHeadPod(*instanceWithWrongSvc, labels, nil) assert.Nil(t, err) - actualResult := svc.Spec.Selector[KubernetesApplicationNameLabelKey] + actualResult := svc.Spec.Selector[utils.KubernetesApplicationNameLabelKey] expectedResult := "testname" if !reflect.DeepEqual(expectedResult, actualResult) { t.Fatalf("Expected `%v` but got `%v`", expectedResult, actualResult) @@ -204,7 +205,7 @@ func TestGetPortsFromCluster(t *testing.T) { svcNames[port] = name } - cPorts := instanceWithWrongSvc.Spec.HeadGroupSpec.Template.Spec.Containers[RayContainerIndex].Ports + cPorts := instanceWithWrongSvc.Spec.HeadGroupSpec.Template.Spec.Containers[utils.RayContainerIndex].Ports for _, cPort := range cPorts { expectedResult := cPort.Name @@ -222,8 +223,8 @@ func TestGetServicePortsWithMetricsPort(t *testing.T) { cluster.Spec.HeadGroupSpec.Template.Spec.Containers[0].Ports = []corev1.ContainerPort{} ports := getServicePorts(*cluster) // Verify that getServicePorts sets the default metrics port when the user doesn't specify any ports. - if ports[MetricsPortName] != int32(DefaultMetricsPort) { - t.Fatalf("Expected `%v` but got `%v`", int32(DefaultMetricsPort), ports[MetricsPortName]) + if ports[utils.MetricsPortName] != int32(utils.DefaultMetricsPort) { + t.Fatalf("Expected `%v` but got `%v`", int32(utils.DefaultMetricsPort), ports[utils.MetricsPortName]) } // Test case 2: Only a random port is specified by the user. @@ -235,21 +236,21 @@ func TestGetServicePortsWithMetricsPort(t *testing.T) { } ports = getServicePorts(*cluster) // Verify that getServicePorts sets the default metrics port when the user doesn't specify the metrics port but specifies other ports. - if ports[MetricsPortName] != int32(DefaultMetricsPort) { - t.Fatalf("Expected `%v` but got `%v`", int32(DefaultMetricsPort), ports[MetricsPortName]) + if ports[utils.MetricsPortName] != int32(utils.DefaultMetricsPort) { + t.Fatalf("Expected `%v` but got `%v`", int32(utils.DefaultMetricsPort), ports[utils.MetricsPortName]) } // Test case 3: A custom metrics port is specified by the user. - customMetricsPort := int32(DefaultMetricsPort) + 1 + customMetricsPort := int32(utils.DefaultMetricsPort) + 1 metricsPort := corev1.ContainerPort{ - Name: MetricsPortName, + Name: utils.MetricsPortName, ContainerPort: customMetricsPort, } cluster.Spec.HeadGroupSpec.Template.Spec.Containers[0].Ports = append(cluster.Spec.HeadGroupSpec.Template.Spec.Containers[0].Ports, metricsPort) ports = getServicePorts(*cluster) // Verify that getServicePorts uses the user's custom metrics port when the user specifies the metrics port. 
- if ports[MetricsPortName] != customMetricsPort { - t.Fatalf("Expected `%v` but got `%v`", customMetricsPort, ports[MetricsPortName]) + if ports[utils.MetricsPortName] != customMetricsPort { + t.Fatalf("Expected `%v` but got `%v`", customMetricsPort, ports[utils.MetricsPortName]) } } @@ -260,12 +261,12 @@ func TestUserSpecifiedHeadService(t *testing.T) { // Set user-specified head service with user-specified labels, annotations, and ports. userName := "user-custom-name" userNamespace := "user-custom-namespace" - userLabels := map[string]string{"userLabelKey": "userLabelValue", RayClusterLabelKey: "userClusterName"} // Override default cluster name + userLabels := map[string]string{"userLabelKey": "userLabelValue", utils.RayClusterLabelKey: "userClusterName"} // Override default cluster name userAnnotations := map[string]string{"userAnnotationKey": "userAnnotationValue", headServiceAnnotationKey1: "user_override"} userPort := corev1.ServicePort{Name: "userPort", Port: 12345} - userPortOverride := corev1.ServicePort{Name: ClientPortName, Port: 98765} // Override default client port (10001) + userPortOverride := corev1.ServicePort{Name: utils.ClientPortName, Port: 98765} // Override default client port (10001) userPorts := []corev1.ServicePort{userPort, userPortOverride} - userSelector := map[string]string{"userSelectorKey": "userSelectorValue", RayClusterLabelKey: "userSelectorClusterName"} + userSelector := map[string]string{"userSelectorKey": "userSelectorValue", utils.RayClusterLabelKey: "userSelectorClusterName"} // Specify a "LoadBalancer" type, which differs from the default "ClusterIP" type. userType := corev1.ServiceTypeLoadBalancer testRayClusterWithHeadService.Spec.HeadGroupSpec.HeadService = &corev1.Service{ @@ -283,7 +284,7 @@ func TestUserSpecifiedHeadService(t *testing.T) { } // These labels originate from HeadGroupSpec.Template.ObjectMeta.Labels userTemplateClusterName := "userTemplateClusterName" - template_labels := map[string]string{RayClusterLabelKey: userTemplateClusterName} + template_labels := map[string]string{utils.RayClusterLabelKey: userTemplateClusterName} headService, err := BuildServiceForHeadPod(*testRayClusterWithHeadService, template_labels, testRayClusterWithHeadService.Spec.HeadServiceAnnotations) if err != nil { t.Errorf("failed to build head service: %v", err) @@ -293,8 +294,8 @@ func TestUserSpecifiedHeadService(t *testing.T) { // The user-provided HeadService labels should be ignored for the purposes of the selector field. The user-provided Selector field should be ignored. default_labels := HeadServiceLabels(*testRayClusterWithHeadService) // Make sure this test isn't spuriously passing. Check that RayClusterLabelKey is in the default labels. - if _, ok := default_labels[RayClusterLabelKey]; !ok { - t.Errorf("RayClusterLabelKey=%s should be in the default labels", RayClusterLabelKey) + if _, ok := default_labels[utils.RayClusterLabelKey]; !ok { + t.Errorf("utils.RayClusterLabelKey=%s should be in the default labels", utils.RayClusterLabelKey) } for k, v := range headService.Spec.Selector { // If k is not in the default labels, then the selector field should not contain it. 
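
For reference, a minimal sketch (not part of the patch) of how the default labels asserted above are consumed as a selector; the helper name is hypothetical, but the calls mirror the reconciler changes later in this diff:

    package sketch

    import (
        "context"

        rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
        "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils"
        corev1 "k8s.io/api/core/v1"
        "sigs.k8s.io/controller-runtime/pkg/client"
    )

    // listHeadServices is a hypothetical helper: it lists a cluster's head Service using
    // the same default labels the test above checks, now read from the utils package.
    func listHeadServices(ctx context.Context, c client.Client, cluster *rayv1.RayCluster) (*corev1.ServiceList, error) {
        services := corev1.ServiceList{}
        filterLabels := client.MatchingLabels{
            utils.RayClusterLabelKey:  cluster.Name,
            utils.RayNodeTypeLabelKey: string(rayv1.HeadNode),
        }
        if err := c.List(ctx, &services, client.InNamespace(cluster.Namespace), filterLabels); err != nil {
            return nil, err
        }
        return &services, nil
    }

Because the selector is always derived from the default labels rather than user-supplied labels or selectors, lookups like this stay correct even when a user overrides the head Service metadata, which is exactly what the test above verifies.
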
@@ -428,13 +429,13 @@ func TestBuildServeServiceForRayService(t *testing.T) { svc, err := BuildServeServiceForRayService(*serviceInstance, *instanceWithWrongSvc) assert.Nil(t, err) - actualResult := svc.Spec.Selector[RayClusterLabelKey] + actualResult := svc.Spec.Selector[utils.RayClusterLabelKey] expectedResult := string(instanceWithWrongSvc.Name) if !reflect.DeepEqual(expectedResult, actualResult) { t.Fatalf("Expected `%v` but got `%v`", expectedResult, actualResult) } - actualLabel := svc.Labels[RayServiceLabelKey] + actualLabel := svc.Labels[utils.RayServiceLabelKey] expectedLabel := string(serviceInstance.Name) if !reflect.DeepEqual(expectedLabel, actualLabel) { t.Fatalf("Expected `%v` but got `%v`", expectedLabel, actualLabel) @@ -454,13 +455,13 @@ func TestBuildServeServiceForRayCluster(t *testing.T) { svc, err := BuildServeServiceForRayCluster(*instanceForServeSvc) assert.Nil(t, err) - actualResult := svc.Spec.Selector[RayClusterLabelKey] + actualResult := svc.Spec.Selector[utils.RayClusterLabelKey] expectedResult := string(instanceForServeSvc.Name) if !reflect.DeepEqual(expectedResult, actualResult) { t.Fatalf("Expected `%v` but got `%v`", expectedResult, actualResult) } - actualLabel := svc.Labels[RayServiceLabelKey] + actualLabel := svc.Labels[utils.RayServiceLabelKey] expectedLabel := string(instanceForServeSvc.Name) if !reflect.DeepEqual(expectedLabel, actualLabel) { t.Fatalf("Expected `%v` but got `%v`", expectedLabel, actualLabel) @@ -511,12 +512,12 @@ func TestUserSpecifiedServeService(t *testing.T) { userName := "user-custom-name" userNamespace := "user-custom-namespace" - userLabels := map[string]string{"userLabelKey": "userLabelValue", RayClusterLabelKey: "userClusterName"} // Override default cluster name + userLabels := map[string]string{"userLabelKey": "userLabelValue", utils.RayClusterLabelKey: "userClusterName"} // Override default cluster name userAnnotations := map[string]string{"userAnnotationKey": "userAnnotationValue", "userAnnotationKey2": "userAnnotationValue2"} userPort := corev1.ServicePort{Name: "serve", Port: 12345} - userPortOverride := corev1.ServicePort{Name: ClientPortName, Port: 98765} // Override default client port (10001) + userPortOverride := corev1.ServicePort{Name: utils.ClientPortName, Port: 98765} // Override default client port (10001) userPorts := []corev1.ServicePort{userPort, userPortOverride} - userSelector := map[string]string{"userSelectorKey": "userSelectorValue", RayClusterLabelKey: "userSelectorClusterName"} + userSelector := map[string]string{"userSelectorKey": "userSelectorValue", utils.RayClusterLabelKey: "userSelectorClusterName"} // Specify a "LoadBalancer" type, which differs from the default "ClusterIP" type. 
userType := corev1.ServiceTypeLoadBalancer @@ -548,21 +549,21 @@ func TestUserSpecifiedServeService(t *testing.T) { // Check that selectors only have default selectors if len(svc.Spec.Selector) != 2 { - t.Errorf("Selectors should have just 2 keys %s and %s", RayClusterLabelKey, RayClusterServingServiceLabelKey) + t.Errorf("Selectors should have just 2 keys %s and %s", utils.RayClusterLabelKey, utils.RayClusterServingServiceLabelKey) } - if svc.Spec.Selector[RayClusterLabelKey] != instanceWithWrongSvc.Name { - t.Errorf("Serve Service selector key %s value didn't match expected value : expected value=%s, actual value=%s", RayClusterLabelKey, instanceWithWrongSvc.Name, svc.Spec.Selector[RayClusterLabelKey]) + if svc.Spec.Selector[utils.RayClusterLabelKey] != instanceWithWrongSvc.Name { + t.Errorf("Serve Service selector key %s value didn't match expected value : expected value=%s, actual value=%s", utils.RayClusterLabelKey, instanceWithWrongSvc.Name, svc.Spec.Selector[utils.RayClusterLabelKey]) } - if svc.Spec.Selector[RayClusterServingServiceLabelKey] != EnableRayClusterServingServiceTrue { - t.Errorf("Serve Service selector key %s value didn't match expected value : expected value=%s, actual value=%s", RayClusterServingServiceLabelKey, EnableRayClusterServingServiceTrue, svc.Spec.Selector[RayClusterServingServiceLabelKey]) + if svc.Spec.Selector[utils.RayClusterServingServiceLabelKey] != utils.EnableRayClusterServingServiceTrue { + t.Errorf("Serve Service selector key %s value didn't match expected value : expected value=%s, actual value=%s", utils.RayClusterServingServiceLabelKey, utils.EnableRayClusterServingServiceTrue, svc.Spec.Selector[utils.RayClusterServingServiceLabelKey]) } // ports should only have DefaultServePort ports := svc.Spec.Ports - expectedPortName := ServingPortName + expectedPortName := utils.ServingPortName expectedPortNumber := int32(8000) for _, port := range ports { - if port.Name != ServingPortName { + if port.Name != utils.ServingPortName { t.Fatalf("Expected `%v` but got `%v`", expectedPortName, port.Name) } if port.Port != expectedPortNumber { diff --git a/ray-operator/controllers/ray/raycluster_controller.go b/ray-operator/controllers/ray/raycluster_controller.go index 6e3ce5e695..6396cdd5a5 100644 --- a/ray-operator/controllers/ray/raycluster_controller.go +++ b/ray-operator/controllers/ray/raycluster_controller.go @@ -181,17 +181,17 @@ func (r *RayClusterReconciler) rayClusterReconcile(ctx context.Context, request // the Redis cleanup job should be activated. Users can disable the feature by setting the environment // variable `ENABLE_GCS_FT_REDIS_CLEANUP` to `false`, and undertake the Redis storage namespace cleanup // manually after the RayCluster CR deletion. - enableGCSFTRedisCleanup := strings.ToLower(os.Getenv(common.ENABLE_GCS_FT_REDIS_CLEANUP)) != "false" + enableGCSFTRedisCleanup := strings.ToLower(os.Getenv(utils.ENABLE_GCS_FT_REDIS_CLEANUP)) != "false" if enableGCSFTRedisCleanup && common.IsGCSFaultToleranceEnabled(*instance) { if instance.DeletionTimestamp.IsZero() { - if !controllerutil.ContainsFinalizer(instance, common.GCSFaultToleranceRedisCleanupFinalizer) { + if !controllerutil.ContainsFinalizer(instance, utils.GCSFaultToleranceRedisCleanupFinalizer) { r.Log.Info( "GCS fault tolerance has been enabled. 
Implementing a finalizer to ensure that Redis is properly cleaned up once the RayCluster custom resource (CR) is deleted.", - "finalizer", common.GCSFaultToleranceRedisCleanupFinalizer) - controllerutil.AddFinalizer(instance, common.GCSFaultToleranceRedisCleanupFinalizer) + "finalizer", utils.GCSFaultToleranceRedisCleanupFinalizer) + controllerutil.AddFinalizer(instance, utils.GCSFaultToleranceRedisCleanupFinalizer) if err := r.Update(ctx, instance); err != nil { - r.Log.Error(err, fmt.Sprintf("Failed to add the finalizer %s to the RayCluster.", common.GCSFaultToleranceRedisCleanupFinalizer)) + r.Log.Error(err, fmt.Sprintf("Failed to add the finalizer %s to the RayCluster.", utils.GCSFaultToleranceRedisCleanupFinalizer)) return ctrl.Result{RequeueAfter: DefaultRequeueDuration}, err } // Only start the RayCluster reconciliation after the finalizer is added. @@ -200,12 +200,12 @@ func (r *RayClusterReconciler) rayClusterReconcile(ctx context.Context, request } else { r.Log.Info( fmt.Sprintf("The RayCluster with GCS enabled, %s, is being deleted. Start to handle the Redis cleanup finalizer %s.", - instance.Name, common.GCSFaultToleranceRedisCleanupFinalizer), + instance.Name, utils.GCSFaultToleranceRedisCleanupFinalizer), "DeletionTimestamp", instance.ObjectMeta.DeletionTimestamp) // Delete the head Pod if it exists. headPods := corev1.PodList{} - filterLabels := client.MatchingLabels{common.RayClusterLabelKey: instance.Name, common.RayNodeTypeLabelKey: string(rayv1.HeadNode)} + filterLabels := client.MatchingLabels{utils.RayClusterLabelKey: instance.Name, utils.RayNodeTypeLabelKey: string(rayv1.HeadNode)} if err := r.List(ctx, &headPods, client.InNamespace(instance.Namespace), filterLabels); err != nil { return ctrl.Result{RequeueAfter: DefaultRequeueDuration}, err } @@ -218,7 +218,7 @@ func (r *RayClusterReconciler) rayClusterReconcile(ctx context.Context, request r.Log.Info(fmt.Sprintf( "Delete the head Pod %s before the Redis cleanup. "+ "The storage namespace %s in Redis cannot be fully deleted if the GCS process on the head Pod is still writing to it.", - headPod.Name, headPod.Annotations[common.RayExternalStorageNSAnnotationKey])) + headPod.Name, headPod.Annotations[utils.RayExternalStorageNSAnnotationKey])) if err := r.Delete(ctx, &headPod); err != nil { return ctrl.Result{RequeueAfter: DefaultRequeueDuration}, err } @@ -227,7 +227,7 @@ func (r *RayClusterReconciler) rayClusterReconcile(ctx context.Context, request // Delete all worker Pods if they exist. for _, workerGroup := range instance.Spec.WorkerGroupSpecs { workerPods := corev1.PodList{} - filterLabels = client.MatchingLabels{common.RayClusterLabelKey: instance.Name, common.RayNodeGroupLabelKey: workerGroup.GroupName} + filterLabels = client.MatchingLabels{utils.RayClusterLabelKey: instance.Name, utils.RayNodeGroupLabelKey: workerGroup.GroupName} if err := r.List(ctx, &workerPods, client.InNamespace(instance.Namespace), filterLabels); err != nil { return ctrl.Result{RequeueAfter: DefaultRequeueDuration}, err } @@ -250,13 +250,13 @@ func (r *RayClusterReconciler) rayClusterReconcile(ctx context.Context, request r.Log.Info(fmt.Sprintf( "Wait for the head Pod %s to be terminated before initiating the Redis cleanup process. 
"+ "The storage namespace %s in Redis cannot be fully deleted if the GCS process on the head Pod is still writing to it.", - headPods.Items[0].Name, headPods.Items[0].Annotations[common.RayExternalStorageNSAnnotationKey])) + headPods.Items[0].Name, headPods.Items[0].Annotations[utils.RayExternalStorageNSAnnotationKey])) // Requeue after 10 seconds because it takes much longer than DefaultRequeueDuration (2 seconds) for the head Pod to be terminated. return ctrl.Result{RequeueAfter: 10 * time.Second}, nil } // We can start the Redis cleanup process now because the head Pod has been terminated. - filterLabels = client.MatchingLabels{common.RayClusterLabelKey: instance.Name, common.RayNodeTypeLabelKey: string(rayv1.RedisCleanupNode)} + filterLabels = client.MatchingLabels{utils.RayClusterLabelKey: instance.Name, utils.RayNodeTypeLabelKey: string(rayv1.RedisCleanupNode)} redisCleanupJobs := batchv1.JobList{} if err := r.List(ctx, &redisCleanupJobs, client.InNamespace(instance.Namespace), filterLabels); err != nil { return ctrl.Result{RequeueAfter: DefaultRequeueDuration}, err @@ -271,9 +271,9 @@ func (r *RayClusterReconciler) rayClusterReconcile(ctx context.Context, request r.Log.Info(fmt.Sprintf( "The Redis cleanup Job %s has been completed. "+ "The storage namespace %s in Redis has been fully deleted.", - redisCleanupJob.Name, redisCleanupJob.Annotations[common.RayExternalStorageNSAnnotationKey])) + redisCleanupJob.Name, redisCleanupJob.Annotations[utils.RayExternalStorageNSAnnotationKey])) // Remove the finalizer from the RayCluster CR. - controllerutil.RemoveFinalizer(instance, common.GCSFaultToleranceRedisCleanupFinalizer) + controllerutil.RemoveFinalizer(instance, utils.GCSFaultToleranceRedisCleanupFinalizer) if err := r.Update(ctx, instance); err != nil { r.Log.Error(err, "Failed to remove finalizer for RayCluster") return ctrl.Result{RequeueAfter: DefaultRequeueDuration}, err @@ -288,7 +288,7 @@ func (r *RayClusterReconciler) rayClusterReconcile(ctx context.Context, request "The Redis cleanup Job %s has failed, requeue the RayCluster CR after 5 minute. "+ "You should manually delete the storage namespace %s in Redis and remove the RayCluster's finalizer. "+ "Please check https://docs.ray.io/en/master/cluster/kubernetes/user-guides/kuberay-gcs-ft.html for more details.", - redisCleanupJob.Name, redisCleanupJob.Annotations[common.RayExternalStorageNSAnnotationKey])) + redisCleanupJob.Name, redisCleanupJob.Annotations[utils.RayExternalStorageNSAnnotationKey])) return ctrl.Result{RequeueAfter: 5 * time.Minute}, nil } } else { @@ -344,7 +344,7 @@ func (r *RayClusterReconciler) rayClusterReconcile(ctx context.Context, request return ctrl.Result{RequeueAfter: DefaultRequeueDuration}, err } // Only reconcile the K8s service for Ray Serve when the "ray.io/enable-serve-service" annotation is set to true. 
- if enableServeServiceValue, exist := instance.Annotations[common.EnableServeServiceKey]; exist && enableServeServiceValue == common.EnableServeServiceTrue { + if enableServeServiceValue, exist := instance.Annotations[utils.EnableServeServiceKey]; exist && enableServeServiceValue == utils.EnableServeServiceTrue { if err := r.reconcileServeService(ctx, instance); err != nil { if updateErr := r.updateClusterState(ctx, instance, rayv1.Failed); updateErr != nil { r.Log.Error(updateErr, "RayCluster update state error", "cluster name", request.Name) @@ -382,10 +382,10 @@ func (r *RayClusterReconciler) rayClusterReconcile(ctx context.Context, request // Unconditionally requeue after the number of seconds specified in the // environment variable RAYCLUSTER_DEFAULT_REQUEUE_SECONDS_ENV. If the // environment variable is not set, requeue after the default value. - requeueAfterSeconds, err := strconv.Atoi(os.Getenv(common.RAYCLUSTER_DEFAULT_REQUEUE_SECONDS_ENV)) + requeueAfterSeconds, err := strconv.Atoi(os.Getenv(utils.RAYCLUSTER_DEFAULT_REQUEUE_SECONDS_ENV)) if err != nil { - r.Log.Info(fmt.Sprintf("Environment variable %s is not set, using default value of %d seconds", common.RAYCLUSTER_DEFAULT_REQUEUE_SECONDS_ENV, common.RAYCLUSTER_DEFAULT_REQUEUE_SECONDS), "cluster name", request.Name) - requeueAfterSeconds = common.RAYCLUSTER_DEFAULT_REQUEUE_SECONDS + r.Log.Info(fmt.Sprintf("Environment variable %s is not set, using default value of %d seconds", utils.RAYCLUSTER_DEFAULT_REQUEUE_SECONDS_ENV, utils.RAYCLUSTER_DEFAULT_REQUEUE_SECONDS), "cluster name", request.Name) + requeueAfterSeconds = utils.RAYCLUSTER_DEFAULT_REQUEUE_SECONDS } r.Log.Info("Unconditional requeue after", "cluster name", request.Name, "seconds", requeueAfterSeconds) return ctrl.Result{RequeueAfter: time.Duration(requeueAfterSeconds) * time.Second}, nil @@ -439,7 +439,7 @@ func (r *RayClusterReconciler) reconcileIngress(ctx context.Context, instance *r func (r *RayClusterReconciler) reconcileRouteOpenShift(ctx context.Context, instance *rayv1.RayCluster) error { headRoutes := routev1.RouteList{} - filterLabels := client.MatchingLabels{common.RayClusterLabelKey: instance.Name} + filterLabels := client.MatchingLabels{utils.RayClusterLabelKey: instance.Name} if err := r.List(ctx, &headRoutes, client.InNamespace(instance.Namespace), filterLabels); err != nil { r.Log.Error(err, "Route Listing error!", "Route.Error", err) return err @@ -473,7 +473,7 @@ func (r *RayClusterReconciler) reconcileRouteOpenShift(ctx context.Context, inst func (r *RayClusterReconciler) reconcileIngressKubernetes(ctx context.Context, instance *rayv1.RayCluster) error { headIngresses := networkingv1.IngressList{} - filterLabels := client.MatchingLabels{common.RayClusterLabelKey: instance.Name} + filterLabels := client.MatchingLabels{utils.RayClusterLabelKey: instance.Name} if err := r.List(ctx, &headIngresses, client.InNamespace(instance.Namespace), filterLabels); err != nil { return err } @@ -505,7 +505,7 @@ func (r *RayClusterReconciler) reconcileIngressKubernetes(ctx context.Context, i // Return nil only when the head service successfully created or already exists. 
func (r *RayClusterReconciler) reconcileHeadService(ctx context.Context, instance *rayv1.RayCluster) error { services := corev1.ServiceList{} - filterLabels := client.MatchingLabels{common.RayClusterLabelKey: instance.Name, common.RayNodeTypeLabelKey: string(rayv1.HeadNode)} + filterLabels := client.MatchingLabels{utils.RayClusterLabelKey: instance.Name, utils.RayNodeTypeLabelKey: string(rayv1.HeadNode)} if err := r.List(ctx, &services, client.InNamespace(instance.Namespace), filterLabels); err != nil { return err @@ -525,8 +525,8 @@ func (r *RayClusterReconciler) reconcileHeadService(ctx context.Context, instanc } else { // Create head service if there's no existing one in the cluster. labels := make(map[string]string) - if val, ok := instance.Spec.HeadGroupSpec.Template.ObjectMeta.Labels[common.KubernetesApplicationNameLabelKey]; ok { - labels[common.KubernetesApplicationNameLabelKey] = val + if val, ok := instance.Spec.HeadGroupSpec.Template.ObjectMeta.Labels[utils.KubernetesApplicationNameLabelKey]; ok { + labels[utils.KubernetesApplicationNameLabelKey] = val } annotations := make(map[string]string) // TODO (kevin85421): KubeRay has already exposed the entire head service (#1040) to users. @@ -584,7 +584,7 @@ func (r *RayClusterReconciler) reconcileServeService(ctx context.Context, instan func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv1.RayCluster) error { // check if all the pods exist headPods := corev1.PodList{} - filterLabels := client.MatchingLabels{common.RayClusterLabelKey: instance.Name, common.RayNodeTypeLabelKey: string(rayv1.HeadNode)} + filterLabels := client.MatchingLabels{utils.RayClusterLabelKey: instance.Name, utils.RayNodeTypeLabelKey: string(rayv1.HeadNode)} if err := r.List(ctx, &headPods, client.InNamespace(instance.Namespace), filterLabels); err != nil { return err } @@ -661,7 +661,7 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv // check if WorkerGroupSpecs has been changed and we need to kill worker pods for _, worker := range instance.Spec.WorkerGroupSpecs { workerPods := corev1.PodList{} - filterLabels = client.MatchingLabels{common.RayClusterLabelKey: instance.Name, common.RayNodeGroupLabelKey: worker.GroupName} + filterLabels = client.MatchingLabels{utils.RayClusterLabelKey: instance.Name, utils.RayNodeGroupLabelKey: worker.GroupName} if err := r.List(ctx, &workerPods, client.InNamespace(instance.Namespace), filterLabels); err != nil { return err } @@ -689,7 +689,7 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv r.Log.Info("reconcilePods", "desired workerReplicas (always adhering to minReplicas/maxReplica)", workerReplicas, "worker group", worker.GroupName, "maxReplicas", worker.MaxReplicas, "minReplicas", worker.MinReplicas, "replicas", worker.Replicas) workerPods := corev1.PodList{} - filterLabels = client.MatchingLabels{common.RayClusterLabelKey: instance.Name, common.RayNodeGroupLabelKey: worker.GroupName} + filterLabels = client.MatchingLabels{utils.RayClusterLabelKey: instance.Name, utils.RayNodeGroupLabelKey: worker.GroupName} if err := r.List(ctx, &workerPods, client.InNamespace(instance.Namespace), filterLabels); err != nil { return err } @@ -774,7 +774,7 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv // default behavior is stable enough, we can remove this feature flag. 
enableRandomPodDelete := false if enableInTreeAutoscaling { - if s := os.Getenv(common.ENABLE_RANDOM_POD_DELETE); strings.ToLower(s) == "true" { + if s := os.Getenv(utils.ENABLE_RANDOM_POD_DELETE); strings.ToLower(s) == "true" { enableRandomPodDelete = true } } @@ -879,7 +879,7 @@ func shouldDeletePod(pod corev1.Pod, nodeType rayv1.RayNodeType) (bool, string) // (1) https://discuss.kubernetes.io/t/pod-spec-containers-and-pod-status-containerstatuses-can-have-a-different-order-why/25273 // (2) https://github.com/kubernetes/kubernetes/blob/03762cbcb52b2a4394e4d795f9d3517a78a5e1a2/pkg/api/v1/pod/util.go#L261-L268 func getRayContainerStateTerminated(pod corev1.Pod) *corev1.ContainerStateTerminated { - rayContainerName := pod.Spec.Containers[common.RayContainerIndex].Name + rayContainerName := pod.Spec.Containers[utils.RayContainerIndex].Name for _, containerStatus := range pod.Status.ContainerStatuses { if containerStatus.Name == rayContainerName { return containerStatus.State.Terminated @@ -1023,14 +1023,14 @@ func (r *RayClusterReconciler) createWorkerPod(ctx context.Context, instance ray // Build head instance pod(s). func (r *RayClusterReconciler) buildHeadPod(instance rayv1.RayCluster) corev1.Pod { - podName := strings.ToLower(instance.Name + common.DashSymbol + string(rayv1.HeadNode) + common.DashSymbol) + podName := strings.ToLower(instance.Name + utils.DashSymbol + string(rayv1.HeadNode) + utils.DashSymbol) podName = utils.CheckName(podName) // making sure the name is valid fqdnRayIP := utils.GenerateFQDNServiceName(instance, instance.Namespace) // Fully Qualified Domain Name // The Ray head port used by workers to connect to the cluster (GCS server port for Ray >= 1.11.0, Redis port for older Ray.) headPort := common.GetHeadPort(instance.Spec.HeadGroupSpec.RayStartParams) // Check whether serve is enabled and add serve label serveLabel := false - if enableServeServiceValue, exist := instance.Annotations[common.EnableServeServiceKey]; exist && enableServeServiceValue == common.EnableServeServiceTrue { + if enableServeServiceValue, exist := instance.Annotations[utils.EnableServeServiceKey]; exist && enableServeServiceValue == utils.EnableServeServiceTrue { serveLabel = true } autoscalingEnabled := instance.Spec.EnableInTreeAutoscaling @@ -1050,7 +1050,7 @@ func getCreator(instance rayv1.RayCluster) string { if instance.Labels == nil { return "" } - creatorName, exist := instance.Labels[common.KubernetesCreatedByLabelKey] + creatorName, exist := instance.Labels[utils.KubernetesCreatedByLabelKey] if !exist { return "" @@ -1061,7 +1061,7 @@ func getCreator(instance rayv1.RayCluster) string { // Build worker instance pods. 
func (r *RayClusterReconciler) buildWorkerPod(instance rayv1.RayCluster, worker rayv1.WorkerGroupSpec) corev1.Pod { - podName := strings.ToLower(instance.Name + common.DashSymbol + string(rayv1.WorkerNode) + common.DashSymbol + worker.GroupName + common.DashSymbol) + podName := strings.ToLower(instance.Name + utils.DashSymbol + string(rayv1.WorkerNode) + utils.DashSymbol + worker.GroupName + utils.DashSymbol) podName = utils.CheckName(podName) // making sure the name is valid fqdnRayIP := utils.GenerateFQDNServiceName(instance, instance.Namespace) // Fully Qualified Domain Name @@ -1072,7 +1072,7 @@ func (r *RayClusterReconciler) buildWorkerPod(instance rayv1.RayCluster, worker creatorName := getCreator(instance) // Check whether serve is enabled and add serve label serveLabel := false - if enableServeServiceValue, exist := instance.Annotations[common.EnableServeServiceKey]; exist && enableServeServiceValue == common.EnableServeServiceTrue { + if enableServeServiceValue, exist := instance.Annotations[utils.EnableServeServiceKey]; exist && enableServeServiceValue == utils.EnableServeServiceTrue { serveLabel = true } pod := common.BuildPod(podTemplateSpec, rayv1.WorkerNode, worker.RayStartParams, headPort, autoscalingEnabled, creatorName, fqdnRayIP, serveLabel) @@ -1086,12 +1086,12 @@ func (r *RayClusterReconciler) buildWorkerPod(instance rayv1.RayCluster, worker func (r *RayClusterReconciler) buildRedisCleanupJob(instance rayv1.RayCluster) batchv1.Job { pod := r.buildHeadPod(instance) - pod.Labels[common.RayNodeTypeLabelKey] = string(rayv1.RedisCleanupNode) + pod.Labels[utils.RayNodeTypeLabelKey] = string(rayv1.RedisCleanupNode) // Only keep the Ray container in the Redis cleanup Job. - pod.Spec.Containers = []corev1.Container{pod.Spec.Containers[common.RayContainerIndex]} - pod.Spec.Containers[common.RayContainerIndex].Command = []string{"/bin/bash", "-lc", "--"} - pod.Spec.Containers[common.RayContainerIndex].Args = []string{ + pod.Spec.Containers = []corev1.Container{pod.Spec.Containers[utils.RayContainerIndex]} + pod.Spec.Containers[utils.RayContainerIndex].Command = []string{"/bin/bash", "-lc", "--"} + pod.Spec.Containers[utils.RayContainerIndex].Args = []string{ "echo \"To get more information about manually delete the storage namespace in Redis and remove the RayCluster's finalizer, please check https://docs.ray.io/en/master/cluster/kubernetes/user-guides/kuberay-gcs-ft.html for more details.\" && " + "python -c " + "\"from ray._private.gcs_utils import cleanup_redis_storage; " + @@ -1105,22 +1105,22 @@ func (r *RayClusterReconciler) buildRedisCleanupJob(instance rayv1.RayCluster) b } // Disable liveness and readiness probes because the Job will not launch processes like Raylet and GCS. - pod.Spec.Containers[common.RayContainerIndex].LivenessProbe = nil - pod.Spec.Containers[common.RayContainerIndex].ReadinessProbe = nil + pod.Spec.Containers[utils.RayContainerIndex].LivenessProbe = nil + pod.Spec.Containers[utils.RayContainerIndex].ReadinessProbe = nil // Set the environment variables to ensure that the cleanup Job has at least 60s. 
- pod.Spec.Containers[common.RayContainerIndex].Env = append(pod.Spec.Containers[common.RayContainerIndex].Env, corev1.EnvVar{ + pod.Spec.Containers[utils.RayContainerIndex].Env = append(pod.Spec.Containers[utils.RayContainerIndex].Env, corev1.EnvVar{ Name: "RAY_redis_db_connect_retries", Value: "120", }) - pod.Spec.Containers[common.RayContainerIndex].Env = append(pod.Spec.Containers[common.RayContainerIndex].Env, corev1.EnvVar{ + pod.Spec.Containers[utils.RayContainerIndex].Env = append(pod.Spec.Containers[utils.RayContainerIndex].Env, corev1.EnvVar{ Name: "RAY_redis_db_connect_wait_milliseconds", Value: "500", }) // The container's resource consumption remains constant. so hard-coding the resources is acceptable. // In addition, avoid using the GPU for the Redis cleanup Job. - pod.Spec.Containers[common.RayContainerIndex].Resources = corev1.ResourceRequirements{ + pod.Spec.Containers[utils.RayContainerIndex].Resources = corev1.ResourceRequirements{ Limits: corev1.ResourceList{ corev1.ResourceCPU: resource.MustParse("200m"), corev1.ResourceMemory: resource.MustParse("256Mi"), @@ -1186,7 +1186,7 @@ func (r *RayClusterReconciler) calculateStatus(ctx context.Context, instance *ra newInstance.Status.ObservedGeneration = newInstance.ObjectMeta.Generation runtimePods := corev1.PodList{} - filterLabels := client.MatchingLabels{common.RayClusterLabelKey: newInstance.Name} + filterLabels := client.MatchingLabels{utils.RayClusterLabelKey: newInstance.Name} if err := r.List(ctx, &runtimePods, client.InNamespace(newInstance.Namespace), filterLabels); err != nil { return nil, err } @@ -1227,7 +1227,7 @@ func (r *RayClusterReconciler) calculateStatus(ctx context.Context, instance *ra // Best effort to obtain the ip of the head node. func (r *RayClusterReconciler) getHeadPodIP(ctx context.Context, instance *rayv1.RayCluster) (string, error) { runtimePods := corev1.PodList{} - filterLabels := client.MatchingLabels{common.RayClusterLabelKey: instance.Name, common.RayNodeTypeLabelKey: string(rayv1.HeadNode)} + filterLabels := client.MatchingLabels{utils.RayClusterLabelKey: instance.Name, utils.RayNodeTypeLabelKey: string(rayv1.HeadNode)} if err := r.List(ctx, &runtimePods, client.InNamespace(instance.Namespace), filterLabels); err != nil { r.Log.Error(err, "Failed to list pods while getting head pod ip.") return "", err @@ -1262,8 +1262,8 @@ func (r *RayClusterReconciler) updateEndpoints(ctx context.Context, instance *ra // and picking the first one. We may need to select by name in the future if the Service naming is stable. 
rayHeadSvc := corev1.ServiceList{} filterLabels := client.MatchingLabels{ - common.RayClusterLabelKey: instance.Name, - common.RayNodeTypeLabelKey: "head", + utils.RayClusterLabelKey: instance.Name, + utils.RayNodeTypeLabelKey: "head", } if err := r.List(ctx, &rayHeadSvc, client.InNamespace(instance.Namespace), filterLabels); err != nil { return err diff --git a/ray-operator/controllers/ray/raycluster_controller_fake_test.go b/ray-operator/controllers/ray/raycluster_controller_fake_test.go index f7d7418e7c..baa6765cb4 100644 --- a/ray-operator/controllers/ray/raycluster_controller_fake_test.go +++ b/ray-operator/controllers/ray/raycluster_controller_fake_test.go @@ -84,10 +84,10 @@ func setupTest(t *testing.T) { Name: "headNode", Namespace: namespaceStr, Labels: map[string]string{ - common.RayNodeLabelKey: "yes", - common.RayClusterLabelKey: instanceName, - common.RayNodeTypeLabelKey: string(rayv1.HeadNode), - common.RayNodeGroupLabelKey: headGroupNameStr, + utils.RayNodeLabelKey: "yes", + utils.RayClusterLabelKey: instanceName, + utils.RayNodeTypeLabelKey: string(rayv1.HeadNode), + utils.RayNodeGroupLabelKey: headGroupNameStr, }, }, Spec: corev1.PodSpec{ @@ -116,9 +116,9 @@ func setupTest(t *testing.T) { Name: "pod1", Namespace: namespaceStr, Labels: map[string]string{ - common.RayNodeLabelKey: "yes", - common.RayClusterLabelKey: instanceName, - common.RayNodeGroupLabelKey: groupNameStr, + utils.RayNodeLabelKey: "yes", + utils.RayClusterLabelKey: instanceName, + utils.RayNodeGroupLabelKey: groupNameStr, }, }, Spec: corev1.PodSpec{ @@ -146,9 +146,9 @@ func setupTest(t *testing.T) { Name: "pod2", Namespace: namespaceStr, Labels: map[string]string{ - common.RayNodeLabelKey: "yes", - common.RayClusterLabelKey: instanceName, - common.RayNodeGroupLabelKey: groupNameStr, + utils.RayNodeLabelKey: "yes", + utils.RayClusterLabelKey: instanceName, + utils.RayNodeGroupLabelKey: groupNameStr, }, }, Spec: corev1.PodSpec{ @@ -176,9 +176,9 @@ func setupTest(t *testing.T) { Name: "pod3", Namespace: namespaceStr, Labels: map[string]string{ - common.RayNodeLabelKey: "yes", - common.RayClusterLabelKey: instanceName, - common.RayNodeGroupLabelKey: groupNameStr, + utils.RayNodeLabelKey: "yes", + utils.RayClusterLabelKey: instanceName, + utils.RayNodeGroupLabelKey: groupNameStr, }, }, Spec: corev1.PodSpec{ @@ -206,9 +206,9 @@ func setupTest(t *testing.T) { Name: "pod4", Namespace: namespaceStr, Labels: map[string]string{ - common.RayNodeLabelKey: "yes", - common.RayClusterLabelKey: instanceName, - common.RayNodeGroupLabelKey: groupNameStr, + utils.RayNodeLabelKey: "yes", + utils.RayClusterLabelKey: instanceName, + utils.RayNodeGroupLabelKey: groupNameStr, }, }, Spec: corev1.PodSpec{ @@ -236,9 +236,9 @@ func setupTest(t *testing.T) { Name: "pod5", Namespace: namespaceStr, Labels: map[string]string{ - common.RayNodeLabelKey: "yes", - common.RayClusterLabelKey: instanceName, - common.RayNodeGroupLabelKey: groupNameStr, + utils.RayNodeLabelKey: "yes", + utils.RayClusterLabelKey: instanceName, + utils.RayNodeGroupLabelKey: groupNameStr, }, }, Spec: corev1.PodSpec{ @@ -268,10 +268,10 @@ func setupTest(t *testing.T) { Name: "headNode", Namespace: namespaceStr, Labels: map[string]string{ - common.RayNodeLabelKey: "yes", - common.RayClusterLabelKey: instanceName, - common.RayNodeTypeLabelKey: string(rayv1.HeadNode), - common.RayNodeGroupLabelKey: headGroupNameStr, + utils.RayNodeLabelKey: "yes", + utils.RayClusterLabelKey: instanceName, + utils.RayNodeTypeLabelKey: string(rayv1.HeadNode), + utils.RayNodeGroupLabelKey: 
headGroupNameStr, }, }, Status: corev1.PodStatus{ @@ -372,19 +372,19 @@ func setupTest(t *testing.T) { instanceReqValue := []string{instanceName} instanceReq, err := labels.NewRequirement( - common.RayClusterLabelKey, + utils.RayClusterLabelKey, selection.Equals, instanceReqValue) assert.Nil(t, err, "Fail to create requirement") groupNameReqValue := []string{groupNameStr} groupNameReq, err := labels.NewRequirement( - common.RayNodeGroupLabelKey, + utils.RayNodeGroupLabelKey, selection.Equals, groupNameReqValue) assert.Nil(t, err, "Fail to create requirement") headNameReqValue := []string{headGroupNameStr} headNameReq, err := labels.NewRequirement( - common.RayNodeGroupLabelKey, + utils.RayNodeGroupLabelKey, selection.Equals, headNameReqValue) assert.Nil(t, err, "Fail to create requirement") @@ -401,7 +401,7 @@ func TestReconcile_RemoveWorkersToDelete_RandomDelete(t *testing.T) { // This test makes some assumptions about the testRayCluster object. // (1) 1 workerGroup (2) The goal state of the workerGroup is 3 replicas. (3) ENABLE_RANDOM_POD_DELETE is set to true. - defer os.Unsetenv(common.ENABLE_RANDOM_POD_DELETE) + defer os.Unsetenv(utils.ENABLE_RANDOM_POD_DELETE) assert.Equal(t, 1, len(testRayCluster.Spec.WorkerGroupSpecs), "This test assumes only one worker group.") expectedNumWorkerPods := int(*testRayCluster.Spec.WorkerGroupSpecs[0].Replicas) @@ -412,7 +412,7 @@ func TestReconcile_RemoveWorkersToDelete_RandomDelete(t *testing.T) { // Case 2: If Autoscaler is enabled, we will respect the value of the feature flag. If the feature flag environment variable // is not set, we will disable random Pod deletion by default. // Here, we enable the Autoscaler and set the feature flag `ENABLE_RANDOM_POD_DELETE` to true to enable random Pod deletion. - os.Setenv(common.ENABLE_RANDOM_POD_DELETE, "true") + os.Setenv(utils.ENABLE_RANDOM_POD_DELETE, "true") enableInTreeAutoscaling := true tests := map[string]struct { @@ -521,7 +521,7 @@ func TestReconcile_RemoveWorkersToDelete_NoRandomDelete(t *testing.T) { // This test makes some assumptions about the testRayCluster object. // (1) 1 workerGroup (2) The goal state of the workerGroup is 3 replicas. (3) Disable random Pod deletion. - defer os.Unsetenv(common.ENABLE_RANDOM_POD_DELETE) + defer os.Unsetenv(utils.ENABLE_RANDOM_POD_DELETE) assert.Equal(t, 1, len(testRayCluster.Spec.WorkerGroupSpecs), "This test assumes only one worker group.") expectedNumWorkerPods := int(*testRayCluster.Spec.WorkerGroupSpecs[0].Replicas) @@ -531,7 +531,7 @@ func TestReconcile_RemoveWorkersToDelete_NoRandomDelete(t *testing.T) { // is not set, we will disable random Pod deletion by default. Hence, this test will disable random Pod deletion. // In this case, the cluster won't achieve the target state (i.e., `expectedNumWorkerPods` worker Pods) in one reconciliation. // Instead, the Ray Autoscaler will gradually scale down the cluster in subsequent reconciliations until it reaches the target state. - os.Unsetenv(common.ENABLE_RANDOM_POD_DELETE) + os.Unsetenv(utils.ENABLE_RANDOM_POD_DELETE) enableInTreeAutoscaling := true tests := map[string]struct { @@ -830,7 +830,7 @@ func TestReconcile_Diff0_WorkersToDelete_OK(t *testing.T) { func TestReconcile_PodCrash_DiffLess0_OK(t *testing.T) { setupTest(t) - defer os.Unsetenv(common.ENABLE_RANDOM_POD_DELETE) + defer os.Unsetenv(utils.ENABLE_RANDOM_POD_DELETE) // TODO (kevin85421): The tests in this file are not independent. As a workaround, // I added the assertion to prevent the test logic from being affected by other changes. 
// However, we should refactor the tests in the future. @@ -885,9 +885,9 @@ func TestReconcile_PodCrash_DiffLess0_OK(t *testing.T) { } if tc.ENABLE_RANDOM_POD_DELETE { - os.Setenv(common.ENABLE_RANDOM_POD_DELETE, "true") + os.Setenv(utils.ENABLE_RANDOM_POD_DELETE, "true") } else { - os.Setenv(common.ENABLE_RANDOM_POD_DELETE, "false") + os.Setenv(utils.ENABLE_RANDOM_POD_DELETE, "false") } cluster := testRayCluster.DeepCopy() // Case 1: ENABLE_RANDOM_POD_DELETE is true. @@ -978,8 +978,8 @@ func TestReconcileHeadService(t *testing.T) { Name: "head-svc-1", Namespace: cluster.Namespace, Labels: map[string]string{ - common.RayClusterLabelKey: cluster.Name, - common.RayNodeTypeLabelKey: string(rayv1.HeadNode), + utils.RayClusterLabelKey: cluster.Name, + utils.RayNodeTypeLabelKey: string(rayv1.HeadNode), }, }, } @@ -991,8 +991,8 @@ func TestReconcileHeadService(t *testing.T) { fakeClient := clientFake.NewClientBuilder().WithScheme(newScheme).WithRuntimeObjects(runtimeObjects...).Build() ctx := context.TODO() headServiceSelector := labels.SelectorFromSet(map[string]string{ - common.RayClusterLabelKey: cluster.Name, - common.RayNodeTypeLabelKey: string(rayv1.HeadNode), + utils.RayClusterLabelKey: cluster.Name, + utils.RayNodeTypeLabelKey: string(rayv1.HeadNode), }) // Initialize RayCluster reconciler. @@ -1249,9 +1249,9 @@ func TestGetHeadPodIP(t *testing.T) { Name: "unexpectedExtraHeadNode", Namespace: namespaceStr, Labels: map[string]string{ - common.RayClusterLabelKey: instanceName, - common.RayNodeTypeLabelKey: string(rayv1.HeadNode), - common.RayNodeGroupLabelKey: headGroupNameStr, + utils.RayClusterLabelKey: instanceName, + utils.RayNodeTypeLabelKey: string(rayv1.HeadNode), + utils.RayNodeGroupLabelKey: headGroupNameStr, }, }, } @@ -1574,8 +1574,8 @@ func TestCalculateStatus(t *testing.T) { Name: "headNode", Namespace: namespaceStr, Labels: map[string]string{ - common.RayClusterLabelKey: instanceName, - common.RayNodeTypeLabelKey: string(rayv1.HeadNode), + utils.RayClusterLabelKey: instanceName, + utils.RayNodeTypeLabelKey: string(rayv1.HeadNode), }, }, Status: corev1.PodStatus{ @@ -1843,7 +1843,7 @@ func Test_RunningPods_RayContainerTerminated(t *testing.T) { podList.Items[0].Status.Phase = corev1.PodRunning podList.Items[0].Status.ContainerStatuses = []corev1.ContainerStatus{ { - Name: podList.Items[0].Spec.Containers[common.RayContainerIndex].Name, + Name: podList.Items[0].Spec.Containers[utils.RayContainerIndex].Name, State: corev1.ContainerState{ Terminated: &corev1.ContainerStateTerminated{}, }, @@ -1975,7 +1975,7 @@ func Test_ShouldDeletePod(t *testing.T) { func Test_RedisCleanupFeatureFlag(t *testing.T) { setupTest(t) - defer os.Unsetenv(common.ENABLE_GCS_FT_REDIS_CLEANUP) + defer os.Unsetenv(utils.ENABLE_GCS_FT_REDIS_CLEANUP) newScheme := runtime.NewScheme() _ = rayv1.AddToScheme(newScheme) @@ -1986,7 +1986,7 @@ func Test_RedisCleanupFeatureFlag(t *testing.T) { if gcsFTEnabledCluster.Annotations == nil { gcsFTEnabledCluster.Annotations = make(map[string]string) } - gcsFTEnabledCluster.Annotations[common.RayFTEnabledAnnotationKey] = "true" + gcsFTEnabledCluster.Annotations[utils.RayFTEnabledAnnotationKey] = "true" gcsFTEnabledCluster.Spec.EnableInTreeAutoscaling = nil ctx := context.Background() @@ -2014,9 +2014,9 @@ func Test_RedisCleanupFeatureFlag(t *testing.T) { for name, tc := range tests { t.Run(name, func(t *testing.T) { if tc.enableGCSFTRedisCleanup == "unset" { - os.Unsetenv(common.ENABLE_GCS_FT_REDIS_CLEANUP) + os.Unsetenv(utils.ENABLE_GCS_FT_REDIS_CLEANUP) } else { - 
os.Setenv(common.ENABLE_GCS_FT_REDIS_CLEANUP, tc.enableGCSFTRedisCleanup) + os.Setenv(utils.ENABLE_GCS_FT_REDIS_CLEANUP, tc.enableGCSFTRedisCleanup) } cluster := gcsFTEnabledCluster.DeepCopy() @@ -2063,7 +2063,7 @@ func Test_RedisCleanupFeatureFlag(t *testing.T) { assert.Equal(t, 1, len(rayClusterList.Items)) assert.Equal(t, tc.expectedNumFinalizers, len(rayClusterList.Items[0].Finalizers)) if tc.expectedNumFinalizers > 0 { - assert.True(t, controllerutil.ContainsFinalizer(&rayClusterList.Items[0], common.GCSFaultToleranceRedisCleanupFinalizer)) + assert.True(t, controllerutil.ContainsFinalizer(&rayClusterList.Items[0], utils.GCSFaultToleranceRedisCleanupFinalizer)) // No Pod should be created before adding the GCS FT Redis cleanup finalizer. podList := corev1.PodList{} @@ -2099,11 +2099,11 @@ func Test_RedisCleanup(t *testing.T) { if gcsFTEnabledCluster.Annotations == nil { gcsFTEnabledCluster.Annotations = make(map[string]string) } - gcsFTEnabledCluster.Annotations[common.RayFTEnabledAnnotationKey] = "true" + gcsFTEnabledCluster.Annotations[utils.RayFTEnabledAnnotationKey] = "true" gcsFTEnabledCluster.Spec.EnableInTreeAutoscaling = nil // Add the Redis cleanup finalizer to the RayCluster and modify the RayCluster's DeleteTimestamp to trigger the Redis cleanup. - controllerutil.AddFinalizer(gcsFTEnabledCluster, common.GCSFaultToleranceRedisCleanupFinalizer) + controllerutil.AddFinalizer(gcsFTEnabledCluster, utils.GCSFaultToleranceRedisCleanupFinalizer) now := metav1.Now() gcsFTEnabledCluster.DeletionTimestamp = &now @@ -2115,9 +2115,9 @@ func Test_RedisCleanup(t *testing.T) { Name: "headNode", Namespace: gcsFTEnabledCluster.Namespace, Labels: map[string]string{ - common.RayClusterLabelKey: gcsFTEnabledCluster.Name, - common.RayNodeTypeLabelKey: string(rayv1.HeadNode), - common.RayNodeGroupLabelKey: headGroupName, + utils.RayClusterLabelKey: gcsFTEnabledCluster.Name, + utils.RayNodeTypeLabelKey: string(rayv1.HeadNode), + utils.RayNodeGroupLabelKey: headGroupName, }, }, } @@ -2126,9 +2126,9 @@ func Test_RedisCleanup(t *testing.T) { Name: "workerNode", Namespace: gcsFTEnabledCluster.Namespace, Labels: map[string]string{ - common.RayClusterLabelKey: gcsFTEnabledCluster.Name, - common.RayNodeTypeLabelKey: string(rayv1.WorkerNode), - common.RayNodeGroupLabelKey: gcsFTEnabledCluster.Spec.WorkerGroupSpecs[0].GroupName, + utils.RayClusterLabelKey: gcsFTEnabledCluster.Name, + utils.RayNodeTypeLabelKey: string(rayv1.WorkerNode), + utils.RayNodeGroupLabelKey: gcsFTEnabledCluster.Spec.WorkerGroupSpecs[0].GroupName, }, }, } @@ -2181,9 +2181,9 @@ func Test_RedisCleanup(t *testing.T) { headPods := corev1.PodList{} err := fakeClient.List(ctx, &headPods, client.InNamespace(namespaceStr), client.MatchingLabels{ - common.RayClusterLabelKey: cluster.Name, - common.RayNodeGroupLabelKey: headGroupName, - common.RayNodeTypeLabelKey: string(rayv1.HeadNode), + utils.RayClusterLabelKey: cluster.Name, + utils.RayNodeGroupLabelKey: headGroupName, + utils.RayNodeTypeLabelKey: string(rayv1.HeadNode), }) assert.Nil(t, err) assert.Equal(t, 1, len(headPods.Items)) @@ -2192,9 +2192,9 @@ func Test_RedisCleanup(t *testing.T) { workerPods := corev1.PodList{} err := fakeClient.List(ctx, &workerPods, client.InNamespace(namespaceStr), client.MatchingLabels{ - common.RayClusterLabelKey: cluster.Name, - common.RayNodeGroupLabelKey: cluster.Spec.WorkerGroupSpecs[0].GroupName, - common.RayNodeTypeLabelKey: string(rayv1.WorkerNode), + utils.RayClusterLabelKey: cluster.Name, + utils.RayNodeGroupLabelKey: 
cluster.Spec.WorkerGroupSpecs[0].GroupName, + utils.RayNodeTypeLabelKey: string(rayv1.WorkerNode), }) assert.Nil(t, err) assert.Equal(t, 1, len(workerPods.Items)) @@ -2229,7 +2229,7 @@ func Test_RedisCleanup(t *testing.T) { err = fakeClient.List(ctx, &rayClusterList, client.InNamespace(namespaceStr)) assert.Nil(t, err, "Fail to get RayCluster list") assert.Equal(t, 1, len(rayClusterList.Items)) - assert.True(t, controllerutil.ContainsFinalizer(&rayClusterList.Items[0], common.GCSFaultToleranceRedisCleanupFinalizer)) + assert.True(t, controllerutil.ContainsFinalizer(&rayClusterList.Items[0], utils.GCSFaultToleranceRedisCleanupFinalizer)) // Simulate the Job succeeded. job := jobList.Items[0] diff --git a/ray-operator/controllers/ray/raycluster_controller_test.go b/ray-operator/controllers/ray/raycluster_controller_test.go index 77f58f2c83..1cfb46dff6 100644 --- a/ray-operator/controllers/ray/raycluster_controller_test.go +++ b/ray-operator/controllers/ray/raycluster_controller_test.go @@ -22,7 +22,6 @@ import ( "reflect" "time" - "github.com/ray-project/kuberay/ray-operator/controllers/ray/common" "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils" . "github.com/onsi/ginkgo/v2" @@ -119,8 +118,8 @@ var _ = Context("Inside the default namespace", func() { }, } - headFilterLabels := client.MatchingLabels{common.RayClusterLabelKey: myRayCluster.Name, common.RayNodeGroupLabelKey: "headgroup"} - workerFilterLabels := client.MatchingLabels{common.RayClusterLabelKey: myRayCluster.Name, common.RayNodeGroupLabelKey: "small-group"} + headFilterLabels := client.MatchingLabels{utils.RayClusterLabelKey: myRayCluster.Name, utils.RayNodeGroupLabelKey: "headgroup"} + workerFilterLabels := client.MatchingLabels{utils.RayClusterLabelKey: myRayCluster.Name, utils.RayNodeGroupLabelKey: "small-group"} Describe("When creating a raycluster", func() { It("should create a raycluster object", func() { @@ -139,7 +138,7 @@ var _ = Context("Inside the default namespace", func() { Eventually( getResourceFunc(ctx, client.ObjectKey{Name: "raycluster-sample-head-svc", Namespace: "default"}, svc), time.Second*15, time.Millisecond*500).Should(BeNil(), "My head service = %v", svc) - Expect(svc.Spec.Selector[common.RayIDLabelKey]).Should(Equal(utils.GenerateIdentifier(myRayCluster.Name, rayv1.HeadNode))) + Expect(svc.Spec.Selector[utils.RayIDLabelKey]).Should(Equal(utils.GenerateIdentifier(myRayCluster.Name, rayv1.HeadNode))) }) It("should create 3 workers", func() { @@ -203,7 +202,7 @@ var _ = Context("Inside the default namespace", func() { It("cluster's .status.state should be updated to 'ready' shortly after all Pods are Running", func() { Eventually( getClusterState(ctx, "default", myRayCluster.Name), - time.Second*(common.RAYCLUSTER_DEFAULT_REQUEUE_SECONDS+5), time.Millisecond*500).Should(Equal(rayv1.Ready)) + time.Second*(utils.RAYCLUSTER_DEFAULT_REQUEUE_SECONDS+5), time.Millisecond*500).Should(Equal(rayv1.Ready)) }) It("should re-create a deleted worker", func() { diff --git a/ray-operator/controllers/ray/rayjob_controller.go b/ray-operator/controllers/ray/rayjob_controller.go index 575056ef23..5ec82cd9b9 100644 --- a/ray-operator/controllers/ray/rayjob_controller.go +++ b/ray-operator/controllers/ray/rayjob_controller.go @@ -91,9 +91,9 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request) if rayJobInstance.ObjectMeta.DeletionTimestamp.IsZero() { // The object is not being deleted, so if it does not have our finalizer, // then lets add the finalizer and update the object. 
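// A minimal sketch, for reference, of the finalizer lifecycle that the RayJob hunks
// below and the later "Remove the finalizer" hunk rename; the constant is now read from
// the utils package. The helper name ensureStopJobFinalizer is illustrative, and names
// such as rayJob, r, ctx, and RayJobDefaultRequeueDuration mirror the surrounding file;
// imports are assumed from context, sigs.k8s.io/controller-runtime (ctrl), and
// sigs.k8s.io/controller-runtime/pkg/controller/controllerutil.
func (r *RayJobReconciler) ensureStopJobFinalizer(ctx context.Context, rayJob *rayv1.RayJob) (ctrl.Result, error) {
	if rayJob.ObjectMeta.DeletionTimestamp.IsZero() {
		// Live object: add the finalizer once so a later deletion can stop the Ray job first.
		if !controllerutil.ContainsFinalizer(rayJob, utils.RayJobStopJobFinalizer) {
			controllerutil.AddFinalizer(rayJob, utils.RayJobStopJobFinalizer)
			if err := r.Update(ctx, rayJob); err != nil {
				return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err
			}
		}
		return ctrl.Result{}, nil
	}
	// Object is being deleted: the finalizer is removed whether or not StopJob() succeeded.
	controllerutil.RemoveFinalizer(rayJob, utils.RayJobStopJobFinalizer)
	if err := r.Update(ctx, rayJob); err != nil {
		return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err
	}
	return ctrl.Result{}, nil
}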
- if !controllerutil.ContainsFinalizer(rayJobInstance, common.RayJobStopJobFinalizer) { - r.Log.Info("Add a finalizer", "finalizer", common.RayJobStopJobFinalizer) - controllerutil.AddFinalizer(rayJobInstance, common.RayJobStopJobFinalizer) + if !controllerutil.ContainsFinalizer(rayJobInstance, utils.RayJobStopJobFinalizer) { + r.Log.Info("Add a finalizer", "finalizer", utils.RayJobStopJobFinalizer) + controllerutil.AddFinalizer(rayJobInstance, utils.RayJobStopJobFinalizer) if err := r.Update(ctx, rayJobInstance); err != nil { r.Log.Error(err, "Failed to update RayJob with finalizer") return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err @@ -110,8 +110,8 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request) } } - r.Log.Info("Remove the finalizer no matter StopJob() succeeds or not.", "finalizer", common.RayJobStopJobFinalizer) - controllerutil.RemoveFinalizer(rayJobInstance, common.RayJobStopJobFinalizer) + r.Log.Info("Remove the finalizer no matter StopJob() succeeds or not.", "finalizer", utils.RayJobStopJobFinalizer) + controllerutil.RemoveFinalizer(rayJobInstance, utils.RayJobStopJobFinalizer) err := r.Update(ctx, rayJobInstance) if err != nil { r.Log.Error(err, "Failed to remove finalizer for RayJob") @@ -190,7 +190,7 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request) rayDashboardClient := utils.GetRayDashboardClientFunc() if clientURL := rayJobInstance.Status.DashboardURL; clientURL == "" { // TODO: dashboard service may be changed. Check it instead of using the same URL always - if clientURL, err = utils.FetchHeadServiceURL(ctx, &r.Log, r.Client, rayClusterInstance, common.DashboardPortName); err != nil || clientURL == "" { + if clientURL, err = utils.FetchHeadServiceURL(ctx, &r.Log, r.Client, rayClusterInstance, utils.DashboardPortName); err != nil || clientURL == "" { if clientURL == "" { err = fmt.Errorf("empty dashboardURL") } @@ -379,7 +379,7 @@ func (r *RayJobReconciler) getSubmitterTemplate(rayJobInstance *rayv1.RayJob, ra } // If the command in the submitter pod template isn't set, use the default command. - if len(submitterTemplate.Spec.Containers[common.RayContainerIndex].Command) == 0 { + if len(submitterTemplate.Spec.Containers[utils.RayContainerIndex].Command) == 0 { // Check for deprecated 'runtimeEnv' field usage and log a warning. if len(rayJobInstance.Spec.RuntimeEnv) > 0 { r.Log.Info("Warning: The 'runtimeEnv' field is deprecated. Please use 'runtimeEnvYAML' instead.") @@ -389,14 +389,14 @@ func (r *RayJobReconciler) getSubmitterTemplate(rayJobInstance *rayv1.RayJob, ra if err != nil { return corev1.PodTemplateSpec{}, err } - submitterTemplate.Spec.Containers[common.RayContainerIndex].Command = k8sJobCommand + submitterTemplate.Spec.Containers[utils.RayContainerIndex].Command = k8sJobCommand r.Log.Info("No command is specified in the user-provided template. 
Default command is used", "command", k8sJobCommand) } else { - r.Log.Info("User-provided command is used", "command", submitterTemplate.Spec.Containers[common.RayContainerIndex].Command) + r.Log.Info("User-provided command is used", "command", submitterTemplate.Spec.Containers[utils.RayContainerIndex].Command) } // Set PYTHONUNBUFFERED=1 for real-time logging - submitterTemplate.Spec.Containers[common.RayContainerIndex].Env = append(submitterTemplate.Spec.Containers[common.RayContainerIndex].Env, corev1.EnvVar{ + submitterTemplate.Spec.Containers[utils.RayContainerIndex].Env = append(submitterTemplate.Spec.Containers[utils.RayContainerIndex].Env, corev1.EnvVar{ Name: PythonUnbufferedEnvVarName, Value: "1", }) @@ -411,7 +411,7 @@ func (r *RayJobReconciler) createNewK8sJob(ctx context.Context, rayJobInstance * Name: rayJobInstance.Name, Namespace: rayJobInstance.Namespace, Labels: map[string]string{ - common.KubernetesCreatedByLabelKey: common.ComponentName, + utils.KubernetesCreatedByLabelKey: utils.ComponentName, }, }, Spec: batchv1.JobSpec{ diff --git a/ray-operator/controllers/ray/rayjob_controller_suspended_test.go b/ray-operator/controllers/ray/rayjob_controller_suspended_test.go index a110647510..3d62a18d04 100644 --- a/ray-operator/controllers/ray/rayjob_controller_suspended_test.go +++ b/ray-operator/controllers/ray/rayjob_controller_suspended_test.go @@ -23,7 +23,7 @@ import ( "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/resource" - "github.com/ray-project/kuberay/ray-operator/controllers/ray/common" + "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" @@ -235,8 +235,8 @@ var _ = Context("Inside the default namespace", func() { It("should create 3 workers", func() { Eventually( listResourceFunc(ctx, &workerPods, client.MatchingLabels{ - common.RayClusterLabelKey: mySuspendedRayCluster.Name, - common.RayNodeGroupLabelKey: "small-group", + utils.RayClusterLabelKey: mySuspendedRayCluster.Name, + utils.RayNodeGroupLabelKey: "small-group", }, &client.ListOptions{Namespace: "default"}), time.Second*15, time.Millisecond*500).Should(Equal(3), fmt.Sprintf("workerGroup %v", workerPods.Items)) @@ -248,8 +248,8 @@ var _ = Context("Inside the default namespace", func() { It("should create a head pod resource", func() { err := k8sClient.List(ctx, &headPods, client.MatchingLabels{ - common.RayClusterLabelKey: mySuspendedRayCluster.Name, - common.RayNodeGroupLabelKey: "headgroup", + utils.RayClusterLabelKey: mySuspendedRayCluster.Name, + utils.RayNodeGroupLabelKey: "headgroup", }, &client.ListOptions{Namespace: "default"}, client.InNamespace(mySuspendedRayCluster.Namespace)) @@ -292,8 +292,8 @@ var _ = Context("Inside the default namespace", func() { Eventually( isAllPodsRunning(ctx, headPods, client.MatchingLabels{ - common.RayClusterLabelKey: mySuspendedRayCluster.Name, - common.RayNodeGroupLabelKey: "headgroup", + utils.RayClusterLabelKey: mySuspendedRayCluster.Name, + utils.RayNodeGroupLabelKey: "headgroup", }, "default"), time.Second*15, time.Millisecond*500).Should(Equal(true), "Head Pod should be running.") @@ -303,7 +303,7 @@ var _ = Context("Inside the default namespace", func() { } Eventually( - isAllPodsRunning(ctx, workerPods, client.MatchingLabels{common.RayClusterLabelKey: mySuspendedRayCluster.Name, common.RayNodeGroupLabelKey: "small-group"}, "default"), + isAllPodsRunning(ctx, workerPods, client.MatchingLabels{utils.RayClusterLabelKey: mySuspendedRayCluster.Name, 
utils.RayNodeGroupLabelKey: "small-group"}, "default"), time.Second*15, time.Millisecond*500).Should(Equal(true), "All worker Pods should be running.") }) diff --git a/ray-operator/controllers/ray/rayjob_controller_test.go b/ray-operator/controllers/ray/rayjob_controller_test.go index 3a173675c9..c79074b9f5 100644 --- a/ray-operator/controllers/ray/rayjob_controller_test.go +++ b/ray-operator/controllers/ray/rayjob_controller_test.go @@ -23,7 +23,6 @@ import ( "k8s.io/apimachinery/pkg/api/resource" - "github.com/ray-project/kuberay/ray-operator/controllers/ray/common" "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils" . "github.com/onsi/ginkgo/v2" @@ -198,7 +197,7 @@ var _ = Context("Inside the default namespace", func() { }) It("Should create a number of workers equal to the replica setting", func() { - filterLabels := client.MatchingLabels{common.RayClusterLabelKey: myRayJob.Status.RayClusterName, common.RayNodeGroupLabelKey: myRayJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].GroupName} + filterLabels := client.MatchingLabels{utils.RayClusterLabelKey: myRayJob.Status.RayClusterName, utils.RayNodeGroupLabelKey: myRayJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].GroupName} workerPods := corev1.PodList{} Eventually( listResourceFunc(ctx, &workerPods, filterLabels, &client.ListOptions{Namespace: "default"}), @@ -227,7 +226,7 @@ var _ = Context("Inside the default namespace", func() { Expect(err).NotTo(HaveOccurred(), "failed to update RayJob") // confirm the number of worker pods increased. - filterLabels := client.MatchingLabels{common.RayClusterLabelKey: myRayJob.Status.RayClusterName, common.RayNodeGroupLabelKey: myRayJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].GroupName} + filterLabels := client.MatchingLabels{utils.RayClusterLabelKey: myRayJob.Status.RayClusterName, utils.RayNodeGroupLabelKey: myRayJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].GroupName} workerPods := corev1.PodList{} Eventually( listResourceFunc(ctx, &workerPods, filterLabels, &client.ListOptions{Namespace: "default"}), @@ -315,7 +314,7 @@ var _ = Context("Inside the default namespace with autoscaler", func() { Expect(err).NotTo(HaveOccurred(), "failed to update RayCluster replica") // confirm a new worker pod is created. - filterLabels := client.MatchingLabels{common.RayClusterLabelKey: myRayJob.Status.RayClusterName, common.RayNodeGroupLabelKey: myRayJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].GroupName} + filterLabels := client.MatchingLabels{utils.RayClusterLabelKey: myRayJob.Status.RayClusterName, utils.RayNodeGroupLabelKey: myRayJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].GroupName} workerPods := corev1.PodList{} Eventually( listResourceFunc(ctx, &workerPods, filterLabels, &client.ListOptions{Namespace: "default"}), @@ -352,7 +351,7 @@ var _ = Context("Inside the default namespace with autoscaler", func() { Expect(err).NotTo(HaveOccurred(), "failed to update RayJob") // confirm the number of worker pods is not changed. 
- filterLabels := client.MatchingLabels{common.RayClusterLabelKey: myRayJob.Status.RayClusterName, common.RayNodeGroupLabelKey: myRayJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].GroupName} + filterLabels := client.MatchingLabels{utils.RayClusterLabelKey: myRayJob.Status.RayClusterName, utils.RayNodeGroupLabelKey: myRayJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].GroupName} workerPods := corev1.PodList{} Consistently( listResourceFunc(ctx, &workerPods, filterLabels, &client.ListOptions{Namespace: "default"}), diff --git a/ray-operator/controllers/ray/rayjob_controller_unit_test.go b/ray-operator/controllers/ray/rayjob_controller_unit_test.go index 4625bf2768..0530ba1878 100644 --- a/ray-operator/controllers/ray/rayjob_controller_unit_test.go +++ b/ray-operator/controllers/ray/rayjob_controller_unit_test.go @@ -5,7 +5,6 @@ import ( "testing" rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" - "github.com/ray-project/kuberay/ray-operator/controllers/ray/common" utils "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils" "github.com/stretchr/testify/assert" batchv1 "k8s.io/api/batch/v1" @@ -151,25 +150,25 @@ func TestGetSubmitterTemplate(t *testing.T) { // Test 1: User provided template with command submitterTemplate, err := r.getSubmitterTemplate(rayJobInstanceWithTemplate, nil) assert.NoError(t, err) - assert.Equal(t, "user-command", submitterTemplate.Spec.Containers[common.RayContainerIndex].Command[0]) + assert.Equal(t, "user-command", submitterTemplate.Spec.Containers[utils.RayContainerIndex].Command[0]) // Test 2: User provided template without command - rayJobInstanceWithTemplate.Spec.SubmitterPodTemplate.Spec.Containers[common.RayContainerIndex].Command = []string{} + rayJobInstanceWithTemplate.Spec.SubmitterPodTemplate.Spec.Containers[utils.RayContainerIndex].Command = []string{} submitterTemplate, err = r.getSubmitterTemplate(rayJobInstanceWithTemplate, nil) assert.NoError(t, err) - assert.Equal(t, []string{"ray", "job", "submit", "--address", "http://test-url", "--", "echo", "hello", "world"}, submitterTemplate.Spec.Containers[common.RayContainerIndex].Command) + assert.Equal(t, []string{"ray", "job", "submit", "--address", "http://test-url", "--", "echo", "hello", "world"}, submitterTemplate.Spec.Containers[utils.RayContainerIndex].Command) // Test 3: User did not provide template, should use the image of the Ray Head submitterTemplate, err = r.getSubmitterTemplate(rayJobInstanceWithoutTemplate, rayClusterInstance) assert.NoError(t, err) - assert.Equal(t, []string{"ray", "job", "submit", "--address", "http://test-url", "--", "echo", "hello", "world"}, submitterTemplate.Spec.Containers[common.RayContainerIndex].Command) - assert.Equal(t, "rayproject/ray:custom-version", submitterTemplate.Spec.Containers[common.RayContainerIndex].Image) + assert.Equal(t, []string{"ray", "job", "submit", "--address", "http://test-url", "--", "echo", "hello", "world"}, submitterTemplate.Spec.Containers[utils.RayContainerIndex].Command) + assert.Equal(t, "rayproject/ray:custom-version", submitterTemplate.Spec.Containers[utils.RayContainerIndex].Image) // Test 4: Check default PYTHONUNBUFFERED setting submitterTemplate, err = r.getSubmitterTemplate(rayJobInstanceWithoutTemplate, rayClusterInstance) assert.NoError(t, err) found := false - for _, envVar := range submitterTemplate.Spec.Containers[common.RayContainerIndex].Env { + for _, envVar := range submitterTemplate.Spec.Containers[utils.RayContainerIndex].Env { if envVar.Name == PythonUnbufferedEnvVarName { assert.Equal(t, 
"1", envVar.Value) found = true diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index ae93466aea..3da36c35c6 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -196,7 +196,7 @@ func (r *RayServiceReconciler) Reconcile(ctx context.Context, request ctrl.Reque err = r.updateState(ctx, rayServiceInstance, rayv1.FailedToUpdateIngress, err) return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err } - if err := r.reconcileServices(ctx, rayServiceInstance, rayClusterInstance, common.HeadService); err != nil { + if err := r.reconcileServices(ctx, rayServiceInstance, rayClusterInstance, utils.HeadService); err != nil { err = r.updateState(ctx, rayServiceInstance, rayv1.FailedToUpdateService, err) return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err } @@ -204,7 +204,7 @@ func (r *RayServiceReconciler) Reconcile(ctx context.Context, request ctrl.Reque err = r.updateState(ctx, rayServiceInstance, rayv1.FailedToUpdateServingPodLabel, err) return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err } - if err := r.reconcileServices(ctx, rayServiceInstance, rayClusterInstance, common.ServingService); err != nil { + if err := r.reconcileServices(ctx, rayServiceInstance, rayClusterInstance, utils.ServingService); err != nil { err = r.updateState(ctx, rayServiceInstance, rayv1.FailedToUpdateService, err) return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err } @@ -383,7 +383,7 @@ func (r *RayServiceReconciler) reconcileRayCluster(ctx context.Context, rayServi // cleanUpRayClusterInstance cleans up all the dangling RayCluster instances that are owned by the RayService instance. func (r *RayServiceReconciler) cleanUpRayClusterInstance(ctx context.Context, rayServiceInstance *rayv1.RayService) error { rayClusterList := rayv1.RayClusterList{} - filterLabels := client.MatchingLabels{common.RayServiceLabelKey: rayServiceInstance.Name} + filterLabels := client.MatchingLabels{utils.RayServiceLabelKey: rayServiceInstance.Name} var err error if err = r.List(ctx, &rayClusterList, client.InNamespace(rayServiceInstance.Namespace), filterLabels); err != nil { r.Log.Error(err, "Fail to list RayCluster for "+rayServiceInstance.Name) @@ -472,7 +472,7 @@ func (r *RayServiceReconciler) shouldPrepareNewRayCluster(rayServiceInstance *ra r.Log.Info("No active Ray cluster. RayService operator should prepare a new Ray cluster.") return true } - activeClusterHash := activeRayCluster.ObjectMeta.Annotations[common.RayServiceClusterHashKey] + activeClusterHash := activeRayCluster.ObjectMeta.Annotations[utils.RayServiceClusterHashKey] goalClusterHash, err := generateRayClusterJsonHash(rayServiceInstance.Spec.RayClusterSpec) if err != nil { errContext := "Failed to serialize new RayCluster config. 
" + @@ -577,15 +577,15 @@ func (r *RayServiceReconciler) constructRayClusterForRayService(rayService *rayv for k, v := range rayService.Labels { rayClusterLabel[k] = v } - rayClusterLabel[common.RayServiceLabelKey] = rayService.Name - rayClusterLabel[common.KubernetesCreatedByLabelKey] = common.RayServiceCreatorLabelValue + rayClusterLabel[utils.RayServiceLabelKey] = rayService.Name + rayClusterLabel[utils.KubernetesCreatedByLabelKey] = utils.RayServiceCreatorLabelValue rayClusterAnnotations := make(map[string]string) for k, v := range rayService.Annotations { rayClusterAnnotations[k] = v } - rayClusterAnnotations[common.EnableServeServiceKey] = common.EnableServeServiceTrue - rayClusterAnnotations[common.RayServiceClusterHashKey], err = generateRayClusterJsonHash(rayService.Spec.RayClusterSpec) + rayClusterAnnotations[utils.EnableServeServiceKey] = utils.EnableServeServiceTrue + rayClusterAnnotations[utils.RayServiceClusterHashKey], err = generateRayClusterJsonHash(rayService.Spec.RayClusterSpec) if err != nil { errContext := "Failed to serialize RayCluster config. " + "Manual config updates will NOT be tracked accurately. " + @@ -759,7 +759,7 @@ func (r *RayServiceReconciler) getAndCheckServeStatus(ctx context.Context, dashb "err: %v", err) return false, err } - serveAppStatuses = map[string]*utils.ServeApplicationStatus{common.DefaultServeAppName: singleApplicationStatus} + serveAppStatuses = map[string]*utils.ServeApplicationStatus{utils.DefaultServeAppName: singleApplicationStatus} } else if serveConfigType == utils.MULTI_APP { if serveAppStatuses, err = dashboardClient.GetMultiApplicationStatus(ctx); err != nil { err = fmt.Errorf( @@ -780,7 +780,7 @@ func (r *RayServiceReconciler) getAndCheckServeStatus(ctx context.Context, dashb newApplications := make(map[string]rayv1.AppStatus) for appName, app := range serveAppStatuses { if appName == "" { - appName = common.DefaultServeAppName + appName = utils.DefaultServeAppName } prevApplicationStatus := rayServiceServeStatus.Applications[appName] @@ -920,7 +920,7 @@ func (r *RayServiceReconciler) reconcileIngress(ctx context.Context, rayServiceI return nil } -func (r *RayServiceReconciler) reconcileServices(ctx context.Context, rayServiceInstance *rayv1.RayService, rayClusterInstance *rayv1.RayCluster, serviceType common.ServiceType) error { +func (r *RayServiceReconciler) reconcileServices(ctx context.Context, rayServiceInstance *rayv1.RayService, rayClusterInstance *rayv1.RayCluster, serviceType utils.ServiceType) error { r.Log.Info( "reconcileServices", "serviceType", serviceType, "RayService name", rayServiceInstance.Name, "RayService namespace", rayServiceInstance.Namespace, @@ -930,9 +930,9 @@ func (r *RayServiceReconciler) reconcileServices(ctx context.Context, rayService var err error switch serviceType { - case common.HeadService: + case utils.HeadService: newSvc, err = common.BuildHeadServiceForRayService(*rayServiceInstance, *rayClusterInstance) - case common.ServingService: + case utils.ServingService: newSvc, err = common.BuildServeServiceForRayService(*rayServiceInstance, *rayClusterInstance) default: return fmt.Errorf("unknown service type %v", serviceType) @@ -949,8 +949,8 @@ func (r *RayServiceReconciler) reconcileServices(ctx context.Context, rayService if err == nil { // Only update the service if the RayCluster switches. 
- if newSvc.Spec.Selector[common.RayClusterLabelKey] == oldSvc.Spec.Selector[common.RayClusterLabelKey] { - r.Log.Info(fmt.Sprintf("RayCluster %v's %v has already exists, skip Update", newSvc.Spec.Selector[common.RayClusterLabelKey], serviceType)) + if newSvc.Spec.Selector[utils.RayClusterLabelKey] == oldSvc.Spec.Selector[utils.RayClusterLabelKey] { + r.Log.Info(fmt.Sprintf("RayCluster %v's %v has already exists, skip Update", newSvc.Spec.Selector[utils.RayClusterLabelKey], serviceType)) return nil } @@ -996,7 +996,7 @@ func (r *RayServiceReconciler) updateStatusForActiveCluster(ctx context.Context, var clientURL string rayServiceStatus := &rayServiceInstance.Status.ActiveServiceStatus - if clientURL, err = utils.FetchHeadServiceURL(ctx, &r.Log, r.Client, rayClusterInstance, common.DashboardAgentListenPortName); err != nil || clientURL == "" { + if clientURL, err = utils.FetchHeadServiceURL(ctx, &r.Log, r.Client, rayClusterInstance, utils.DashboardAgentListenPortName); err != nil || clientURL == "" { updateDashboardStatus(rayServiceStatus, false) return err } @@ -1048,7 +1048,7 @@ func (r *RayServiceReconciler) reconcileServe(ctx context.Context, rayServiceIns return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, false, err } - if clientURL, err = utils.FetchHeadServiceURL(ctx, &r.Log, r.Client, rayClusterInstance, common.DashboardAgentListenPortName); err != nil || clientURL == "" { + if clientURL, err = utils.FetchHeadServiceURL(ctx, &r.Log, r.Client, rayClusterInstance, utils.DashboardAgentListenPortName); err != nil || clientURL == "" { return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, false, err } rayDashboardClient := utils.GetRayDashboardClientFunc() @@ -1092,7 +1092,7 @@ func (r *RayServiceReconciler) reconcileServe(ctx context.Context, rayServiceIns func (r *RayServiceReconciler) labelHealthyServePods(ctx context.Context, rayClusterInstance *rayv1.RayCluster) error { allPods := corev1.PodList{} - filterLabels := client.MatchingLabels{common.RayClusterLabelKey: rayClusterInstance.Name} + filterLabels := client.MatchingLabels{utils.RayClusterLabelKey: rayClusterInstance.Name} if err := r.List(ctx, &allPods, client.InNamespace(rayClusterInstance.Namespace), filterLabels); err != nil { return err @@ -1101,8 +1101,8 @@ func (r *RayServiceReconciler) labelHealthyServePods(ctx context.Context, rayClu httpProxyClient := utils.GetRayHttpProxyClientFunc() httpProxyClient.InitClient() for _, pod := range allPods.Items { - rayContainer := pod.Spec.Containers[common.RayContainerIndex] - servingPort := utils.FindContainerPort(&rayContainer, common.ServingPortName, common.DefaultServingPort) + rayContainer := pod.Spec.Containers[utils.RayContainerIndex] + servingPort := utils.FindContainerPort(&rayContainer, utils.ServingPortName, utils.DefaultServingPort) httpProxyClient.SetHostIp(pod.Status.PodIP, servingPort) if pod.Labels == nil { pod.Labels = make(map[string]string) @@ -1115,9 +1115,9 @@ func (r *RayServiceReconciler) labelHealthyServePods(ctx context.Context, rayClu } if httpProxyClient.CheckHealth() == nil { - pod.Labels[common.RayClusterServingServiceLabelKey] = common.EnableRayClusterServingServiceTrue + pod.Labels[utils.RayClusterServingServiceLabelKey] = utils.EnableRayClusterServingServiceTrue } else { - pod.Labels[common.RayClusterServingServiceLabelKey] = common.EnableRayClusterServingServiceFalse + pod.Labels[utils.RayClusterServingServiceLabelKey] = utils.EnableRayClusterServingServiceFalse } if !reflect.DeepEqual(originalLabels, pod.Labels) { @@ -1160,7 
+1160,7 @@ func compareRayClusterJsonHash(spec1 rayv1.RayClusterSpec, spec2 rayv1.RayCluste // isHeadPodRunningAndReady checks if the head pod of the RayCluster is running and ready. func (r *RayServiceReconciler) isHeadPodRunningAndReady(ctx context.Context, instance *rayv1.RayCluster) (bool, error) { podList := corev1.PodList{} - filterLabels := client.MatchingLabels{common.RayClusterLabelKey: instance.Name, common.RayNodeTypeLabelKey: string(rayv1.HeadNode)} + filterLabels := client.MatchingLabels{utils.RayClusterLabelKey: instance.Name, utils.RayNodeTypeLabelKey: string(rayv1.HeadNode)} if err := r.List(ctx, &podList, client.InNamespace(instance.Namespace), filterLabels); err != nil { r.Log.Error(err, "Failed to list the head Pod of the RayCluster %s in the namespace %s", instance.Name, instance.Namespace) diff --git a/ray-operator/controllers/ray/rayservice_controller_test.go b/ray-operator/controllers/ray/rayservice_controller_test.go index 4397294bc0..cbfa95a25e 100644 --- a/ray-operator/controllers/ray/rayservice_controller_test.go +++ b/ray-operator/controllers/ray/rayservice_controller_test.go @@ -21,7 +21,6 @@ import ( "os" "time" - "github.com/ray-project/kuberay/ray-operator/controllers/ray/common" "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils" "k8s.io/apimachinery/pkg/api/resource" @@ -298,7 +297,7 @@ applications: }) It("should create more than 1 worker", func() { - filterLabels := client.MatchingLabels{common.RayClusterLabelKey: myRayService.Status.ActiveServiceStatus.RayClusterName, common.RayNodeGroupLabelKey: "small-group"} + filterLabels := client.MatchingLabels{utils.RayClusterLabelKey: myRayService.Status.ActiveServiceStatus.RayClusterName, utils.RayNodeGroupLabelKey: "small-group"} Eventually( listResourceFunc(ctx, &workerPods, filterLabels, &client.ListOptions{Namespace: "default"}), time.Second*15, time.Millisecond*500).Should(Equal(3), fmt.Sprintf("workerGroup %v", workerPods.Items)) @@ -311,13 +310,13 @@ applications: // Each worker Pod should have a container port with the name "dashboard-agent" exist := false for _, port := range pod.Spec.Containers[0].Ports { - if port.Name == common.DashboardAgentListenPortName { + if port.Name == utils.DashboardAgentListenPortName { exist = true break } } if !exist { - Fail(fmt.Sprintf("Worker Pod %v should have a container port with the name %v", pod.Name, common.DashboardAgentListenPortName)) + Fail(fmt.Sprintf("Worker Pod %v should have a container port with the name %v", pod.Name, utils.DashboardAgentListenPortName)) } } } @@ -336,7 +335,7 @@ applications: Eventually( getResourceFunc(ctx, client.ObjectKey{Name: headSvcName, Namespace: "default"}, svc), time.Second*15, time.Millisecond*500).Should(BeNil(), "My head service = %v", svc) - Expect(svc.Spec.Selector[common.RayIDLabelKey]).Should(Equal(utils.GenerateIdentifier(myRayCluster.Name, rayv1.HeadNode))) + Expect(svc.Spec.Selector[utils.RayIDLabelKey]).Should(Equal(utils.GenerateIdentifier(myRayCluster.Name, rayv1.HeadNode))) }) It("should create a new serve service resource", func() { @@ -344,7 +343,7 @@ applications: Eventually( getResourceFunc(ctx, client.ObjectKey{Name: utils.GenerateServeServiceName(myRayService.Name), Namespace: "default"}, svc), time.Second*15, time.Millisecond*500).Should(BeNil(), "My serve service = %v", svc) - Expect(svc.Spec.Selector[common.RayClusterLabelKey]).Should(Equal(myRayCluster.Name)) + Expect(svc.Spec.Selector[utils.RayClusterLabelKey]).Should(Equal(myRayCluster.Name)) }) It("should update a rayservice object 
and switch to new Ray Cluster", func() { @@ -554,7 +553,7 @@ applications: time.Second*3, time.Millisecond*500).Should(BeTrue(), "myRayService status = %v", myRayService.Status) // Only update the LastUpdateTime and HealthLastUpdateTime fields in the active RayCluster. - oldTime := myRayService.Status.ActiveServiceStatus.Applications[common.DefaultServeAppName].HealthLastUpdateTime.DeepCopy() + oldTime := myRayService.Status.ActiveServiceStatus.Applications[utils.DefaultServeAppName].HealthLastUpdateTime.DeepCopy() fakeRayDashboardClient.SetSingleApplicationStatus(generateServeStatus(rayv1.DeploymentStatusEnum.HEALTHY, rayv1.ApplicationStatusEnum.RUNNING)) // Confirm not switch to a new RayCluster @@ -570,7 +569,7 @@ applications: // Status should not be updated if the only differences are the LastUpdateTime and HealthLastUpdateTime fields. // Unlike the test "Status should be updated if the differences are not only LastUpdateTime and HealthLastUpdateTime fields.", // the status update will not be triggered, so we can check whether the LastUpdateTime/HealthLastUpdateTime fields are updated or not by `oldTime`. - Expect(myRayService.Status.ActiveServiceStatus.Applications[common.DefaultServeAppName].HealthLastUpdateTime).Should(Equal(oldTime), "myRayService status = %v", myRayService.Status) + Expect(myRayService.Status.ActiveServiceStatus.Applications[utils.DefaultServeAppName].HealthLastUpdateTime).Should(Equal(oldTime), "myRayService status = %v", myRayService.Status) }) It("Update workerGroup.replicas in RayService and should not switch to new Ray Cluster", func() { @@ -747,8 +746,8 @@ func checkServiceHealth(ctx context.Context, rayService *rayv1.RayService) func( func updateHeadPodToRunningAndReady(ctx context.Context, rayClusterName string) { headPods := corev1.PodList{} headFilterLabels := client.MatchingLabels{ - common.RayClusterLabelKey: rayClusterName, - common.RayNodeTypeLabelKey: string(rayv1.HeadNode), + utils.RayClusterLabelKey: rayClusterName, + utils.RayNodeTypeLabelKey: string(rayv1.HeadNode), } Eventually( diff --git a/ray-operator/controllers/ray/rayservice_controller_unit_test.go b/ray-operator/controllers/ray/rayservice_controller_unit_test.go index 14ee385b58..39dd5c9bc9 100644 --- a/ray-operator/controllers/ray/rayservice_controller_unit_test.go +++ b/ray-operator/controllers/ray/rayservice_controller_unit_test.go @@ -10,7 +10,6 @@ import ( cmap "github.com/orcaman/concurrent-map" rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" - "github.com/ray-project/kuberay/ray-operator/controllers/ray/common" "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils" "github.com/ray-project/kuberay/ray-operator/pkg/client/clientset/versioned/scheme" "github.com/stretchr/testify/assert" @@ -90,7 +89,7 @@ func TestInconsistentRayServiceStatuses(t *testing.T) { HealthLastUpdateTime: &timeNow, }, Applications: map[string]rayv1.AppStatus{ - common.DefaultServeAppName: { + utils.DefaultServeAppName: { Status: rayv1.ApplicationStatusEnum.RUNNING, Message: "OK", HealthLastUpdateTime: &timeNow, @@ -111,7 +110,7 @@ func TestInconsistentRayServiceStatuses(t *testing.T) { HealthLastUpdateTime: &timeNow, }, Applications: map[string]rayv1.AppStatus{ - common.DefaultServeAppName: { + utils.DefaultServeAppName: { Status: rayv1.ApplicationStatusEnum.NOT_STARTED, Message: "application not started yet", HealthLastUpdateTime: &timeNow, @@ -214,8 +213,8 @@ func TestIsHeadPodRunningAndReady(t *testing.T) { Name: "head-pod", Namespace: cluster.ObjectMeta.Namespace, Labels: 
map[string]string{ - common.RayClusterLabelKey: cluster.ObjectMeta.Name, - common.RayNodeTypeLabelKey: string(rayv1.HeadNode), + utils.RayClusterLabelKey: cluster.ObjectMeta.Name, + utils.RayNodeTypeLabelKey: string(rayv1.HeadNode), }, }, Status: corev1.PodStatus{ @@ -326,7 +325,7 @@ func TestReconcileServices_UpdateService(t *testing.T) { ctx := context.TODO() // Create a head service. - err := r.reconcileServices(ctx, &rayService, &cluster, common.HeadService) + err := r.reconcileServices(ctx, &rayService, &cluster, utils.HeadService) assert.Nil(t, err, "Fail to reconcile service") svcList := corev1.ServiceList{} @@ -342,7 +341,7 @@ func TestReconcileServices_UpdateService(t *testing.T) { ContainerPort: 9999, }, } - err = r.reconcileServices(ctx, &rayService, &cluster, common.HeadService) + err = r.reconcileServices(ctx, &rayService, &cluster, utils.HeadService) assert.Nil(t, err, "Fail to reconcile service") svcList = corev1.ServiceList{} @@ -353,7 +352,7 @@ func TestReconcileServices_UpdateService(t *testing.T) { // Test 2: When the RayCluster switches, the service should be updated. cluster.Name = "new-cluster" - err = r.reconcileServices(ctx, &rayService, &cluster, common.HeadService) + err = r.reconcileServices(ctx, &rayService, &cluster, utils.HeadService) assert.Nil(t, err, "Fail to reconcile service") svcList = corev1.ServiceList{} @@ -389,7 +388,7 @@ func TestFetchHeadServiceURL(t *testing.T) { Spec: corev1.ServiceSpec{ Ports: []corev1.ServicePort{ { - Name: common.DashboardPortName, + Name: utils.DashboardPortName, Port: dashboardPort, }, }, @@ -409,7 +408,7 @@ func TestFetchHeadServiceURL(t *testing.T) { Log: ctrl.Log.WithName("controllers").WithName("RayService"), } - url, err := utils.FetchHeadServiceURL(ctx, &r.Log, r.Client, &cluster, common.DashboardPortName) + url, err := utils.FetchHeadServiceURL(ctx, &r.Log, r.Client, &cluster, utils.DashboardPortName) assert.Nil(t, err, "Fail to fetch head service url") assert.Equal(t, fmt.Sprintf("test-cluster-head-svc.%s.svc.cluster.local:%d", namespace, dashboardPort), url, "Head service url is not correct") } @@ -677,7 +676,7 @@ func TestReconcileRayCluster(t *testing.T) { Name: "active-cluster", Namespace: namespace, Annotations: map[string]string{ - common.RayServiceClusterHashKey: hash, + utils.RayServiceClusterHashKey: hash, }, }, } diff --git a/ray-operator/controllers/ray/suite_test.go b/ray-operator/controllers/ray/suite_test.go index 1dbe023445..6149124d8b 100644 --- a/ray-operator/controllers/ray/suite_test.go +++ b/ray-operator/controllers/ray/suite_test.go @@ -21,7 +21,6 @@ import ( "testing" rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" - "github.com/ray-project/kuberay/ray-operator/controllers/ray/common" "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils" . 
"github.com/onsi/ginkgo/v2" @@ -78,7 +77,7 @@ var _ = BeforeSuite(func(ctx SpecContext) { Expect(k8sClient).ToNot(BeNil()) // Suggested way to run tests - os.Setenv(common.RAYCLUSTER_DEFAULT_REQUEUE_SECONDS_ENV, "10") + os.Setenv(utils.RAYCLUSTER_DEFAULT_REQUEUE_SECONDS_ENV, "10") mgr, err := ctrl.NewManager(cfg, ctrl.Options{ Scheme: scheme.Scheme, Metrics: metricsserver.Options{ diff --git a/ray-operator/controllers/ray/common/constant.go b/ray-operator/controllers/ray/utils/constant.go similarity index 99% rename from ray-operator/controllers/ray/common/constant.go rename to ray-operator/controllers/ray/utils/constant.go index 28bcd15f7d..1f97b10a32 100644 --- a/ray-operator/controllers/ray/common/constant.go +++ b/ray-operator/controllers/ray/utils/constant.go @@ -1,4 +1,4 @@ -package common +package utils const ( diff --git a/ray-operator/main.go b/ray-operator/main.go index 69d4c5101b..9d19b50f54 100644 --- a/ray-operator/main.go +++ b/ray-operator/main.go @@ -30,7 +30,7 @@ import ( rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" "github.com/ray-project/kuberay/ray-operator/controllers/ray" "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler" - "github.com/ray-project/kuberay/ray-operator/controllers/ray/common" + "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils" // +kubebuilder:scaffold:imports ) @@ -191,7 +191,7 @@ func main() { } func cacheSelectors() (map[client.Object]cache.ByObject, error) { - label, err := labels.NewRequirement(common.KubernetesCreatedByLabelKey, selection.Equals, []string{common.ComponentName}) + label, err := labels.NewRequirement(utils.KubernetesCreatedByLabelKey, selection.Equals, []string{utils.ComponentName}) if err != nil { return nil, err } diff --git a/ray-operator/test/e2e/rayjob_cluster_selector_test.go b/ray-operator/test/e2e/rayjob_cluster_selector_test.go index 11be76c287..87225165ec 100644 --- a/ray-operator/test/e2e/rayjob_cluster_selector_test.go +++ b/ray-operator/test/e2e/rayjob_cluster_selector_test.go @@ -8,7 +8,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" - "github.com/ray-project/kuberay/ray-operator/controllers/ray/common" + "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils" . "github.com/ray-project/kuberay/ray-operator/test/support" ) @@ -59,7 +59,7 @@ func TestRayJobWithClusterSelector(t *testing.T) { }, Spec: rayv1.RayJobSpec{ ClusterSelector: map[string]string{ - common.RayClusterLabelKey: rayCluster.Name, + utils.RayClusterLabelKey: rayCluster.Name, }, Entrypoint: "python /home/ray/jobs/counter.py", RuntimeEnvYAML: ` @@ -98,7 +98,7 @@ env_vars: }, Spec: rayv1.RayJobSpec{ ClusterSelector: map[string]string{ - common.RayClusterLabelKey: rayCluster.Name, + utils.RayClusterLabelKey: rayCluster.Name, }, Entrypoint: "python /home/ray/jobs/fail.py", ShutdownAfterJobFinishes: false,