Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Feature] Replace service name with Fully Qualified Domain Name #938

Merged
merged 10 commits into from
Mar 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion apiserver/pkg/util/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ func buildWorkerPodTemplate(imageVersion string, envs map[string]string, spec *a
Command: []string{
"sh",
"-c",
"until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for K8s Service $RAY_IP; sleep 2; done",
"until nslookup $FQ_RAY_IP; do echo waiting for K8s Service $FQ_RAY_IP; sleep 2; done",
},
},
},
Expand Down
4 changes: 2 additions & 2 deletions helm-chart/ray-cluster/templates/raycluster-cluster.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ spec:
initContainers:
- name: init
image: {{ $values.initContainerImage | default "busybox:1.28" }}
command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for K8s Service $RAY_IP; sleep 2; done"]
command: ['sh', '-c', "until nslookup $FQ_RAY_IP; do echo waiting for K8s Service $FQ_RAY_IP; sleep 2; done"]
securityContext:
{{- toYaml $values.initContainerSecurityContext | nindent 14 }}
containers:
Expand Down Expand Up @@ -163,7 +163,7 @@ spec:
initContainers:
- name: init
image: {{ .Values.worker.initContainerImage | default "busybox:1.28" }}
command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for K8s Service $RAY_IP; sleep 2; done"]
command: ['sh', '-c', "until nslookup $FQ_RAY_IP; do echo waiting for K8s Service $FQ_RAY_IP; sleep 2; done"]
securityContext:
{{- toYaml .Values.worker.initContainerSecurityContext | nindent 14 }}
containers:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ spec:
key: value
spec:
initContainers:
# the env var $RAY_IP is set by the operator if missing, with the value of the head service name
# the env var $FQ_RAY_IP is set by the operator if missing, with the fully qualified domain name (FQDN) of the head service
- name: init
image: busybox:1.28
command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for K8s Service $RAY_IP; sleep 2; done"]
Expand Down
2 changes: 1 addition & 1 deletion ray-operator/config/samples/ray-cluster.autoscaler.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ spec:
template:
spec:
initContainers:
# the env var $RAY_IP is set by the operator if missing, with the value of the head service name
# the env var $FQ_RAY_IP is set by the operator if missing, with the fully qualified domain name (FQDN) of the head service
- name: init
image: busybox:1.28
command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for K8s Service $RAY_IP; sleep 2; done"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ spec:
- mountPath: /tmp/ray
name: ray-logs
initContainers:
# the env var $RAY_IP is set by the operator if missing, with the value of the head service name
# the env var $FQ_RAY_IP is set by the operator if missing, with the fully qualified domain name (FQDN) of the head service
- name: init
image: busybox:1.28
# Change the cluster postfix if you don't have a default setting
Expand Down
2 changes: 1 addition & 1 deletion ray-operator/config/samples/ray-cluster.complete.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ spec:
# For production use-cases, we recommend allocating at least 8Gb memory for each Ray container.
memory: "1G"
initContainers:
# the env var $RAY_IP is set by the operator if missing, with the value of the head service name
# the env var $FQ_RAY_IP is set by the operator if missing, with the fully qualified domain name (FQDN) of the head service
- name: init
image: busybox:1.28
# Change the cluster postfix if you don't have a default setting
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ spec:
template:
spec:
initContainers: # to avoid worker crashing before head service is created
# the env var $RAY_IP is set by the operator if missing, with the value of the head service name
# the env var $FQ_RAY_IP is set by the operator if missing, with the fully qualified domain name (FQDN) of the head service
- name: init
image: busybox:1.28
# Change the cluster postfix if you don't have a default setting
Expand Down
4 changes: 2 additions & 2 deletions ray-operator/config/samples/ray-cluster.heterogeneous.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ spec:
template:
spec:
initContainers: # to avoid worker crashing before head service is created
# the env var $RAY_IP is set by the operator if missing, with the value of the head service name
# the env var $FQ_RAY_IP is set by the operator if missing, with the fully qualified domain name (FQDN) of the head service
- name: init
image: busybox:1.28
# Change the cluster postfix if you don't have a default setting
Expand Down Expand Up @@ -125,7 +125,7 @@ spec:
- name: init
image: busybox:1.28
# Change the cluster postfix if you don't have a default setting
command: ['sh', '-c', "until nslookup raycluster-heterogeneous-head-svc.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for K8s Service $RAY_IP; sleep 2; done"]
command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for K8s Service $RAY_IP; sleep 2; done"]
containers:
- name: ray-worker # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc')
image: rayproject/ray:2.2.0
Expand Down
8 changes: 4 additions & 4 deletions ray-operator/config/samples/ray_v1alpha1_rayjob.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@ metadata:
name: rayjob-sample
spec:
entrypoint: python /home/ray/samples/sample_code.py
# runtimeEnv decoded to '{
# "pip": [
# "requests==2.26.0",
# runtimeEnv decoded to '{
# "pip": [
# "requests==2.26.0",
# "pendulum==2.1.2"
# ],
# "env_vars": {
Expand Down Expand Up @@ -65,7 +65,7 @@ spec:
template:
spec:
initContainers:
# the env var $RAY_IP is set by the operator if missing, with the value of the head service name
# the env var $FQ_RAY_IP is set by the operator if missing, with the fully qualified domain name (FQDN) of the head service
- name: init
image: busybox:1.28
command: [ 'sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for K8s Service $RAY_IP; sleep 2; done" ]
Expand Down
2 changes: 1 addition & 1 deletion ray-operator/config/samples/ray_v1alpha1_rayservice.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ spec:
template:
spec:
initContainers:
# the env var $RAY_IP is set by the operator if missing, with the value of the head service name
# the env var $FQ_RAY_IP is set by the operator if missing, with the fully qualified domain name (FQDN) of the head service
- name: init
image: busybox:1.28
command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for K8s Service $RAY_IP; sleep 2; done"]
Expand Down
4 changes: 2 additions & 2 deletions ray-operator/config/security/ray-cluster.pod-security.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -118,11 +118,11 @@ spec:
seccompProfile:
type: RuntimeDefault
initContainers:
# the env var $RAY_IP is set by the operator if missing, with the value of the head service name
# the env var $FQ_RAY_IP is set by the operator if missing, with the fully qualified domain name (FQDN) of the head service
- name: init-myservice
image: busybox:1.28
# Change the cluster postfix if you don't have a default setting
command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"]
command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for K8s Service $RAY_IP; sleep 2; done"]
securityContext:
runAsUser: 1000
allowPrivilegeEscalation: false
Expand Down
1 change: 1 addition & 0 deletions ray-operator/controllers/ray/common/constant.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ const (
NAMESPACE = "NAMESPACE"
CLUSTER_NAME = "CLUSTER_NAME"
RAY_IP = "RAY_IP"
FQ_RAY_IP = "FQ_RAY_IP"
RAY_PORT = "RAY_PORT"
RAY_ADDRESS = "RAY_ADDRESS"
REDIS_PASSWORD = "REDIS_PASSWORD"
Expand Down
2 changes: 1 addition & 1 deletion ray-operator/controllers/ray/common/ingress.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ func BuildIngressForHeadService(cluster rayiov1alpha1.RayCluster) (*networkingv1
PathType: &pathType,
Backend: networkingv1.IngressBackend{
Service: &networkingv1.IngressServiceBackend{
Name: utils.CheckName(utils.GenerateServiceName(cluster.Name)),
Name: utils.GenerateServiceName(cluster.Name),
Port: networkingv1.ServiceBackendPort{
Number: dashboardPort,
},
Expand Down
57 changes: 28 additions & 29 deletions ray-operator/controllers/ray/common/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ func initTemplateAnnotations(instance rayiov1alpha1.RayCluster, podTemplate *v1.
}

// DefaultHeadPodTemplate sets the config values
func DefaultHeadPodTemplate(instance rayiov1alpha1.RayCluster, headSpec rayiov1alpha1.HeadGroupSpec, podName string, svcName string, headPort string) v1.PodTemplateSpec {
func DefaultHeadPodTemplate(instance rayiov1alpha1.RayCluster, headSpec rayiov1alpha1.HeadGroupSpec, podName string, headPort string) v1.PodTemplateSpec {
// TODO (Dmitri) The argument headPort is essentially unused;
// headPort is passed into setMissingRayStartParams but unused there for the head pod.
// To mitigate this awkwardness and reduce code redundancy, unify head and worker pod configuration logic.
Expand All @@ -100,7 +100,7 @@ func DefaultHeadPodTemplate(instance rayiov1alpha1.RayCluster, headSpec rayiov1a
podTemplate.Labels = make(map[string]string)
}
podTemplate.Labels = labelPod(rayiov1alpha1.HeadNode, instance.Name, "headgroup", instance.Spec.HeadGroupSpec.Template.ObjectMeta.Labels)
headSpec.RayStartParams = setMissingRayStartParams(headSpec.RayStartParams, rayiov1alpha1.HeadNode, svcName, headPort)
headSpec.RayStartParams = setMissingRayStartParams(headSpec.RayStartParams, rayiov1alpha1.HeadNode, headPort, "")
headSpec.RayStartParams = setAgentListPortStartParams(instance, headSpec.RayStartParams)

initTemplateAnnotations(instance, &podTemplate)
Expand Down Expand Up @@ -183,7 +183,7 @@ func autoscalerSupportIsStable(rayVersion string) bool {
}

// DefaultWorkerPodTemplate sets the config values
func DefaultWorkerPodTemplate(instance rayiov1alpha1.RayCluster, workerSpec rayiov1alpha1.WorkerGroupSpec, podName string, svcName string, headPort string) v1.PodTemplateSpec {
func DefaultWorkerPodTemplate(instance rayiov1alpha1.RayCluster, workerSpec rayiov1alpha1.WorkerGroupSpec, podName string, fqdnRayIP string, headPort string) v1.PodTemplateSpec {
podTemplate := workerSpec.Template
podTemplate.GenerateName = podName
if podTemplate.ObjectMeta.Namespace == "" {
Expand All @@ -198,7 +198,7 @@ func DefaultWorkerPodTemplate(instance rayiov1alpha1.RayCluster, workerSpec rayi
podTemplate.Labels = make(map[string]string)
}
podTemplate.Labels = labelPod(rayiov1alpha1.WorkerNode, instance.Name, workerSpec.GroupName, workerSpec.Template.ObjectMeta.Labels)
workerSpec.RayStartParams = setMissingRayStartParams(workerSpec.RayStartParams, rayiov1alpha1.WorkerNode, svcName, headPort)
workerSpec.RayStartParams = setMissingRayStartParams(workerSpec.RayStartParams, rayiov1alpha1.WorkerNode, headPort, fqdnRayIP)
workerSpec.RayStartParams = setAgentListPortStartParams(instance, workerSpec.RayStartParams)

initTemplateAnnotations(instance, &podTemplate)
Expand Down Expand Up @@ -271,7 +271,7 @@ func initReadinessProbeHandler(probe *v1.Probe, rayNodeType rayiov1alpha1.RayNod
}

// BuildPod builds a pod config
func BuildPod(podTemplateSpec v1.PodTemplateSpec, rayNodeType rayiov1alpha1.RayNodeType, rayStartParams map[string]string, svcName string, headPort string, enableRayAutoscaler *bool, creator string) (aPod v1.Pod) {
func BuildPod(podTemplateSpec v1.PodTemplateSpec, rayNodeType rayiov1alpha1.RayNodeType, rayStartParams map[string]string, headPort string, enableRayAutoscaler *bool, creator string, fqdnRayIP string) (aPod v1.Pod) {
pod := v1.Pod{
TypeMeta: metav1.TypeMeta{
APIVersion: "v1",
Expand Down Expand Up @@ -326,10 +326,10 @@ func BuildPod(podTemplateSpec v1.PodTemplateSpec, rayNodeType rayiov1alpha1.RayN
}

for index := range pod.Spec.InitContainers {
setInitContainerEnvVars(&pod.Spec.InitContainers[index], svcName)
setInitContainerEnvVars(&pod.Spec.InitContainers[index], fqdnRayIP)
}

setContainerEnvVars(&pod, rayContainerIndex, rayNodeType, rayStartParams, svcName, headPort, creator)
setContainerEnvVars(&pod, rayContainerIndex, rayNodeType, rayStartParams, fqdnRayIP, headPort, creator)

// health check only if FT enabled
if podTemplateSpec.Annotations != nil {
Expand Down Expand Up @@ -535,40 +535,39 @@ func labelPod(rayNodeType rayiov1alpha1.RayNodeType, rayClusterName string, grou
return labels
}

func setInitContainerEnvVars(container *v1.Container, svcName string) {
// RAY_IP can be used in the DNS lookup
func setInitContainerEnvVars(container *v1.Container, fqdnRayIP string) {
if container.Env == nil || len(container.Env) == 0 {
container.Env = []v1.EnvVar{}
}
if !envVarExists("RAY_IP", container.Env) {
ip := v1.EnvVar{Name: "RAY_IP"}
ip.Value = svcName
container.Env = append(container.Env, ip)
if len(fqdnRayIP) != 0 { // Worker Pod
container.Env = append(container.Env,
v1.EnvVar{Name: FQ_RAY_IP, Value: fqdnRayIP},
// RAY_IP is deprecated and should be kept for backward compatibility purposes only.
v1.EnvVar{Name: RAY_IP, Value: utils.ExtractRayIPFromFQDN(fqdnRayIP)},
)
}
}

func setContainerEnvVars(pod *v1.Pod, rayContainerIndex int, rayNodeType rayiov1alpha1.RayNodeType, rayStartParams map[string]string, svcName string, headPort string, creator string) {
// set IP to local host if head, or the svc otherwise RAY_IP
func setContainerEnvVars(pod *v1.Pod, rayContainerIndex int, rayNodeType rayiov1alpha1.RayNodeType, rayStartParams map[string]string, fqdnRayIP string, headPort string, creator string) {
// set the port RAY_PORT
// set the password?
container := &pod.Spec.Containers[rayContainerIndex]
if container.Env == nil || len(container.Env) == 0 {
container.Env = []v1.EnvVar{}
}

var rayIP string
if rayNodeType == rayiov1alpha1.HeadNode {
// if head, use localhost
rayIP = LOCAL_HOST
} else {
// if worker, use the service name of the head
rayIP = svcName
// case 1: head => Use LOCAL_HOST
// case 2: worker => Use fqdnRayIP (fully qualified domain name)
ip := LOCAL_HOST
if rayNodeType == rayiov1alpha1.WorkerNode {
ip = fqdnRayIP
container.Env = append(container.Env,
v1.EnvVar{Name: FQ_RAY_IP, Value: ip},
// RAY_IP is deprecated and should be kept for backward compatibility purposes only.
v1.EnvVar{Name: RAY_IP, Value: utils.ExtractRayIPFromFQDN(ip)},
)
}

if !envVarExists(RAY_IP, container.Env) {
ipEnv := v1.EnvVar{Name: RAY_IP, Value: rayIP}
container.Env = append(container.Env, ipEnv)
}
if !envVarExists(RAY_PORT, container.Env) {
portEnv := v1.EnvVar{Name: RAY_PORT, Value: headPort}
container.Env = append(container.Env, portEnv)
Expand All @@ -595,7 +594,7 @@ func setContainerEnvVars(pod *v1.Pod, rayContainerIndex int, rayNodeType rayiov1
// Setting the RAY_ADDRESS env allows connecting to Ray using ray.init() when connecting
// from within the cluster.
if !envVarExists(RAY_ADDRESS, container.Env) {
rayAddress := fmt.Sprintf("%s:%s", rayIP, headPort)
rayAddress := fmt.Sprintf("%s:%s", ip, headPort)
addressEnv := v1.EnvVar{Name: RAY_ADDRESS, Value: rayAddress}
container.Env = append(container.Env, addressEnv)
}
Expand Down Expand Up @@ -636,11 +635,11 @@ func envVarExists(envName string, envVars []v1.EnvVar) bool {
}

// TODO auto complete params
func setMissingRayStartParams(rayStartParams map[string]string, nodeType rayiov1alpha1.RayNodeType, svcName string, headPort string) (completeStartParams map[string]string) {
func setMissingRayStartParams(rayStartParams map[string]string, nodeType rayiov1alpha1.RayNodeType, headPort string, fqdnRayIP string) (completeStartParams map[string]string) {
// Note: The argument headPort is unused for nodeType == rayiov1alpha1.HeadNode.
if nodeType == rayiov1alpha1.WorkerNode {
if _, ok := rayStartParams["address"]; !ok {
address := fmt.Sprintf("%s:%s", svcName, headPort)
address := fmt.Sprintf("%s:%s", fqdnRayIP, headPort)
rayStartParams["address"] = address
}
}
Expand Down
Loading