Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Bug] KubeRay operator fails to get serve deployment status due to 500 Internal Server Error #1173

Merged
merged 3 commits into from
Jun 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ray-operator/controllers/ray/rayjob_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request)
clientURL := rayJobInstance.Status.DashboardURL
if clientURL == "" {
// TODO: dashboard service may be changed. Check it instead of using the same URL always
if clientURL, err = utils.FetchDashboardURL(ctx, &r.Log, r.Client, rayClusterInstance); err != nil || clientURL == "" {
if clientURL, err = utils.FetchHeadServiceURL(ctx, &r.Log, r.Client, rayClusterInstance, common.DefaultDashboardName); err != nil || clientURL == "" {
if clientURL == "" {
err = fmt.Errorf("empty dashboardURL")
}
Expand Down
4 changes: 2 additions & 2 deletions ray-operator/controllers/ray/rayservice_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -985,7 +985,7 @@ func (r *RayServiceReconciler) updateStatusForActiveCluster(ctx context.Context,
var clientURL string
rayServiceStatus := &rayServiceInstance.Status.ActiveServiceStatus

if clientURL, err = utils.FetchDashboardAgentURL(ctx, &r.Log, r.Client, rayClusterInstance); err != nil || clientURL == "" {
if clientURL, err = utils.FetchHeadServiceURL(ctx, &r.Log, r.Client, rayClusterInstance, common.DefaultDashboardAgentListenPortName); err != nil || clientURL == "" {
r.updateAndCheckDashboardStatus(rayServiceStatus, false, rayServiceInstance.Spec.DeploymentUnhealthySecondThreshold)
return err
}
Expand Down Expand Up @@ -1025,7 +1025,7 @@ func (r *RayServiceReconciler) reconcileServe(ctx context.Context, rayServiceIns
rayServiceStatus = &rayServiceInstance.Status.PendingServiceStatus
}

if clientURL, err = utils.FetchDashboardAgentURL(ctx, &r.Log, r.Client, rayClusterInstance); err != nil || clientURL == "" {
if clientURL, err = utils.FetchHeadServiceURL(ctx, &r.Log, r.Client, rayClusterInstance, common.DefaultDashboardAgentListenPortName); err != nil || clientURL == "" {
if !r.updateAndCheckDashboardStatus(rayServiceStatus, false, rayServiceInstance.Spec.DeploymentUnhealthySecondThreshold) {
logger.Info("Dashboard is unhealthy, restart the cluster.")
r.markRestart(rayServiceInstance)
Expand Down
50 changes: 50 additions & 0 deletions ray-operator/controllers/ray/rayservice_controller_unit_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,13 @@ package ray

import (
"context"
"fmt"
"reflect"
"testing"

rayv1alpha1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1alpha1"
"github.com/ray-project/kuberay/ray-operator/controllers/ray/common"
"github.com/ray-project/kuberay/ray-operator/controllers/ray/utils"
"github.com/ray-project/kuberay/ray-operator/pkg/client/clientset/versioned/scheme"
"github.com/stretchr/testify/assert"
corev1 "k8s.io/api/core/v1"
Expand Down Expand Up @@ -372,3 +374,51 @@ func TestReconcileServices_UpdateService(t *testing.T) {
assert.Equal(t, 1, len(svcList.Items), "Service list should have one item")
assert.False(t, reflect.DeepEqual(*oldSvc, svcList.Items[0]))
}

// TestFetchHeadServiceURL checks that utils.FetchHeadServiceURL looks up the head service of a
// RayCluster through a fake client and returns "<service>.<namespace>.svc.cluster.local:<port>"
// for the service port named common.DefaultDashboardName.
func TestFetchHeadServiceURL(t *testing.T) {
	// Fixture values shared by the objects below and by the expected URL.
	const ns = "ray"
	const headPort int32 = 9999

	// Register the RayCluster API types and the core types (Service) with the scheme
	// the fake client is built from.
	testScheme := runtime.NewScheme()
	_ = rayv1alpha1.AddToScheme(testScheme)
	_ = corev1.AddToScheme(testScheme)

	rayCluster := &rayv1alpha1.RayCluster{
		ObjectMeta: metav1.ObjectMeta{Name: "test-cluster", Namespace: ns},
	}

	// Head service exposing a single port under the dashboard port name.
	svc := &corev1.Service{
		ObjectMeta: metav1.ObjectMeta{
			Name:      utils.GenerateServiceName(rayCluster.Name),
			Namespace: rayCluster.Namespace,
		},
		Spec: corev1.ServiceSpec{
			Ports: []corev1.ServicePort{{Name: common.DefaultDashboardName, Port: headPort}},
		},
	}

	// The reconciler is only used as the source of the logger and the client
	// passed to the function under test.
	reconciler := RayServiceReconciler{
		Client:   clientFake.NewClientBuilder().WithScheme(testScheme).WithRuntimeObjects(svc).Build(),
		Recorder: &record.FakeRecorder{},
		Scheme:   scheme.Scheme,
		Log:      ctrl.Log.WithName("controllers").WithName("RayService"),
	}

	got, err := utils.FetchHeadServiceURL(context.TODO(), &reconciler.Log, reconciler.Client, rayCluster, common.DefaultDashboardName)
	assert.Nil(t, err, "Fail to fetch head service url")

	want := fmt.Sprintf("test-cluster-head-svc.%s.svc.cluster.local:%d", ns, headPort)
	assert.Equal(t, want, got, "Head service url is not correct")
}
65 changes: 13 additions & 52 deletions ray-operator/controllers/ray/utils/dashboard_httpclient.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,6 @@ import (
rayv1alpha1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1alpha1"
)

// TODO: currently the following constants are also declared in ray-operator/controllers/ray/common
// We cannot import them to avoid cycles
const (
DefaultDashboardName = "dashboard"
DefaultDashboardAgentListenPortName = "dashboard-agent"
)

var (
// Single-application URL paths
DeployPath = "/api/serve/deployments/"
Expand Down Expand Up @@ -70,70 +63,38 @@ type RayDashboardClient struct {
BaseDashboardClient
}

// FetchDashboardAgentURL fetches the dashboard agent Service of the given RayCluster and
// returns "<service name>.<namespace>.svc.<cluster domain>:<port>", where the port is the
// Service port named DefaultDashboardAgentListenPortName.
//
// An error is returned when the Service cannot be fetched or when it has no port with
// that name.
func FetchDashboardAgentURL(ctx context.Context, log *logr.Logger, cli client.Client, rayCluster *rayv1alpha1.RayCluster) (string, error) {
	svcKey := client.ObjectKey{
		Name:      CheckName(GenerateDashboardServiceName(rayCluster.Name)),
		Namespace: rayCluster.Namespace,
	}
	agentSvc := &corev1.Service{}
	if err := cli.Get(ctx, svcKey, agentSvc); err != nil {
		return "", err
	}
	log.V(1).Info("fetchDashboardAgentURL ", "dashboard agent service found", agentSvc.Name)

	// TODO: compare diff and reconcile the object. For example, ServiceType might be changed or port might be modified.
	// -1 marks "no matching port seen yet".
	const portNotFound = int32(-1)
	agentPort := portNotFound
	for i := range agentSvc.Spec.Ports {
		if agentSvc.Spec.Ports[i].Name == DefaultDashboardAgentListenPortName {
			agentPort = agentSvc.Spec.Ports[i].Port
			break
		}
	}
	if agentPort == portNotFound {
		return "", fmtErrors.Errorf("dashboard port not found")
	}

	agentURL := fmt.Sprintf("%s.%s.svc.%s:%v", agentSvc.Name, agentSvc.Namespace, GetClusterDomainName(), agentPort)
	log.V(1).Info("fetchDashboardAgentURL ", "dashboardURL", agentURL)
	return agentURL, nil
}

func FetchDashboardURL(ctx context.Context, log *logr.Logger, cli client.Client, rayCluster *rayv1alpha1.RayCluster) (string, error) {
// FetchHeadServiceURL fetches the URL that consists of the FQDN for the RayCluster's head service
// and the port with the given port name (defaultPortName).
func FetchHeadServiceURL(ctx context.Context, log *logr.Logger, cli client.Client, rayCluster *rayv1alpha1.RayCluster, defaultPortName string) (string, error) {
headSvc := &corev1.Service{}
headSvcName := GenerateServiceName(rayCluster.Name)
if err := cli.Get(ctx, client.ObjectKey{Name: headSvcName, Namespace: rayCluster.Namespace}, headSvc); err != nil {
return "", err
}

log.V(3).Info("fetchDashboardURL ", "dashboard service found", headSvc.Name)
architkulkarni marked this conversation as resolved.
Show resolved Hide resolved
log.Info("FetchHeadServiceURL", "head service name", headSvc.Name, "namespace", headSvc.Namespace)
servicePorts := headSvc.Spec.Ports
dashboardPort := int32(-1)
port := int32(-1)

for _, servicePort := range servicePorts {
if servicePort.Name == DefaultDashboardName {
dashboardPort = servicePort.Port
if servicePort.Name == defaultPortName {
port = servicePort.Port
break
}
}

if dashboardPort == int32(-1) {
return "", fmtErrors.Errorf("dashboard port not found")
if port == int32(-1) {
return "", fmtErrors.Errorf("%s port is not found", defaultPortName)
}

domainName := GetClusterDomainName()
dashboardURL := fmt.Sprintf("%s.%s.svc.%s:%v",
headServiceURL := fmt.Sprintf("%s.%s.svc.%s:%v",
headSvc.Name,
headSvc.Namespace,
domainName,
dashboardPort)
log.V(1).Info("fetchDashboardURL ", "dashboardURL", dashboardURL)
return dashboardURL, nil
port)
log.Info("FetchHeadServiceURL", "head service URL", headServiceURL, "port", defaultPortName)
return headServiceURL, nil
}

func (r *RayDashboardClient) InitClient(url string) {
Expand Down
2 changes: 1 addition & 1 deletion ray-operator/controllers/ray/utils/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ func GetNamespace(metaData metav1.ObjectMeta) string {
return metaData.Namespace
}

// GenerateServiceName generates a ray head service name from cluster name
// GenerateServiceName generates a Ray head service name from cluster name
func GenerateServiceName(clusterName string) string {
return CheckName(fmt.Sprintf("%s-%s-%s", clusterName, rayv1alpha1.HeadNode, "svc"))
}
Expand Down