From d2372cd0ee33117a933768579589691ac6130562 Mon Sep 17 00:00:00 2001 From: Cindy Zhang Date: Thu, 1 Jun 2023 23:25:54 -0700 Subject: [PATCH] wip Signed-off-by: Cindy Zhang --- apiserver/pkg/model/converter.go | 40 +-- .../crds/ray.io_rayservices.yaml | 140 +++++------ proto/go_client/serve.pb.go | 233 +++++++++++++----- proto/kuberay_api.swagger.json | 37 ++- proto/serve.proto | 20 +- proto/swagger/serve.swagger.json | 37 ++- .../apis/ray/v1alpha1/rayservice_types.go | 16 +- .../ray/v1alpha1/rayservice_types_test.go | 7 +- .../ray/v1alpha1/zz_generated.deepcopy.go | 34 ++- .../config/crd/bases/ray.io_rayservices.yaml | 140 +++++------ .../config/manager/kustomization.yaml | 4 + .../controllers/ray/rayservice_controller.go | 146 +++++++---- .../ray/rayservice_controller_test.go | 111 +++++---- .../ray/rayservice_controller_unit_test.go | 69 +++--- .../ray/utils/dashboard_httpclient.go | 119 ++++++--- .../ray/utils/fake_serve_httpclient.go | 24 +- .../controllers/ray/utils/serve_api_models.go | 82 ++++++ 17 files changed, 837 insertions(+), 422 deletions(-) create mode 100644 ray-operator/controllers/ray/utils/serve_api_models.go diff --git a/apiserver/pkg/model/converter.go b/apiserver/pkg/model/converter.go index 28306402e07..e5270a6054a 100755 --- a/apiserver/pkg/model/converter.go +++ b/apiserver/pkg/model/converter.go @@ -368,12 +368,10 @@ func PopulateServeConfig(serveConfigSpecs []v1alpha1.ServeConfigSpec) []*api.Ser func PoplulateRayServiceStatus(serviceName string, serviceStatus v1alpha1.RayServiceStatuses, events []v1.Event) *api.RayServiceStatus { status := &api.RayServiceStatus{ - ApplicationStatus: serviceStatus.ActiveServiceStatus.ApplicationStatus.Status, - ApplicationMessage: serviceStatus.ActiveServiceStatus.ApplicationStatus.Message, - ServeDeploymentStatus: PopulateServeDeploymentStatus(serviceStatus.ActiveServiceStatus.ServeStatuses), - RayServiceEvents: PopulateRayServiceEvent(serviceName, events), - RayClusterName: serviceStatus.ActiveServiceStatus.RayClusterName, - RayClusterState: string(serviceStatus.ActiveServiceStatus.RayClusterStatus.State), + RayServiceEvents: PopulateRayServiceEvent(serviceName, events), + RayClusterName: serviceStatus.ActiveServiceStatus.RayClusterName, + RayClusterState: string(serviceStatus.ActiveServiceStatus.RayClusterStatus.State), + ServeApplicationStatus: PopulateServeApplicationStatus(serviceStatus.ActiveServiceStatus.Applications), } status.ServiceEndpoint = map[string]string{} for name, port := range serviceStatus.ActiveServiceStatus.RayClusterStatus.Endpoints { @@ -382,17 +380,31 @@ func PoplulateRayServiceStatus(serviceName string, serviceStatus v1alpha1.RaySer return status } -func PopulateServeDeploymentStatus(serveDeploymentStatuses []v1alpha1.ServeDeploymentStatus) []*api.ServeDeploymentStatus { - deploymentStatus := make([]*api.ServeDeploymentStatus, 0) - for _, serveDeploymentStatus := range serveDeploymentStatuses { +func PopulateServeApplicationStatus(serveApplicationStatuses map[string]v1alpha1.AppStatus) []*api.ServeApplicationStatus { + appStatuses := make([]*api.ServeApplicationStatus, 0) + for appName, appStatus := range serveApplicationStatuses { + ds := &api.ServeApplicationStatus{ + Name: appName, + Status: appStatus.Status, + Message: appStatus.Message, + ServeDeploymentStatus: PopulateServeDeploymentStatus(appStatus.Deployments), + } + appStatuses = append(appStatuses, ds) + } + return appStatuses +} + +func PopulateServeDeploymentStatus(serveDeploymentStatuses map[string]v1alpha1.ServeDeploymentStatus) []*api.ServeDeploymentStatus { + deploymentStatuses := make([]*api.ServeDeploymentStatus, 0) + for deploymentName, deploymentStatus := range serveDeploymentStatuses { ds := &api.ServeDeploymentStatus{ - DeploymentName: serveDeploymentStatus.Name, - Status: serveDeploymentStatus.Status, - Message: serveDeploymentStatus.Message, + DeploymentName: deploymentName, + Status: deploymentStatus.Status, + Message: deploymentStatus.Message, } - deploymentStatus = append(deploymentStatus, ds) + deploymentStatuses = append(deploymentStatuses, ds) } - return deploymentStatus + return deploymentStatuses } func PopulateRayServiceEvent(serviceName string, events []v1.Event) []*api.RayServiceEvent { diff --git a/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml b/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml index 0317f5e8822..acf220c04a1 100644 --- a/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml +++ b/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml @@ -12144,6 +12144,8 @@ spec: required: - importPath type: object + serveConfigV2: + type: object serviceUnhealthySecondThreshold: format: int32 type: integer @@ -12153,21 +12155,44 @@ spec: properties: activeServiceStatus: properties: - appStatus: + applicationStatuses: + additionalProperties: + properties: + healthLastUpdateTime: + description: Keep track of how long the service is healthy. + format: date-time + type: string + lastUpdateTime: + format: date-time + type: string + message: + type: string + serveDeploymentStatuses: + additionalProperties: + description: ServeDeploymentStatus defines the current + state of a Serve deployment + properties: + healthLastUpdateTime: + description: Keep track of how long the service is + healthy. + format: date-time + type: string + lastUpdateTime: + format: date-time + type: string + message: + type: string + status: + description: Name, Status, Message are from Ray Dashboard + and represent a Serve deployment's state. + type: string + type: object + type: object + status: + type: string + type: object description: 'Important: Run "make" to regenerate code after modifying this file' - properties: - healthLastUpdateTime: - description: Keep track of how long the service is healthy. - format: date-time - type: string - lastUpdateTime: - format: date-time - type: string - message: - type: string - status: - type: string type: object dashboardStatus: description: DashboardStatus defines the current states of Ray @@ -12241,10 +12266,18 @@ spec: state of cluster Important: Run "make" to regenerat' type: string type: object - serveDeploymentStatuses: - items: - description: ServeDeploymentStatus defines the current state - of a Serve deployment + type: object + observedGeneration: + description: observedGeneration is the most recent generation observed + for this RayService. + format: int64 + type: integer + pendingServiceStatus: + description: Pending Service Status indicates a RayCluster will be + created or is being created. + properties: + applicationStatuses: + additionalProperties: properties: healthLastUpdateTime: description: Keep track of how long the service is healthy. @@ -12255,40 +12288,32 @@ spec: type: string message: type: string - name: - description: Name, Status, Message are from Ray Dashboard - and represent a Serve deployment's state. - type: string + serveDeploymentStatuses: + additionalProperties: + description: ServeDeploymentStatus defines the current + state of a Serve deployment + properties: + healthLastUpdateTime: + description: Keep track of how long the service is + healthy. + format: date-time + type: string + lastUpdateTime: + format: date-time + type: string + message: + type: string + status: + description: Name, Status, Message are from Ray Dashboard + and represent a Serve deployment's state. + type: string + type: object + type: object status: - description: 'TODO: change status type to enum' type: string type: object - type: array - type: object - observedGeneration: - description: observedGeneration is the most recent generation observed - for this RayService. - format: int64 - type: integer - pendingServiceStatus: - description: Pending Service Status indicates a RayCluster will be - created or is being created. - properties: - appStatus: description: 'Important: Run "make" to regenerate code after modifying this file' - properties: - healthLastUpdateTime: - description: Keep track of how long the service is healthy. - format: date-time - type: string - lastUpdateTime: - format: date-time - type: string - message: - type: string - status: - type: string type: object dashboardStatus: description: DashboardStatus defines the current states of Ray @@ -12362,29 +12387,6 @@ spec: state of cluster Important: Run "make" to regenerat' type: string type: object - serveDeploymentStatuses: - items: - description: ServeDeploymentStatus defines the current state - of a Serve deployment - properties: - healthLastUpdateTime: - description: Keep track of how long the service is healthy. - format: date-time - type: string - lastUpdateTime: - format: date-time - type: string - message: - type: string - name: - description: Name, Status, Message are from Ray Dashboard - and represent a Serve deployment's state. - type: string - status: - description: 'TODO: change status type to enum' - type: string - type: object - type: array type: object serviceStatus: description: ServiceStatus indicates the current RayService status. diff --git a/proto/go_client/serve.pb.go b/proto/go_client/serve.pb.go index 397aa514883..673f5f6f65c 100644 --- a/proto/go_client/serve.pb.go +++ b/proto/go_client/serve.pb.go @@ -1032,11 +1032,12 @@ type RayServiceStatus struct { sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields - // The ray serve application status. + // NOTE(zcin): the first three fields are deprecated in favor of serve_application_status. + // (Deprecated) The ray serve application status. ApplicationStatus string `protobuf:"bytes,1,opt,name=application_status,json=applicationStatus,proto3" json:"application_status,omitempty"` - // A human-readable description of the status of this operation. + // (Deprecated) A human-readable description of the status of this operation. ApplicationMessage string `protobuf:"bytes,2,opt,name=application_message,json=applicationMessage,proto3" json:"application_message,omitempty"` - // The status for each deployment. + // (Deprecated) The status for each deployment. ServeDeploymentStatus []*ServeDeploymentStatus `protobuf:"bytes,3,rep,name=serve_deployment_status,json=serveDeploymentStatus,proto3" json:"serve_deployment_status,omitempty"` // The related event for the ray service. RayServiceEvents []*RayServiceEvent `protobuf:"bytes,4,rep,name=ray_service_events,json=rayServiceEvents,proto3" json:"ray_service_events,omitempty"` @@ -1046,6 +1047,8 @@ type RayServiceStatus struct { RayClusterState string `protobuf:"bytes,6,opt,name=ray_cluster_state,json=rayClusterState,proto3" json:"ray_cluster_state,omitempty"` // The service endpoint of the cluster and service. ServiceEndpoint map[string]string `protobuf:"bytes,7,rep,name=service_endpoint,json=serviceEndpoint,proto3" json:"service_endpoint,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` + // All ray serve application statuses + ServeApplicationStatus []*ServeApplicationStatus `protobuf:"bytes,8,rep,name=serve_application_status,json=serveApplicationStatus,proto3" json:"serve_application_status,omitempty"` } func (x *RayServiceStatus) Reset() { @@ -1129,6 +1132,88 @@ func (x *RayServiceStatus) GetServiceEndpoint() map[string]string { return nil } +func (x *RayServiceStatus) GetServeApplicationStatus() []*ServeApplicationStatus { + if x != nil { + return x.ServeApplicationStatus + } + return nil +} + +type ServeApplicationStatus struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + // Application name + Name string `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"` + // Application status + Status string `protobuf:"bytes,2,opt,name=status,proto3" json:"status,omitempty"` + // Details about the application status. + Message string `protobuf:"bytes,3,opt,name=message,proto3" json:"message,omitempty"` + // All ray serve deployment statuses in this application + ServeDeploymentStatus []*ServeDeploymentStatus `protobuf:"bytes,4,rep,name=serve_deployment_status,json=serveDeploymentStatus,proto3" json:"serve_deployment_status,omitempty"` +} + +func (x *ServeApplicationStatus) Reset() { + *x = ServeApplicationStatus{} + if protoimpl.UnsafeEnabled { + mi := &file_serve_proto_msgTypes[15] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *ServeApplicationStatus) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ServeApplicationStatus) ProtoMessage() {} + +func (x *ServeApplicationStatus) ProtoReflect() protoreflect.Message { + mi := &file_serve_proto_msgTypes[15] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ServeApplicationStatus.ProtoReflect.Descriptor instead. +func (*ServeApplicationStatus) Descriptor() ([]byte, []int) { + return file_serve_proto_rawDescGZIP(), []int{15} +} + +func (x *ServeApplicationStatus) GetName() string { + if x != nil { + return x.Name + } + return "" +} + +func (x *ServeApplicationStatus) GetStatus() string { + if x != nil { + return x.Status + } + return "" +} + +func (x *ServeApplicationStatus) GetMessage() string { + if x != nil { + return x.Message + } + return "" +} + +func (x *ServeApplicationStatus) GetServeDeploymentStatus() []*ServeDeploymentStatus { + if x != nil { + return x.ServeDeploymentStatus + } + return nil +} + type ServeDeploymentStatus struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache @@ -1145,7 +1230,7 @@ type ServeDeploymentStatus struct { func (x *ServeDeploymentStatus) Reset() { *x = ServeDeploymentStatus{} if protoimpl.UnsafeEnabled { - mi := &file_serve_proto_msgTypes[15] + mi := &file_serve_proto_msgTypes[16] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -1158,7 +1243,7 @@ func (x *ServeDeploymentStatus) String() string { func (*ServeDeploymentStatus) ProtoMessage() {} func (x *ServeDeploymentStatus) ProtoReflect() protoreflect.Message { - mi := &file_serve_proto_msgTypes[15] + mi := &file_serve_proto_msgTypes[16] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -1171,7 +1256,7 @@ func (x *ServeDeploymentStatus) ProtoReflect() protoreflect.Message { // Deprecated: Use ServeDeploymentStatus.ProtoReflect.Descriptor instead. func (*ServeDeploymentStatus) Descriptor() ([]byte, []int) { - return file_serve_proto_rawDescGZIP(), []int{15} + return file_serve_proto_rawDescGZIP(), []int{16} } func (x *ServeDeploymentStatus) GetDeploymentName() string { @@ -1223,7 +1308,7 @@ type RayServiceEvent struct { func (x *RayServiceEvent) Reset() { *x = RayServiceEvent{} if protoimpl.UnsafeEnabled { - mi := &file_serve_proto_msgTypes[16] + mi := &file_serve_proto_msgTypes[17] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -1236,7 +1321,7 @@ func (x *RayServiceEvent) String() string { func (*RayServiceEvent) ProtoMessage() {} func (x *RayServiceEvent) ProtoReflect() protoreflect.Message { - mi := &file_serve_proto_msgTypes[16] + mi := &file_serve_proto_msgTypes[17] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -1249,7 +1334,7 @@ func (x *RayServiceEvent) ProtoReflect() protoreflect.Message { // Deprecated: Use RayServiceEvent.ProtoReflect.Descriptor instead. func (*RayServiceEvent) Descriptor() ([]byte, []int) { - return file_serve_proto_rawDescGZIP(), []int{16} + return file_serve_proto_rawDescGZIP(), []int{17} } func (x *RayServiceEvent) GetId() string { @@ -1333,7 +1418,7 @@ type WorkerGroupUpdateSpec struct { func (x *WorkerGroupUpdateSpec) Reset() { *x = WorkerGroupUpdateSpec{} if protoimpl.UnsafeEnabled { - mi := &file_serve_proto_msgTypes[17] + mi := &file_serve_proto_msgTypes[18] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -1346,7 +1431,7 @@ func (x *WorkerGroupUpdateSpec) String() string { func (*WorkerGroupUpdateSpec) ProtoMessage() {} func (x *WorkerGroupUpdateSpec) ProtoReflect() protoreflect.Message { - mi := &file_serve_proto_msgTypes[17] + mi := &file_serve_proto_msgTypes[18] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -1359,7 +1444,7 @@ func (x *WorkerGroupUpdateSpec) ProtoReflect() protoreflect.Message { // Deprecated: Use WorkerGroupUpdateSpec.ProtoReflect.Descriptor instead. func (*WorkerGroupUpdateSpec) Descriptor() ([]byte, []int) { - return file_serve_proto_rawDescGZIP(), []int{17} + return file_serve_proto_rawDescGZIP(), []int{18} } func (x *WorkerGroupUpdateSpec) GetGroupName() string { @@ -1556,7 +1641,7 @@ var file_serve_proto_rawDesc = []byte{ 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x52, 0x65, 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, 0x12, 0x2b, 0x0a, 0x11, 0x61, 0x63, 0x63, 0x63, 0x65, 0x6c, 0x65, 0x72, 0x61, 0x74, 0x6f, 0x72, 0x5f, 0x74, 0x79, 0x70, 0x65, 0x18, 0x07, 0x20, 0x01, 0x28, 0x09, 0x52, 0x10, 0x61, 0x63, 0x63, 0x63, 0x65, 0x6c, - 0x65, 0x72, 0x61, 0x74, 0x6f, 0x72, 0x54, 0x79, 0x70, 0x65, 0x22, 0x81, 0x04, 0x0a, 0x10, 0x52, + 0x65, 0x72, 0x61, 0x74, 0x6f, 0x72, 0x54, 0x79, 0x70, 0x65, 0x22, 0xda, 0x04, 0x0a, 0x10, 0x52, 0x61, 0x79, 0x53, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x2d, 0x0a, 0x12, 0x61, 0x70, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x11, 0x61, 0x70, 0x70, @@ -1584,11 +1669,28 @@ var file_serve_proto_rawDesc = []byte{ 0x6f, 0x74, 0x6f, 0x2e, 0x52, 0x61, 0x79, 0x53, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x2e, 0x53, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x45, 0x6e, 0x64, 0x70, 0x6f, 0x69, 0x6e, 0x74, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x52, 0x0f, 0x73, 0x65, 0x72, 0x76, 0x69, - 0x63, 0x65, 0x45, 0x6e, 0x64, 0x70, 0x6f, 0x69, 0x6e, 0x74, 0x1a, 0x42, 0x0a, 0x14, 0x53, 0x65, - 0x72, 0x76, 0x69, 0x63, 0x65, 0x45, 0x6e, 0x64, 0x70, 0x6f, 0x69, 0x6e, 0x74, 0x45, 0x6e, 0x74, - 0x72, 0x79, 0x12, 0x10, 0x0a, 0x03, 0x6b, 0x65, 0x79, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, - 0x03, 0x6b, 0x65, 0x79, 0x12, 0x14, 0x0a, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x02, 0x20, - 0x01, 0x28, 0x09, 0x52, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x02, 0x38, 0x01, 0x22, 0x72, + 0x63, 0x65, 0x45, 0x6e, 0x64, 0x70, 0x6f, 0x69, 0x6e, 0x74, 0x12, 0x57, 0x0a, 0x18, 0x73, 0x65, + 0x72, 0x76, 0x65, 0x5f, 0x61, 0x70, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, + 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x08, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x1d, 0x2e, 0x70, + 0x72, 0x6f, 0x74, 0x6f, 0x2e, 0x53, 0x65, 0x72, 0x76, 0x65, 0x41, 0x70, 0x70, 0x6c, 0x69, 0x63, + 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x52, 0x16, 0x73, 0x65, 0x72, + 0x76, 0x65, 0x41, 0x70, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x53, 0x74, 0x61, + 0x74, 0x75, 0x73, 0x1a, 0x42, 0x0a, 0x14, 0x53, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x45, 0x6e, + 0x64, 0x70, 0x6f, 0x69, 0x6e, 0x74, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x12, 0x10, 0x0a, 0x03, 0x6b, + 0x65, 0x79, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x6b, 0x65, 0x79, 0x12, 0x14, 0x0a, + 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x76, 0x61, + 0x6c, 0x75, 0x65, 0x3a, 0x02, 0x38, 0x01, 0x22, 0xb4, 0x01, 0x0a, 0x16, 0x53, 0x65, 0x72, 0x76, + 0x65, 0x41, 0x70, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x53, 0x74, 0x61, 0x74, + 0x75, 0x73, 0x12, 0x12, 0x0a, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, + 0x52, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, + 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x18, + 0x0a, 0x07, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x18, 0x03, 0x20, 0x01, 0x28, 0x09, 0x52, + 0x07, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x12, 0x54, 0x0a, 0x17, 0x73, 0x65, 0x72, 0x76, + 0x65, 0x5f, 0x64, 0x65, 0x70, 0x6c, 0x6f, 0x79, 0x6d, 0x65, 0x6e, 0x74, 0x5f, 0x73, 0x74, 0x61, + 0x74, 0x75, 0x73, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x1c, 0x2e, 0x70, 0x72, 0x6f, 0x74, + 0x6f, 0x2e, 0x53, 0x65, 0x72, 0x76, 0x65, 0x44, 0x65, 0x70, 0x6c, 0x6f, 0x79, 0x6d, 0x65, 0x6e, + 0x74, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x52, 0x15, 0x73, 0x65, 0x72, 0x76, 0x65, 0x44, 0x65, + 0x70, 0x6c, 0x6f, 0x79, 0x6d, 0x65, 0x6e, 0x74, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x22, 0x72, 0x0a, 0x15, 0x53, 0x65, 0x72, 0x76, 0x65, 0x44, 0x65, 0x70, 0x6c, 0x6f, 0x79, 0x6d, 0x65, 0x6e, 0x74, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x27, 0x0a, 0x0f, 0x64, 0x65, 0x70, 0x6c, 0x6f, 0x79, 0x6d, 0x65, 0x6e, 0x74, 0x5f, 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, @@ -1710,7 +1812,7 @@ func file_serve_proto_rawDescGZIP() []byte { return file_serve_proto_rawDescData } -var file_serve_proto_msgTypes = make([]protoimpl.MessageInfo, 19) +var file_serve_proto_msgTypes = make([]protoimpl.MessageInfo, 20) var file_serve_proto_goTypes = []interface{}{ (*CreateRayServiceRequest)(nil), // 0: proto.CreateRayServiceRequest (*UpdateRayServiceRequest)(nil), // 1: proto.UpdateRayServiceRequest @@ -1727,54 +1829,57 @@ var file_serve_proto_goTypes = []interface{}{ (*ServeConfig)(nil), // 12: proto.ServeConfig (*ActorOptions)(nil), // 13: proto.ActorOptions (*RayServiceStatus)(nil), // 14: proto.RayServiceStatus - (*ServeDeploymentStatus)(nil), // 15: proto.ServeDeploymentStatus - (*RayServiceEvent)(nil), // 16: proto.RayServiceEvent - (*WorkerGroupUpdateSpec)(nil), // 17: proto.WorkerGroupUpdateSpec - nil, // 18: proto.RayServiceStatus.ServiceEndpointEntry - (*ClusterSpec)(nil), // 19: proto.ClusterSpec - (*timestamppb.Timestamp)(nil), // 20: google.protobuf.Timestamp - (*emptypb.Empty)(nil), // 21: google.protobuf.Empty + (*ServeApplicationStatus)(nil), // 15: proto.ServeApplicationStatus + (*ServeDeploymentStatus)(nil), // 16: proto.ServeDeploymentStatus + (*RayServiceEvent)(nil), // 17: proto.RayServiceEvent + (*WorkerGroupUpdateSpec)(nil), // 18: proto.WorkerGroupUpdateSpec + nil, // 19: proto.RayServiceStatus.ServiceEndpointEntry + (*ClusterSpec)(nil), // 20: proto.ClusterSpec + (*timestamppb.Timestamp)(nil), // 21: google.protobuf.Timestamp + (*emptypb.Empty)(nil), // 22: google.protobuf.Empty } var file_serve_proto_depIdxs = []int32{ 10, // 0: proto.CreateRayServiceRequest.service:type_name -> proto.RayService 10, // 1: proto.UpdateRayServiceRequest.service:type_name -> proto.RayService 3, // 2: proto.UpdateRayServiceConfigsRequest.update_service:type_name -> proto.UpdateRayServiceBody - 17, // 3: proto.UpdateRayServiceBody.worker_group_update_spec:type_name -> proto.WorkerGroupUpdateSpec + 18, // 3: proto.UpdateRayServiceBody.worker_group_update_spec:type_name -> proto.WorkerGroupUpdateSpec 11, // 4: proto.UpdateRayServiceBody.serve_deployment_graph_spec:type_name -> proto.ServeDeploymentGraphSpec 10, // 5: proto.ListRayServicesResponse.services:type_name -> proto.RayService 10, // 6: proto.ListAllRayServicesResponse.services:type_name -> proto.RayService 11, // 7: proto.RayService.serve_deployment_graph_spec:type_name -> proto.ServeDeploymentGraphSpec - 19, // 8: proto.RayService.cluster_spec:type_name -> proto.ClusterSpec + 20, // 8: proto.RayService.cluster_spec:type_name -> proto.ClusterSpec 14, // 9: proto.RayService.ray_service_status:type_name -> proto.RayServiceStatus - 20, // 10: proto.RayService.created_at:type_name -> google.protobuf.Timestamp - 20, // 11: proto.RayService.delete_at:type_name -> google.protobuf.Timestamp + 21, // 10: proto.RayService.created_at:type_name -> google.protobuf.Timestamp + 21, // 11: proto.RayService.delete_at:type_name -> google.protobuf.Timestamp 12, // 12: proto.ServeDeploymentGraphSpec.serve_configs:type_name -> proto.ServeConfig 13, // 13: proto.ServeConfig.actor_options:type_name -> proto.ActorOptions - 15, // 14: proto.RayServiceStatus.serve_deployment_status:type_name -> proto.ServeDeploymentStatus - 16, // 15: proto.RayServiceStatus.ray_service_events:type_name -> proto.RayServiceEvent - 18, // 16: proto.RayServiceStatus.service_endpoint:type_name -> proto.RayServiceStatus.ServiceEndpointEntry - 20, // 17: proto.RayServiceEvent.created_at:type_name -> google.protobuf.Timestamp - 20, // 18: proto.RayServiceEvent.first_timestamp:type_name -> google.protobuf.Timestamp - 20, // 19: proto.RayServiceEvent.last_timestamp:type_name -> google.protobuf.Timestamp - 0, // 20: proto.RayServeService.CreateRayService:input_type -> proto.CreateRayServiceRequest - 1, // 21: proto.RayServeService.UpdateRayService:input_type -> proto.UpdateRayServiceRequest - 2, // 22: proto.RayServeService.UpdateRayServiceConfigs:input_type -> proto.UpdateRayServiceConfigsRequest - 4, // 23: proto.RayServeService.GetRayService:input_type -> proto.GetRayServiceRequest - 5, // 24: proto.RayServeService.ListRayServices:input_type -> proto.ListRayServicesRequest - 7, // 25: proto.RayServeService.ListAllRayServices:input_type -> proto.ListAllRayServicesRequest - 9, // 26: proto.RayServeService.DeleteRayService:input_type -> proto.DeleteRayServiceRequest - 10, // 27: proto.RayServeService.CreateRayService:output_type -> proto.RayService - 10, // 28: proto.RayServeService.UpdateRayService:output_type -> proto.RayService - 10, // 29: proto.RayServeService.UpdateRayServiceConfigs:output_type -> proto.RayService - 10, // 30: proto.RayServeService.GetRayService:output_type -> proto.RayService - 6, // 31: proto.RayServeService.ListRayServices:output_type -> proto.ListRayServicesResponse - 8, // 32: proto.RayServeService.ListAllRayServices:output_type -> proto.ListAllRayServicesResponse - 21, // 33: proto.RayServeService.DeleteRayService:output_type -> google.protobuf.Empty - 27, // [27:34] is the sub-list for method output_type - 20, // [20:27] is the sub-list for method input_type - 20, // [20:20] is the sub-list for extension type_name - 20, // [20:20] is the sub-list for extension extendee - 0, // [0:20] is the sub-list for field type_name + 16, // 14: proto.RayServiceStatus.serve_deployment_status:type_name -> proto.ServeDeploymentStatus + 17, // 15: proto.RayServiceStatus.ray_service_events:type_name -> proto.RayServiceEvent + 19, // 16: proto.RayServiceStatus.service_endpoint:type_name -> proto.RayServiceStatus.ServiceEndpointEntry + 15, // 17: proto.RayServiceStatus.serve_application_status:type_name -> proto.ServeApplicationStatus + 16, // 18: proto.ServeApplicationStatus.serve_deployment_status:type_name -> proto.ServeDeploymentStatus + 21, // 19: proto.RayServiceEvent.created_at:type_name -> google.protobuf.Timestamp + 21, // 20: proto.RayServiceEvent.first_timestamp:type_name -> google.protobuf.Timestamp + 21, // 21: proto.RayServiceEvent.last_timestamp:type_name -> google.protobuf.Timestamp + 0, // 22: proto.RayServeService.CreateRayService:input_type -> proto.CreateRayServiceRequest + 1, // 23: proto.RayServeService.UpdateRayService:input_type -> proto.UpdateRayServiceRequest + 2, // 24: proto.RayServeService.UpdateRayServiceConfigs:input_type -> proto.UpdateRayServiceConfigsRequest + 4, // 25: proto.RayServeService.GetRayService:input_type -> proto.GetRayServiceRequest + 5, // 26: proto.RayServeService.ListRayServices:input_type -> proto.ListRayServicesRequest + 7, // 27: proto.RayServeService.ListAllRayServices:input_type -> proto.ListAllRayServicesRequest + 9, // 28: proto.RayServeService.DeleteRayService:input_type -> proto.DeleteRayServiceRequest + 10, // 29: proto.RayServeService.CreateRayService:output_type -> proto.RayService + 10, // 30: proto.RayServeService.UpdateRayService:output_type -> proto.RayService + 10, // 31: proto.RayServeService.UpdateRayServiceConfigs:output_type -> proto.RayService + 10, // 32: proto.RayServeService.GetRayService:output_type -> proto.RayService + 6, // 33: proto.RayServeService.ListRayServices:output_type -> proto.ListRayServicesResponse + 8, // 34: proto.RayServeService.ListAllRayServices:output_type -> proto.ListAllRayServicesResponse + 22, // 35: proto.RayServeService.DeleteRayService:output_type -> google.protobuf.Empty + 29, // [29:36] is the sub-list for method output_type + 22, // [22:29] is the sub-list for method input_type + 22, // [22:22] is the sub-list for extension type_name + 22, // [22:22] is the sub-list for extension extendee + 0, // [0:22] is the sub-list for field type_name } func init() { file_serve_proto_init() } @@ -1965,7 +2070,7 @@ func file_serve_proto_init() { } } file_serve_proto_msgTypes[15].Exporter = func(v interface{}, i int) interface{} { - switch v := v.(*ServeDeploymentStatus); i { + switch v := v.(*ServeApplicationStatus); i { case 0: return &v.state case 1: @@ -1977,7 +2082,7 @@ func file_serve_proto_init() { } } file_serve_proto_msgTypes[16].Exporter = func(v interface{}, i int) interface{} { - switch v := v.(*RayServiceEvent); i { + switch v := v.(*ServeDeploymentStatus); i { case 0: return &v.state case 1: @@ -1989,6 +2094,18 @@ func file_serve_proto_init() { } } file_serve_proto_msgTypes[17].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*RayServiceEvent); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_serve_proto_msgTypes[18].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*WorkerGroupUpdateSpec); i { case 0: return &v.state @@ -2007,7 +2124,7 @@ func file_serve_proto_init() { GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: file_serve_proto_rawDesc, NumEnums: 0, - NumMessages: 19, + NumMessages: 20, NumExtensions: 0, NumServices: 1, }, diff --git a/proto/kuberay_api.swagger.json b/proto/kuberay_api.swagger.json index c4ac4a7c293..c8f6e9727ac 100644 --- a/proto/kuberay_api.swagger.json +++ b/proto/kuberay_api.swagger.json @@ -1762,18 +1762,18 @@ "properties": { "applicationStatus": { "type": "string", - "description": "The ray serve application status." + "description": "NOTE(zcin): the first three fields are deprecated in favor of serve_application_status.\n(Deprecated) The ray serve application status." }, "applicationMessage": { "type": "string", - "description": "A human-readable description of the status of this operation." + "description": "(Deprecated) A human-readable description of the status of this operation." }, "serveDeploymentStatus": { "type": "array", "items": { "$ref": "#/definitions/protoServeDeploymentStatus" }, - "description": "The status for each deployment." + "description": "(Deprecated) The status for each deployment." }, "rayServiceEvents": { "type": "array", @@ -1796,6 +1796,37 @@ "type": "string" }, "description": "The service endpoint of the cluster and service." + }, + "serveApplicationStatus": { + "type": "array", + "items": { + "$ref": "#/definitions/protoServeApplicationStatus" + }, + "title": "All ray serve application statuses" + } + } + }, + "protoServeApplicationStatus": { + "type": "object", + "properties": { + "name": { + "type": "string", + "title": "Application name" + }, + "status": { + "type": "string", + "title": "Application status" + }, + "message": { + "type": "string", + "description": "Details about the application status." + }, + "serveDeploymentStatus": { + "type": "array", + "items": { + "$ref": "#/definitions/protoServeDeploymentStatus" + }, + "title": "All ray serve deployment statuses in this application" } } }, diff --git a/proto/serve.proto b/proto/serve.proto index aa345a8b634..20d0ffc4d9a 100644 --- a/proto/serve.proto +++ b/proto/serve.proto @@ -230,11 +230,12 @@ message ActorOptions { } message RayServiceStatus { - // The ray serve application status. + // NOTE(zcin): the first three fields are deprecated in favor of serve_application_status. + // (Deprecated) The ray serve application status. string application_status = 1; - // A human-readable description of the status of this operation. + // (Deprecated) A human-readable description of the status of this operation. string application_message = 2; - // The status for each deployment. + // (Deprecated) The status for each deployment. repeated ServeDeploymentStatus serve_deployment_status = 3; // The related event for the ray service. repeated RayServiceEvent ray_service_events = 4; @@ -244,6 +245,19 @@ message RayServiceStatus { string ray_cluster_state = 6; // The service endpoint of the cluster and service. map service_endpoint = 7; + // All ray serve application statuses + repeated ServeApplicationStatus serve_application_status = 8; +} + +message ServeApplicationStatus { + // Application name + string name = 1; + // Application status + string status = 2; + // Details about the application status. + string message = 3; + // All ray serve deployment statuses in this application + repeated ServeDeploymentStatus serve_deployment_status = 4; } message ServeDeploymentStatus { diff --git a/proto/swagger/serve.swagger.json b/proto/swagger/serve.swagger.json index 42e44ae0e61..8b70d3cd8b9 100644 --- a/proto/swagger/serve.swagger.json +++ b/proto/swagger/serve.swagger.json @@ -596,18 +596,18 @@ "properties": { "applicationStatus": { "type": "string", - "description": "The ray serve application status." + "description": "NOTE(zcin): the first three fields are deprecated in favor of serve_application_status.\n(Deprecated) The ray serve application status." }, "applicationMessage": { "type": "string", - "description": "A human-readable description of the status of this operation." + "description": "(Deprecated) A human-readable description of the status of this operation." }, "serveDeploymentStatus": { "type": "array", "items": { "$ref": "#/definitions/protoServeDeploymentStatus" }, - "description": "The status for each deployment." + "description": "(Deprecated) The status for each deployment." }, "rayServiceEvents": { "type": "array", @@ -630,6 +630,37 @@ "type": "string" }, "description": "The service endpoint of the cluster and service." + }, + "serveApplicationStatus": { + "type": "array", + "items": { + "$ref": "#/definitions/protoServeApplicationStatus" + }, + "title": "All ray serve application statuses" + } + } + }, + "protoServeApplicationStatus": { + "type": "object", + "properties": { + "name": { + "type": "string", + "title": "Application name" + }, + "status": { + "type": "string", + "title": "Application status" + }, + "message": { + "type": "string", + "description": "Details about the application status." + }, + "serveDeploymentStatus": { + "type": "array", + "items": { + "$ref": "#/definitions/protoServeDeploymentStatus" + }, + "title": "All ray serve deployment statuses in this application" } } }, diff --git a/ray-operator/apis/ray/v1alpha1/rayservice_types.go b/ray-operator/apis/ray/v1alpha1/rayservice_types.go index 4c1618703af..cd93102b8e3 100644 --- a/ray-operator/apis/ray/v1alpha1/rayservice_types.go +++ b/ray-operator/apis/ray/v1alpha1/rayservice_types.go @@ -49,11 +49,14 @@ var DeploymentStatusEnum = struct { type RayServiceSpec struct { // Important: Run "make" to regenerate code after modifying this file ServeDeploymentGraphSpec ServeDeploymentGraphSpec `json:"serveConfig,omitempty"` + ServeConfigV2 ServeConfigV2 `json:"serveConfigV2,omitempty"` RayClusterSpec RayClusterSpec `json:"rayClusterConfig,omitempty"` ServiceUnhealthySecondThreshold *int32 `json:"serviceUnhealthySecondThreshold,omitempty"` DeploymentUnhealthySecondThreshold *int32 `json:"deploymentUnhealthySecondThreshold,omitempty"` } +type ServeConfigV2 struct{} + type ServeDeploymentGraphSpec struct { ImportPath string `json:"importPath"` RuntimeEnv string `json:"runtimeEnv,omitempty"` @@ -104,11 +107,10 @@ type RayServiceStatuses struct { type RayServiceStatus struct { // Important: Run "make" to regenerate code after modifying this file - ApplicationStatus AppStatus `json:"appStatus,omitempty"` - ServeStatuses []ServeDeploymentStatus `json:"serveDeploymentStatuses,omitempty"` - DashboardStatus DashboardStatus `json:"dashboardStatus,omitempty"` - RayClusterName string `json:"rayClusterName,omitempty"` - RayClusterStatus RayClusterStatus `json:"rayClusterStatus,omitempty"` + Applications map[string]AppStatus `json:"applicationStatuses,omitempty"` + DashboardStatus DashboardStatus `json:"dashboardStatus,omitempty"` + RayClusterName string `json:"rayClusterName,omitempty"` + RayClusterStatus RayClusterStatus `json:"rayClusterStatus,omitempty"` } // DashboardStatus defines the current states of Ray Dashboard @@ -126,13 +128,13 @@ type AppStatus struct { LastUpdateTime *metav1.Time `json:"lastUpdateTime,omitempty"` // Keep track of how long the service is healthy. // Update when Serve deployment is healthy or first time convert to unhealthy from healthy. - HealthLastUpdateTime *metav1.Time `json:"healthLastUpdateTime,omitempty"` + HealthLastUpdateTime *metav1.Time `json:"healthLastUpdateTime,omitempty"` + Deployments map[string]ServeDeploymentStatus `json:"serveDeploymentStatuses,omitempty"` } // ServeDeploymentStatus defines the current state of a Serve deployment type ServeDeploymentStatus struct { // Name, Status, Message are from Ray Dashboard and represent a Serve deployment's state. - Name string `json:"name,omitempty"` // TODO: change status type to enum Status string `json:"status,omitempty"` Message string `json:"message,omitempty"` diff --git a/ray-operator/apis/ray/v1alpha1/rayservice_types_test.go b/ray-operator/apis/ray/v1alpha1/rayservice_types_test.go index e4d2358e16e..15499141f95 100644 --- a/ray-operator/apis/ray/v1alpha1/rayservice_types_test.go +++ b/ray-operator/apis/ray/v1alpha1/rayservice_types_test.go @@ -235,6 +235,7 @@ var expected = `{ } ] }, + "serveConfigV2": {}, "rayClusterConfig":{ "headGroupSpec":{ "replicas":1, @@ -361,9 +362,6 @@ var expected = `{ }, "status":{ "activeServiceStatus":{ - "appStatus":{ - - }, "dashboardStatus":{ }, @@ -372,9 +370,6 @@ var expected = `{ } }, "pendingServiceStatus":{ - "appStatus":{ - - }, "dashboardStatus":{ }, diff --git a/ray-operator/apis/ray/v1alpha1/zz_generated.deepcopy.go b/ray-operator/apis/ray/v1alpha1/zz_generated.deepcopy.go index a0f855c9c76..698e62defec 100644 --- a/ray-operator/apis/ray/v1alpha1/zz_generated.deepcopy.go +++ b/ray-operator/apis/ray/v1alpha1/zz_generated.deepcopy.go @@ -21,6 +21,13 @@ func (in *AppStatus) DeepCopyInto(out *AppStatus) { in, out := &in.HealthLastUpdateTime, &out.HealthLastUpdateTime *out = (*in).DeepCopy() } + if in.Deployments != nil { + in, out := &in.Deployments, &out.Deployments + *out = make(map[string]ServeDeploymentStatus, len(*in)) + for key, val := range *in { + (*out)[key] = *val.DeepCopy() + } + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AppStatus. @@ -521,6 +528,7 @@ func (in *RayServiceList) DeepCopyObject() runtime.Object { func (in *RayServiceSpec) DeepCopyInto(out *RayServiceSpec) { *out = *in in.ServeDeploymentGraphSpec.DeepCopyInto(&out.ServeDeploymentGraphSpec) + out.ServeConfigV2 = in.ServeConfigV2 in.RayClusterSpec.DeepCopyInto(&out.RayClusterSpec) if in.ServiceUnhealthySecondThreshold != nil { in, out := &in.ServiceUnhealthySecondThreshold, &out.ServiceUnhealthySecondThreshold @@ -547,12 +555,11 @@ func (in *RayServiceSpec) DeepCopy() *RayServiceSpec { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *RayServiceStatus) DeepCopyInto(out *RayServiceStatus) { *out = *in - in.ApplicationStatus.DeepCopyInto(&out.ApplicationStatus) - if in.ServeStatuses != nil { - in, out := &in.ServeStatuses, &out.ServeStatuses - *out = make([]ServeDeploymentStatus, len(*in)) - for i := range *in { - (*in)[i].DeepCopyInto(&(*out)[i]) + if in.Applications != nil { + in, out := &in.Applications, &out.Applications + *out = make(map[string]AppStatus, len(*in)) + for key, val := range *in { + (*out)[key] = *val.DeepCopy() } } in.DashboardStatus.DeepCopyInto(&out.DashboardStatus) @@ -652,6 +659,21 @@ func (in *ServeConfigSpec) DeepCopy() *ServeConfigSpec { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ServeConfigV2) DeepCopyInto(out *ServeConfigV2) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServeConfigV2. +func (in *ServeConfigV2) DeepCopy() *ServeConfigV2 { + if in == nil { + return nil + } + out := new(ServeConfigV2) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ServeDeploymentGraphSpec) DeepCopyInto(out *ServeDeploymentGraphSpec) { *out = *in diff --git a/ray-operator/config/crd/bases/ray.io_rayservices.yaml b/ray-operator/config/crd/bases/ray.io_rayservices.yaml index 0317f5e8822..acf220c04a1 100644 --- a/ray-operator/config/crd/bases/ray.io_rayservices.yaml +++ b/ray-operator/config/crd/bases/ray.io_rayservices.yaml @@ -12144,6 +12144,8 @@ spec: required: - importPath type: object + serveConfigV2: + type: object serviceUnhealthySecondThreshold: format: int32 type: integer @@ -12153,21 +12155,44 @@ spec: properties: activeServiceStatus: properties: - appStatus: + applicationStatuses: + additionalProperties: + properties: + healthLastUpdateTime: + description: Keep track of how long the service is healthy. + format: date-time + type: string + lastUpdateTime: + format: date-time + type: string + message: + type: string + serveDeploymentStatuses: + additionalProperties: + description: ServeDeploymentStatus defines the current + state of a Serve deployment + properties: + healthLastUpdateTime: + description: Keep track of how long the service is + healthy. + format: date-time + type: string + lastUpdateTime: + format: date-time + type: string + message: + type: string + status: + description: Name, Status, Message are from Ray Dashboard + and represent a Serve deployment's state. + type: string + type: object + type: object + status: + type: string + type: object description: 'Important: Run "make" to regenerate code after modifying this file' - properties: - healthLastUpdateTime: - description: Keep track of how long the service is healthy. - format: date-time - type: string - lastUpdateTime: - format: date-time - type: string - message: - type: string - status: - type: string type: object dashboardStatus: description: DashboardStatus defines the current states of Ray @@ -12241,10 +12266,18 @@ spec: state of cluster Important: Run "make" to regenerat' type: string type: object - serveDeploymentStatuses: - items: - description: ServeDeploymentStatus defines the current state - of a Serve deployment + type: object + observedGeneration: + description: observedGeneration is the most recent generation observed + for this RayService. + format: int64 + type: integer + pendingServiceStatus: + description: Pending Service Status indicates a RayCluster will be + created or is being created. + properties: + applicationStatuses: + additionalProperties: properties: healthLastUpdateTime: description: Keep track of how long the service is healthy. @@ -12255,40 +12288,32 @@ spec: type: string message: type: string - name: - description: Name, Status, Message are from Ray Dashboard - and represent a Serve deployment's state. - type: string + serveDeploymentStatuses: + additionalProperties: + description: ServeDeploymentStatus defines the current + state of a Serve deployment + properties: + healthLastUpdateTime: + description: Keep track of how long the service is + healthy. + format: date-time + type: string + lastUpdateTime: + format: date-time + type: string + message: + type: string + status: + description: Name, Status, Message are from Ray Dashboard + and represent a Serve deployment's state. + type: string + type: object + type: object status: - description: 'TODO: change status type to enum' type: string type: object - type: array - type: object - observedGeneration: - description: observedGeneration is the most recent generation observed - for this RayService. - format: int64 - type: integer - pendingServiceStatus: - description: Pending Service Status indicates a RayCluster will be - created or is being created. - properties: - appStatus: description: 'Important: Run "make" to regenerate code after modifying this file' - properties: - healthLastUpdateTime: - description: Keep track of how long the service is healthy. - format: date-time - type: string - lastUpdateTime: - format: date-time - type: string - message: - type: string - status: - type: string type: object dashboardStatus: description: DashboardStatus defines the current states of Ray @@ -12362,29 +12387,6 @@ spec: state of cluster Important: Run "make" to regenerat' type: string type: object - serveDeploymentStatuses: - items: - description: ServeDeploymentStatus defines the current state - of a Serve deployment - properties: - healthLastUpdateTime: - description: Keep track of how long the service is healthy. - format: date-time - type: string - lastUpdateTime: - format: date-time - type: string - message: - type: string - name: - description: Name, Status, Message are from Ray Dashboard - and represent a Serve deployment's state. - type: string - status: - description: 'TODO: change status type to enum' - type: string - type: object - type: array type: object serviceStatus: description: ServiceStatus indicates the current RayService status. diff --git a/ray-operator/config/manager/kustomization.yaml b/ray-operator/config/manager/kustomization.yaml index 6ba801db9a7..6f46c3363f4 100644 --- a/ray-operator/config/manager/kustomization.yaml +++ b/ray-operator/config/manager/kustomization.yaml @@ -7,3 +7,7 @@ kind: Kustomization commonLabels: app.kubernetes.io/component: kuberay-operator app.kubernetes.io/name: kuberay +images: +- name: kuberay/operator + newName: zcin/test-kuberay + newTag: test29 diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index 4375295ec29..74d2bf48de5 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -238,24 +238,45 @@ func (r *RayServiceReconciler) inconsistentRayServiceStatus(oldStatus rayv1alpha return true } - if oldStatus.ApplicationStatus.Status != newStatus.ApplicationStatus.Status || - oldStatus.ApplicationStatus.Message != newStatus.ApplicationStatus.Message { - r.Log.Info(fmt.Sprintf("inconsistentRayServiceStatus RayService ApplicationStatus changed from %v to %v", oldStatus.ApplicationStatus, newStatus.ApplicationStatus)) + if len(oldStatus.Applications) != len(newStatus.Applications) { return true } - if len(oldStatus.ServeStatuses) != len(newStatus.ServeStatuses) { - r.Log.Info(fmt.Sprintf("inconsistentRayServiceStatus RayService number of ServeStatus changed from %v to %v", len(oldStatus.ServeStatuses), len(newStatus.ServeStatuses))) - return true - } + var ok bool + for appName, newAppStatus := range newStatus.Applications { + var oldAppStatus rayv1alpha1.AppStatus + if oldAppStatus, ok = oldStatus.Applications[appName]; !ok { + r.Log.Info(fmt.Sprintf("inconsistentRayServiceStatus RayService new application %s found", appName)) + return true + } + + if oldAppStatus.Status != newAppStatus.Status { + r.Log.Info(fmt.Sprintf("inconsistentRayServiceStatus RayService application %s status changed from %v to %v", appName, oldAppStatus.Status, newAppStatus.Status)) + return true + } else if oldAppStatus.Message != newAppStatus.Message { + r.Log.Info(fmt.Sprintf("inconsistentRayServiceStatus RayService application %s status message changed from %v to %v", appName, oldAppStatus.Message, newAppStatus.Message)) + return true + } - for i := 0; i < len(oldStatus.ServeStatuses); i++ { - if oldStatus.ServeStatuses[i].Name != newStatus.ServeStatuses[i].Name || - oldStatus.ServeStatuses[i].Status != newStatus.ServeStatuses[i].Status || - oldStatus.ServeStatuses[i].Message != newStatus.ServeStatuses[i].Message { - r.Log.Info(fmt.Sprintf("inconsistentRayServiceStatus RayService ServeDeploymentStatus changed from %v to %v", oldStatus.ServeStatuses[i], newStatus.ServeStatuses[i])) + if len(oldAppStatus.Deployments) != len(newAppStatus.Deployments) { return true } + + for deploymentName, newDeploymentStatus := range newAppStatus.Deployments { + var oldDeploymentStatus rayv1alpha1.ServeDeploymentStatus + if oldDeploymentStatus, ok = oldAppStatus.Deployments[deploymentName]; !ok { + r.Log.Info(fmt.Sprintf("inconsistentRayServiceStatus RayService new deployment %s found in application %s", deploymentName, appName)) + return true + } + + if oldDeploymentStatus.Status != newDeploymentStatus.Status { + r.Log.Info(fmt.Sprintf("inconsistentRayServiceStatus RayService DeploymentStatus changed from %v to %v", oldDeploymentStatus.Status, newDeploymentStatus.Status)) + return true + } else if oldDeploymentStatus.Message != newDeploymentStatus.Message { + r.Log.Info(fmt.Sprintf("inconsistentRayServiceStatus RayService deployment status message changed from %v to %v", oldDeploymentStatus.Message, newDeploymentStatus.Message)) + return true + } + } } return false @@ -649,67 +670,96 @@ func (r *RayServiceReconciler) updateServeDeployment(ctx context.Context, raySer // It's return values should be interpreted as // (Serve app healthy?, Serve app ready?, error if any) func (r *RayServiceReconciler) getAndCheckServeStatus(ctx context.Context, dashboardClient utils.RayDashboardClientInterface, rayServiceServeStatus *rayv1alpha1.RayServiceStatus, unhealthySecondThreshold *int32) (bool, bool, error) { + serveConfigType := utils.SINGLE_APP serviceUnhealthySecondThreshold := ServiceUnhealthySecondThreshold if unhealthySecondThreshold != nil { serviceUnhealthySecondThreshold = float64(*unhealthySecondThreshold) } - var serveStatuses *utils.ServeDeploymentStatuses + var serveAppStatuses map[string]*utils.ServeApplicationStatus var err error - if serveStatuses, err = dashboardClient.GetDeploymentsStatus(ctx); err != nil { - r.Log.Error(err, "Failed to get Serve deployment statuses from dashboard!") - return false, false, err + if serveConfigType == utils.SINGLE_APP { + var singleApplicationStatus *utils.ServeApplicationStatus + if singleApplicationStatus, err = dashboardClient.GetSingleApplicationStatus(ctx); err != nil { + r.Log.Error(err, "Failed to get Serve deployment statuses from dashboard!") + return false, false, err + } + serveAppStatuses = map[string]*utils.ServeApplicationStatus{"default": singleApplicationStatus} + } else if serveConfigType == utils.MULTI_APP { + if serveAppStatuses, err = dashboardClient.GetMultiApplicationStatus(ctx); err != nil { + r.Log.Error(err, "Failed to get Serve deployment statuses from dashboard!") + return false, false, err + } + } else { + return false, false, fmt.Errorf("Unrecognized serve config type %s", string(serveConfigType)) } - statusMap := make(map[string]rayv1alpha1.ServeDeploymentStatus) - - for _, status := range rayServiceServeStatus.ServeStatuses { - statusMap[status.Name] = status - } + r.Log.V(1).Info("getAndCheckServeStatus", "prev statuses", rayServiceServeStatus.Applications, "serve statuses", serveAppStatuses) isHealthy := true isReady := true timeNow := metav1.Now() - for i := 0; i < len(serveStatuses.DeploymentStatuses); i++ { - serveStatuses.DeploymentStatuses[i].LastUpdateTime = &timeNow - serveStatuses.DeploymentStatuses[i].HealthLastUpdateTime = &timeNow - if serveStatuses.DeploymentStatuses[i].Status != rayv1alpha1.DeploymentStatusEnum.HEALTHY { - prevStatus, exist := statusMap[serveStatuses.DeploymentStatuses[i].Name] - if exist { - if prevStatus.Status != rayv1alpha1.DeploymentStatusEnum.HEALTHY { - serveStatuses.DeploymentStatuses[i].HealthLastUpdateTime = prevStatus.HealthLastUpdateTime - - if prevStatus.HealthLastUpdateTime != nil && time.Since(prevStatus.HealthLastUpdateTime.Time).Seconds() > serviceUnhealthySecondThreshold { + + newApplications := make(map[string]rayv1alpha1.AppStatus) + for appName, app := range serveAppStatuses { + if appName == "" { + appName = "default" + } + + prevApplicationStatus := rayServiceServeStatus.Applications[appName] + + applicationStatus := rayv1alpha1.AppStatus{ + Message: app.Message, + Status: app.Status, + LastUpdateTime: &timeNow, + HealthLastUpdateTime: &timeNow, + Deployments: make(map[string]rayv1alpha1.ServeDeploymentStatus), + } + + // Check app status + if app.Status != rayv1alpha1.ApplicationStatusEnum.RUNNING { + // Check previous app status + if prevApplicationStatus.Status != rayv1alpha1.ApplicationStatusEnum.RUNNING { + if prevApplicationStatus.HealthLastUpdateTime != nil { + applicationStatus.HealthLastUpdateTime = prevApplicationStatus.HealthLastUpdateTime + + if time.Since(prevApplicationStatus.HealthLastUpdateTime.Time).Seconds() > serviceUnhealthySecondThreshold { isHealthy = false } } } isReady = false } - } - // Check app status - serveStatuses.ApplicationStatus.LastUpdateTime = &timeNow - serveStatuses.ApplicationStatus.HealthLastUpdateTime = &timeNow - if serveStatuses.ApplicationStatus.Status != rayv1alpha1.ApplicationStatusEnum.RUNNING { - // Check previous app status - if rayServiceServeStatus.ApplicationStatus.Status != rayv1alpha1.ApplicationStatusEnum.RUNNING { - if rayServiceServeStatus.ApplicationStatus.HealthLastUpdateTime != nil { - serveStatuses.ApplicationStatus.HealthLastUpdateTime = rayServiceServeStatus.ApplicationStatus.HealthLastUpdateTime + // Check deployment statuses + for deploymentName, deployment := range app.Deployments { + deploymentStatus := rayv1alpha1.ServeDeploymentStatus{ + Status: deployment.Status, + Message: deployment.Message, + LastUpdateTime: &timeNow, + HealthLastUpdateTime: &timeNow, + } + + if deployment.Status != rayv1alpha1.DeploymentStatusEnum.HEALTHY { + prevStatus, exist := prevApplicationStatus.Deployments[deploymentName] + if exist { + if prevStatus.Status != rayv1alpha1.DeploymentStatusEnum.HEALTHY { + deploymentStatus.HealthLastUpdateTime = prevStatus.HealthLastUpdateTime - if time.Since(rayServiceServeStatus.ApplicationStatus.HealthLastUpdateTime.Time).Seconds() > serviceUnhealthySecondThreshold { - isHealthy = false + if prevStatus.HealthLastUpdateTime != nil && time.Since(prevStatus.HealthLastUpdateTime.Time).Seconds() > serviceUnhealthySecondThreshold { + isHealthy = false + } + } } + isReady = false } + applicationStatus.Deployments[deploymentName] = deploymentStatus } - isReady = false + newApplications[appName] = applicationStatus } - rayServiceServeStatus.ServeStatuses = serveStatuses.DeploymentStatuses - rayServiceServeStatus.ApplicationStatus = serveStatuses.ApplicationStatus - - r.Log.V(1).Info("getAndCheckServeStatus", "statusMap", statusMap, "serveStatuses", serveStatuses) - + rayServiceServeStatus.Applications = newApplications + r.Log.V(1).Info("getAndCheckServeStatus", "new statuses", rayServiceServeStatus.Applications) return isHealthy, isReady, nil } diff --git a/ray-operator/controllers/ray/rayservice_controller_test.go b/ray-operator/controllers/ray/rayservice_controller_test.go index 07886198d21..74260158007 100644 --- a/ray-operator/controllers/ray/rayservice_controller_test.go +++ b/ray-operator/controllers/ray/rayservice_controller_test.go @@ -364,7 +364,7 @@ var _ = Context("Inside the default namespace", func() { // the RayService controller will consider the active RayCluster as unhealthy and prepare a new RayCluster. orignalServeDeploymentUnhealthySecondThreshold := ServiceUnhealthySecondThreshold ServiceUnhealthySecondThreshold = 5 - fakeRayDashboardClient.SetServeStatus(generateServeStatus(metav1.NewTime(time.Now().Add(time.Duration(-5)*time.Minute)), "UNHEALTHY")) + fakeRayDashboardClient.SetServeStatus(generateServeStatus("UNHEALTHY")) Eventually( getPreparingRayClusterNameFunc(ctx, myRayService), time.Second*60, time.Millisecond*500).Should(Not(BeEmpty()), "New pending RayCluster name = %v", myRayService.Status.PendingServiceStatus.RayClusterName) @@ -392,7 +392,7 @@ var _ = Context("Inside the default namespace", func() { // (2) The pending RayCluster's Serve Deployments are HEALTHY. updateHeadPodToRunningAndReady(ctx, initialPendingClusterName) ServiceUnhealthySecondThreshold = orignalServeDeploymentUnhealthySecondThreshold - fakeRayDashboardClient.SetServeStatus(generateServeStatus(metav1.Now(), "HEALTHY")) + fakeRayDashboardClient.SetServeStatus(generateServeStatus("HEALTHY")) Eventually( getPreparingRayClusterNameFunc(ctx, myRayService), time.Second*15, time.Millisecond*500).Should(BeEmpty(), "Pending RayCluster name = %v", myRayService.Status.PendingServiceStatus.RayClusterName) @@ -414,24 +414,24 @@ var _ = Context("Inside the default namespace", func() { orignalServeDeploymentUnhealthySecondThreshold := ServiceUnhealthySecondThreshold ServiceUnhealthySecondThreshold = 500 - // Only update the LastUpdateTime and HealthLastUpdateTime fields in the active RayCluster. - oldTime := myRayService.Status.ActiveServiceStatus.ServeStatuses[0].HealthLastUpdateTime.DeepCopy() - newTime := oldTime.Add(time.Duration(5) * time.Minute) // 300 seconds - fakeRayDashboardClient.SetServeStatus(generateServeStatus(metav1.NewTime(newTime), "UNHEALTHY")) + // Change serve status to be unhealthy + fakeRayDashboardClient.SetServeStatus(generateServeStatus("UNHEALTHY")) // Confirm not switch to a new RayCluster because ServiceUnhealthySecondThreshold is 500 seconds. Consistently( getRayClusterNameFunc(ctx, myRayService), time.Second*3, time.Millisecond*500).Should(Equal(initialClusterName), "Active RayCluster name = %v", myRayService.Status.ActiveServiceStatus.RayClusterName) - // Check if all the ServeStatuses[i].Status are UNHEALTHY. - checkAllServeStatusesUnhealthy := func(ctx context.Context, rayService *rayv1alpha1.RayService) bool { + // Check if all the deployment statuses are UNHEALTHY. + checkAllDeploymentStatusesUnhealthy := func(ctx context.Context, rayService *rayv1alpha1.RayService) bool { if err := k8sClient.Get(ctx, client.ObjectKey{Name: rayService.Name, Namespace: rayService.Namespace}, rayService); err != nil { return false } - for _, serveStatus := range rayService.Status.ActiveServiceStatus.ServeStatuses { - if serveStatus.Status != "UNHEALTHY" { - return false + for _, appStatus := range rayService.Status.ActiveServiceStatus.Applications { + for _, deploymentStatus := range appStatus.Deployments { + if deploymentStatus.Status != "UNHEALTHY" { + return false + } } } return true @@ -443,10 +443,10 @@ var _ = Context("Inside the default namespace", func() { // Note: LastUpdateTime/HealthLastUpdateTime will be overwritten via metav1.Now() in rayservice_controller.go. // Hence, we cannot use `newTime`` to check whether the status is updated or not. Eventually( - checkAllServeStatusesUnhealthy(ctx, myRayService), + checkAllDeploymentStatusesUnhealthy(ctx, myRayService), time.Second*3, time.Millisecond*500).Should(BeTrue(), "myRayService status = %v", myRayService.Status) - fakeRayDashboardClient.SetServeStatus(generateServeStatus(metav1.NewTime(newTime), "HEALTHY")) + fakeRayDashboardClient.SetServeStatus(generateServeStatus("HEALTHY")) // Confirm not switch to a new RayCluster because ServiceUnhealthySecondThreshold is 500 seconds. Consistently( @@ -469,9 +469,8 @@ var _ = Context("Inside the default namespace", func() { time.Second*3, time.Millisecond*500).Should(BeTrue(), "myRayService status = %v", myRayService.Status) // Only update the LastUpdateTime and HealthLastUpdateTime fields in the active RayCluster. - oldTime := myRayService.Status.ActiveServiceStatus.ServeStatuses[0].HealthLastUpdateTime.DeepCopy() - newTime := oldTime.Add(time.Duration(5) * time.Minute) // 300 seconds - fakeRayDashboardClient.SetServeStatus(generateServeStatus(metav1.NewTime(newTime), "HEALTHY")) + oldTime := myRayService.Status.ActiveServiceStatus.Applications["default"].HealthLastUpdateTime.DeepCopy() + fakeRayDashboardClient.SetServeStatus(generateServeStatus("HEALTHY")) // Confirm not switch to a new RayCluster Consistently( @@ -486,7 +485,7 @@ var _ = Context("Inside the default namespace", func() { // Status should not be updated if the only differences are the LastUpdateTime and HealthLastUpdateTime fields. // Unlike the test "Status should be updated if the differences are not only LastUpdateTime and HealthLastUpdateTime fields.", // the status update will not be triggered, so we can check whether the LastUpdateTime/HealthLastUpdateTime fields are updated or not by `oldTime`. - Expect(myRayService.Status.ActiveServiceStatus.ServeStatuses[0].HealthLastUpdateTime).Should(Equal(oldTime), "myRayService status = %v", myRayService.Status) + Expect(myRayService.Status.ActiveServiceStatus.Applications["default"].HealthLastUpdateTime).Should(Equal(oldTime), "myRayService status = %v", myRayService.Status) }) It("Update workerGroup.replicas in RayService and should not switch to new Ray Cluster", func() { @@ -516,7 +515,7 @@ var _ = Context("Inside the default namespace", func() { // Set deployment statuses to UNHEALTHY orignalServeDeploymentUnhealthySecondThreshold := ServiceUnhealthySecondThreshold ServiceUnhealthySecondThreshold = 5 - fakeRayDashboardClient.SetServeStatus(generateServeStatus(metav1.NewTime(time.Now().Add(time.Duration(-5)*time.Minute)), "UNHEALTHY")) + fakeRayDashboardClient.SetServeStatus(generateServeStatus("UNHEALTHY")) Eventually( getPreparingRayClusterNameFunc(ctx, myRayService), @@ -524,7 +523,7 @@ var _ = Context("Inside the default namespace", func() { ServiceUnhealthySecondThreshold = orignalServeDeploymentUnhealthySecondThreshold pendingRayClusterName := myRayService.Status.PendingServiceStatus.RayClusterName - fakeRayDashboardClient.SetServeStatus(generateServeStatus(metav1.Now(), "HEALTHY")) + fakeRayDashboardClient.SetServeStatus(generateServeStatus("HEALTHY")) updateHeadPodToRunningAndReady(ctx, pendingRayClusterName) Eventually( @@ -539,7 +538,7 @@ var _ = Context("Inside the default namespace", func() { initialClusterName, _ := getRayClusterNameFunc(ctx, myRayService)() // The cluster shouldn't switch until deployments are finished updating - fakeRayDashboardClient.SetServeStatus(generateServeStatus(metav1.NewTime(time.Now().Add(time.Duration(-5)*time.Minute)), "UPDATING")) + fakeRayDashboardClient.SetServeStatus(generateServeStatus("UPDATING")) err := retry.RetryOnConflict(retry.DefaultRetry, func() error { Eventually( @@ -562,7 +561,7 @@ var _ = Context("Inside the default namespace", func() { time.Second*5, time.Millisecond*500).Should(Equal(initialClusterName), "My current RayCluster name = %v", myRayService.Status.ActiveServiceStatus.RayClusterName) // The cluster should switch once the deployments are finished updating - fakeRayDashboardClient.SetServeStatus(generateServeStatus(metav1.NewTime(time.Now().Add(time.Duration(-5)*time.Minute)), "HEALTHY")) + fakeRayDashboardClient.SetServeStatus(generateServeStatus("HEALTHY")) updateHeadPodToRunningAndReady(ctx, pendingRayClusterName) Eventually( @@ -575,44 +574,34 @@ var _ = Context("Inside the default namespace", func() { func prepareFakeRayDashboardClient() utils.FakeRayDashboardClient { client := utils.FakeRayDashboardClient{} - client.SetServeStatus(generateServeStatus(metav1.Now(), "HEALTHY")) + client.SetServeStatus(generateServeStatus("HEALTHY")) return client } -func generateServeStatus(time metav1.Time, status string) utils.ServeDeploymentStatuses { - serveStatuses := utils.ServeDeploymentStatuses{ - ApplicationStatus: rayv1alpha1.AppStatus{ - Status: "RUNNING", - LastUpdateTime: &time, - HealthLastUpdateTime: &time, - }, - DeploymentStatuses: []rayv1alpha1.ServeDeploymentStatus{ - { - Name: "shallow", - Status: status, - Message: "", - LastUpdateTime: &time, - HealthLastUpdateTime: &time, +func generateServeStatus(status string) utils.ServeApplicationStatus { + appStatus := utils.ServeApplicationStatus{ + Status: "RUNNING", + Deployments: map[string]utils.ServeDeploymentStatus{ + "shallow": { + Name: "shallow", + Status: status, + Message: "", }, - { - Name: "deep", - Status: status, - Message: "", - LastUpdateTime: &time, - HealthLastUpdateTime: &time, + "deep": { + Name: "deep", + Status: status, + Message: "", }, - { - Name: "one", - Status: status, - Message: "", - LastUpdateTime: &time, - HealthLastUpdateTime: &time, + "one": { + Name: "one", + Status: status, + Message: "", }, }, } - return serveStatuses + return appStatus } func getRayClusterNameFunc(ctx context.Context, rayService *rayv1alpha1.RayService) func() (string, error) { @@ -639,15 +628,25 @@ func checkServiceHealth(ctx context.Context, rayService *rayv1alpha1.RayService) return false, err } - healthy := true + if !rayService.Status.ActiveServiceStatus.DashboardStatus.IsHealthy { + return false, nil + } + + defaultAppStatus, ok := rayService.Status.ActiveServiceStatus.Applications["default"] + if !ok { + return false, fmt.Errorf("default app not found in ActiveServiceStatus") + } - healthy = healthy && rayService.Status.ActiveServiceStatus.DashboardStatus.IsHealthy - healthy = healthy && (len(rayService.Status.ActiveServiceStatus.ServeStatuses) == 3) - healthy = healthy && rayService.Status.ActiveServiceStatus.ServeStatuses[0].Status == "HEALTHY" - healthy = healthy && rayService.Status.ActiveServiceStatus.ServeStatuses[1].Status == "HEALTHY" - healthy = healthy && rayService.Status.ActiveServiceStatus.ServeStatuses[2].Status == "HEALTHY" + if len(defaultAppStatus.Deployments) != 3 { + return false, nil + } + for _, deploymentStatus := range defaultAppStatus.Deployments { + if deploymentStatus.Status != "HEALTHY" { + return false, nil + } + } - return healthy, nil + return true, nil } } diff --git a/ray-operator/controllers/ray/rayservice_controller_unit_test.go b/ray-operator/controllers/ray/rayservice_controller_unit_test.go index f9b8c7e970c..11d602fae22 100644 --- a/ray-operator/controllers/ray/rayservice_controller_unit_test.go +++ b/ray-operator/controllers/ray/rayservice_controller_unit_test.go @@ -85,19 +85,20 @@ func TestInconsistentRayServiceStatuses(t *testing.T) { LastUpdateTime: &timeNow, HealthLastUpdateTime: &timeNow, }, - ApplicationStatus: rayv1alpha1.AppStatus{ - Status: "running", - Message: "OK", - LastUpdateTime: &timeNow, - HealthLastUpdateTime: &timeNow, - }, - ServeStatuses: []rayv1alpha1.ServeDeploymentStatus{ - { - Name: "serve-1", - Status: "unhealthy", - Message: "error", + Applications: map[string]rayv1alpha1.AppStatus{ + "default": { + Status: "running", + Message: "OK", LastUpdateTime: &timeNow, HealthLastUpdateTime: &timeNow, + Deployments: map[string]rayv1alpha1.ServeDeploymentStatus{ + "serve-1": { + Status: "unhealthy", + Message: "error", + LastUpdateTime: &timeNow, + HealthLastUpdateTime: &timeNow, + }, + }, }, }, }, @@ -108,19 +109,20 @@ func TestInconsistentRayServiceStatuses(t *testing.T) { LastUpdateTime: &timeNow, HealthLastUpdateTime: &timeNow, }, - ApplicationStatus: rayv1alpha1.AppStatus{ - Status: "stopped", - Message: "stopped", - LastUpdateTime: &timeNow, - HealthLastUpdateTime: &timeNow, - }, - ServeStatuses: []rayv1alpha1.ServeDeploymentStatus{ - { - Name: "serve-1", - Status: "healthy", - Message: "Serve is healthy", + Applications: map[string]rayv1alpha1.AppStatus{ + "default": { + Status: "stopped", + Message: "stopped", LastUpdateTime: &timeNow, HealthLastUpdateTime: &timeNow, + Deployments: map[string]rayv1alpha1.ServeDeploymentStatus{ + "serve-1": { + Status: "healthy", + Message: "Serve is healthy", + LastUpdateTime: &timeNow, + HealthLastUpdateTime: &timeNow, + }, + }, }, }, }, @@ -150,19 +152,20 @@ func TestInconsistentRayServiceStatus(t *testing.T) { LastUpdateTime: &timeNow, HealthLastUpdateTime: &timeNow, }, - ApplicationStatus: rayv1alpha1.AppStatus{ - Status: "running", - Message: "Application is running", - LastUpdateTime: &timeNow, - HealthLastUpdateTime: &timeNow, - }, - ServeStatuses: []rayv1alpha1.ServeDeploymentStatus{ - { - Name: "serve-1", - Status: "healthy", - Message: "Serve is healthy", + Applications: map[string]rayv1alpha1.AppStatus{ + "default": { + Status: "running", + Message: "Application is running", LastUpdateTime: &timeNow, HealthLastUpdateTime: &timeNow, + Deployments: map[string]rayv1alpha1.ServeDeploymentStatus{ + "serve-1": { + Status: "healthy", + Message: "Serve is healthy", + LastUpdateTime: &timeNow, + HealthLastUpdateTime: &timeNow, + }, + }, }, }, } diff --git a/ray-operator/controllers/ray/utils/dashboard_httpclient.go b/ray-operator/controllers/ray/utils/dashboard_httpclient.go index 415b6f42ccd..9979f99bd74 100644 --- a/ray-operator/controllers/ray/utils/dashboard_httpclient.go +++ b/ray-operator/controllers/ray/utils/dashboard_httpclient.go @@ -29,43 +29,12 @@ const ( ) var ( - DeployPath = "/api/serve/deployments/" - StatusPath = "/api/serve/deployments/status" - JobPath = "/api/jobs/" + DeployPath = "/api/serve/deployments/" + StatusPath = "/api/serve/deployments/status" + ServeDetailsPath = "/api/serve/applications/" + JobPath = "/api/jobs/" ) -// ServeConfigSpec defines the desired state of RayService, used by Ray Dashboard. -type ServeConfigSpec struct { - Name string `json:"name"` - NumReplicas *int32 `json:"num_replicas,omitempty"` - RoutePrefix string `json:"route_prefix,omitempty"` - MaxConcurrentQueries *int32 `json:"max_concurrent_queries,omitempty"` - UserConfig map[string]interface{} `json:"user_config,omitempty"` - AutoscalingConfig map[string]interface{} `json:"autoscaling_config,omitempty"` - GracefulShutdownWaitLoopS *int32 `json:"graceful_shutdown_wait_loop_s,omitempty"` - GracefulShutdownTimeoutS *int32 `json:"graceful_shutdown_timeout_s,omitempty"` - HealthCheckPeriodS *int32 `json:"health_check_period_s,omitempty"` - HealthCheckTimeoutS *int32 `json:"health_check_timeout_s,omitempty"` - RayActorOptions RayActorOptionSpec `json:"ray_actor_options,omitempty"` -} - -// RayActorOptionSpec defines the desired state of RayActor, used by Ray Dashboard. -type RayActorOptionSpec struct { - RuntimeEnv map[string]interface{} `json:"runtime_env,omitempty"` - NumCpus *float64 `json:"num_cpus,omitempty"` - NumGpus *float64 `json:"num_gpus,omitempty"` - Memory *int32 `json:"memory,omitempty"` - ObjectStoreMemory *int32 `json:"object_store_memory,omitempty"` - Resources map[string]interface{} `json:"resources,omitempty"` - AcceleratorType string `json:"accelerator_type,omitempty"` -} - -// ServeDeploymentStatuses defines the current states of all Serve Deployments. -type ServeDeploymentStatuses struct { - ApplicationStatus rayv1alpha1.AppStatus `json:"app_status,omitempty"` - DeploymentStatuses []rayv1alpha1.ServeDeploymentStatus `json:"deployment_statuses,omitempty"` -} - // ServingClusterDeployments defines the request sent to the dashboard api server. // See https://docs.ray.io/en/master/_modules/ray/serve/schema.html#ServeApplicationSchema for more details. type ServingClusterDeployments struct { @@ -79,7 +48,11 @@ type RayDashboardClientInterface interface { InitClient(url string) GetDeployments(context.Context) (string, error) UpdateDeployments(ctx context.Context, spec rayv1alpha1.ServeDeploymentGraphSpec) error - GetDeploymentsStatus(context.Context) (*ServeDeploymentStatuses, error) + // V1/single-app Rest API + GetSingleApplicationStatus(context.Context) (*ServeApplicationStatus, error) + // V2/multi-app Rest API + GetServeDetails(ctx context.Context) (*ServeDetails, error) + GetMultiApplicationStatus(context.Context) (map[string]*ServeApplicationStatus, error) ConvertServeConfig(specs []rayv1alpha1.ServeConfigSpec) []ServeConfigSpec GetJobInfo(ctx context.Context, jobId string) (*RayJobInfo, error) SubmitJob(ctx context.Context, rayJob *rayv1alpha1.RayJob, log *logr.Logger) (jobId string, err error) @@ -230,7 +203,7 @@ func (r *RayDashboardClient) UpdateDeployments(ctx context.Context, spec rayv1al } // GetDeploymentsStatus get the current deployment statuses in the Ray cluster. -func (r *RayDashboardClient) GetDeploymentsStatus(ctx context.Context) (*ServeDeploymentStatuses, error) { +func (r *RayDashboardClient) GetSingleApplicationStatus(ctx context.Context) (*ServeApplicationStatus, error) { req, err := http.NewRequestWithContext(ctx, "GET", r.dashboardURL+StatusPath, nil) if err != nil { return nil, err @@ -248,12 +221,78 @@ func (r *RayDashboardClient) GetDeploymentsStatus(ctx context.Context) (*ServeDe return nil, fmt.Errorf("GetDeploymentsStatus fail: %s %s", resp.Status, string(body)) } - var serveStatuses ServeDeploymentStatuses - if err = json.Unmarshal(body, &serveStatuses); err != nil { + var status ServeSingleApplicationStatusV1 + if err = json.Unmarshal(body, &status); err != nil { + return nil, err + } + + defaultAppStatus := ServeApplicationStatus{ + Name: "default", + Message: status.ApplicationStatus.Message, + Status: status.ApplicationStatus.Status, + Deployments: make(map[string]ServeDeploymentStatus), + } + // r.Log.V(1).Info("getAndCheckServeStatus", "previous default app status", oldDefaultAppStatus) + + for _, deployment := range status.DeploymentStatuses { + deploymentStatus := ServeDeploymentStatus{ + Status: deployment.Status, + Message: deployment.Message, + } + + defaultAppStatus.Deployments[deployment.Name] = deploymentStatus + } + return &defaultAppStatus, nil +} + +func (r *RayDashboardClient) GetMultiApplicationStatus(ctx context.Context) (map[string]*ServeApplicationStatus, error) { + serveDetails, err := r.GetServeDetails(ctx) + if err != nil { + return nil, fmt.Errorf("Failed to get serve details: %v", err) + } + + return r.ConvertServeDetailsToApplicationStatuses(serveDetails) +} + +// GetServeDetails gets details on all live applications on the Ray cluster. +func (r *RayDashboardClient) GetServeDetails(ctx context.Context) (*ServeDetails, error) { + req, err := http.NewRequestWithContext(ctx, "GET", r.dashboardURL+ServeDetailsPath, nil) + if err != nil { + return nil, err + } + + resp, err := r.client.Do(req) + if err != nil { return nil, err } + defer resp.Body.Close() + + body, _ := ioutil.ReadAll(resp.Body) + + if resp.StatusCode < 200 || resp.StatusCode > 299 { + return nil, fmt.Errorf("GetServeDetails fail: %s %s", resp.Status, string(body)) + } + + var serveDetails ServeDetails + if err = json.Unmarshal(body, &serveDetails); err != nil { + return nil, fmt.Errorf("GetServeDetails failed. Failed to unmarshal bytes: %s", string(body)) + } + + return &serveDetails, nil +} + +func (r *RayDashboardClient) ConvertServeDetailsToApplicationStatuses(serveDetails *ServeDetails) (map[string]*ServeApplicationStatus, error) { + detailsJson, err := json.Marshal(serveDetails.Applications) + if err != nil { + return nil, fmt.Errorf("Failed to marshal serve details.") + } + + applicationStatuses := map[string]*ServeApplicationStatus{} + if err = json.Unmarshal(detailsJson, &applicationStatuses); err != nil { + return nil, fmt.Errorf("Failed to unmarshal serve details bytes into map of application statuses: %v", err) + } - return &serveStatuses, nil + return applicationStatuses, nil } func (r *RayDashboardClient) ConvertServeConfig(specs []rayv1alpha1.ServeConfigSpec) []ServeConfigSpec { diff --git a/ray-operator/controllers/ray/utils/fake_serve_httpclient.go b/ray-operator/controllers/ray/utils/fake_serve_httpclient.go index e131b0e1973..6babfeed3b9 100644 --- a/ray-operator/controllers/ray/utils/fake_serve_httpclient.go +++ b/ray-operator/controllers/ray/utils/fake_serve_httpclient.go @@ -12,9 +12,11 @@ import ( ) type FakeRayDashboardClient struct { - client http.Client - dashboardURL string - serveStatuses ServeDeploymentStatuses + client http.Client + dashboardURL string + singleAppStatus ServeApplicationStatus + multiAppStatuses map[string]*ServeApplicationStatus + serveDetails ServeDetails } var _ RayDashboardClientInterface = (*FakeRayDashboardClient)(nil) @@ -33,8 +35,16 @@ func (r *FakeRayDashboardClient) UpdateDeployments(_ context.Context, specs rayv return nil } -func (r *FakeRayDashboardClient) GetDeploymentsStatus(_ context.Context) (*ServeDeploymentStatuses, error) { - return &r.serveStatuses, nil +func (r *FakeRayDashboardClient) GetSingleApplicationStatus(_ context.Context) (*ServeApplicationStatus, error) { + return &r.singleAppStatus, nil +} + +func (r *FakeRayDashboardClient) GetMultiApplicationStatus(_ context.Context) (map[string]*ServeApplicationStatus, error) { + return r.multiAppStatuses, nil +} + +func (r *FakeRayDashboardClient) GetServeDetails(_ context.Context) (*ServeDetails, error) { + return &r.serveDetails, nil } func (r *FakeRayDashboardClient) ConvertServeConfig(specs []rayv1alpha1.ServeConfigSpec) []ServeConfigSpec { @@ -79,8 +89,8 @@ func (r *FakeRayDashboardClient) ConvertServeConfig(specs []rayv1alpha1.ServeCon return serveConfigToSend } -func (r *FakeRayDashboardClient) SetServeStatus(status ServeDeploymentStatuses) { - r.serveStatuses = status +func (r *FakeRayDashboardClient) SetServeStatus(status ServeApplicationStatus) { + r.singleAppStatus = status } func (r *FakeRayDashboardClient) GetJobInfo(_ context.Context, jobId string) (*RayJobInfo, error) { diff --git a/ray-operator/controllers/ray/utils/serve_api_models.go b/ray-operator/controllers/ray/utils/serve_api_models.go new file mode 100644 index 00000000000..71c4f1d37fd --- /dev/null +++ b/ray-operator/controllers/ray/utils/serve_api_models.go @@ -0,0 +1,82 @@ +package utils + +type RayServeConfigType string + +const ( + MULTI_APP RayServeConfigType = "MULTI_APP" + SINGLE_APP RayServeConfigType = "SINGLE_APP" +) + +// V1 Serve API Response format +type ServeAppStatusInfoV1 struct { + Status string `json:"status,omitempty"` + Message string `json:"message,omitempty"` +} + +type ServeSingleApplicationStatusV1 struct { + ApplicationStatus ServeAppStatusInfoV1 `json:"app_status,omitempty"` + DeploymentStatuses []ServeDeploymentStatus `json:"deployment_statuses,omitempty"` +} + +// ServeDeploymentStatus and ServeApplicationStatus describe the format of status(es) that will +// be returned by GetSingleApplicationStatus and GetMultiApplicationStatus methods of the dashboard client +// Describes the status of a deployment +type ServeDeploymentStatus struct { + Name string `json:"name,omitempty"` + Status string `json:"status,omitempty"` + Message string `json:"message,omitempty"` +} + +// Describes the status of an application +type ServeApplicationStatus struct { + Name string `json:"name,omitempty"` + Status string `json:"status"` + Message string `json:"message,omitempty"` + Deployments map[string]ServeDeploymentStatus `json:"deployments"` +} + +// V2 Serve API Response format. These extend the ServeDeploymentStatus and ServeApplicationStatus structs, +// but contain more information such as route prefix because the V2/multi-app GET API fetchs general metadata, +// not just statuses. +type ServeDeploymentDetails struct { + ServeDeploymentStatus + RoutePrefix string `json:"route_prefix,omitempty"` +} + +type ServeApplicationDetails struct { + ServeApplicationStatus + RoutePrefix string `json:"route_prefix,omitempty"` + DocsPath string `json:"docs_path,omitempty"` + Deployments map[string]ServeDeploymentDetails `json:"deployments"` +} + +type ServeDetails struct { + Applications map[string]ServeApplicationDetails `json:"applications"` + DeployMode string `json:"deploy_mode,omitempty"` +} + +// ServeConfigSpec defines the desired state of RayService, used by Ray Dashboard. +type ServeConfigSpec struct { + Name string `json:"name"` + NumReplicas *int32 `json:"num_replicas,omitempty"` + RoutePrefix string `json:"route_prefix,omitempty"` + MaxConcurrentQueries *int32 `json:"max_concurrent_queries,omitempty"` + UserConfig map[string]interface{} `json:"user_config,omitempty"` + AutoscalingConfig map[string]interface{} `json:"autoscaling_config,omitempty"` + GracefulShutdownWaitLoopS *int32 `json:"graceful_shutdown_wait_loop_s,omitempty"` + GracefulShutdownTimeoutS *int32 `json:"graceful_shutdown_timeout_s,omitempty"` + HealthCheckPeriodS *int32 `json:"health_check_period_s,omitempty"` + HealthCheckTimeoutS *int32 `json:"health_check_timeout_s,omitempty"` + RayActorOptions RayActorOptionSpec `json:"ray_actor_options,omitempty"` +} + +// RayActorOptionSpec defines the desired state of RayActor, used by Ray Dashboard. +type RayActorOptionSpec struct { + RuntimeEnv map[string]interface{} `json:"runtime_env,omitempty"` + NumCpus *float64 `json:"num_cpus,omitempty"` + NumGpus *float64 `json:"num_gpus,omitempty"` + Memory *int32 `json:"memory,omitempty"` + ObjectStoreMemory *int32 `json:"object_store_memory,omitempty"` + Resources map[string]interface{} `json:"resources,omitempty"` + AcceleratorType string `json:"accelerator_type,omitempty"` +}