Skip to content

Commit

Permalink
feat: update RayCluster .status.reason field with pod creation error
Browse files Browse the repository at this point in the history
fixes #603

### Example before this change: a Pod can't be created because the ResourceQuota is exceeded

```
kubectl get rayclusters dxia-test2 -o yaml
...
status:
  state: failed

kubectl describe rayclusters dxia-test2
...
Status:
  State:   failed
```

### Example after this change: a Pod can't be created because the ResourceQuota is exceeded

```
kubectl get rayclusters dxia-test2 -o yaml
...
status:
  reason: 'pods "dxia-test2-head-lbvdc" is forbidden: exceeded quota: quota, requested:
    limits.cpu=15,requests.cpu=15, used: limits.cpu=60,requests.cpu=60, limited: limits.cpu=10,requests.cpu=10'
  state: failed

kubectl describe rayclusters dxia-test2
...
Status:
  Reason:  pods "dxia-test2-head-9mdm5" is forbidden: exceeded quota: quota, requested: limits.cpu=15,requests.cpu=15, used: limits.cpu=60,requests.cpu=60, limited: limits.cpu=10,requests.cpu=10
  State:   failed
```
  • Loading branch information
davidxia committed Nov 1, 2022
1 parent 389ba00 commit ac82334
Show file tree
Hide file tree
Showing 9 changed files with 71 additions and 0 deletions.
3 changes: 3 additions & 0 deletions helm-chart/kuberay-operator/crds/ray.io_rayclusters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11138,6 +11138,9 @@ spec:
each node group.
format: int32
type: integer
reason:
description: Reason provides more information about current State
type: string
state:
description: 'INSERT ADDITIONAL STATUS FIELD - define observed state
of cluster Important: Run "make" to regenerat'
Expand Down
3 changes: 3 additions & 0 deletions helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11696,6 +11696,9 @@ spec:
of each node group.
format: int32
type: integer
reason:
description: Reason provides more information about current State
type: string
state:
description: 'INSERT ADDITIONAL STATUS FIELD - define observed
state of cluster Important: Run "make" to regenerat'
Expand Down
8 changes: 8 additions & 0 deletions helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11752,6 +11752,10 @@ spec:
of each node group.
format: int32
type: integer
reason:
description: Reason provides more information about current
State
type: string
state:
description: 'INSERT ADDITIONAL STATUS FIELD - define observed
state of cluster Important: Run "make" to regenerat'
Expand Down Expand Up @@ -11851,6 +11855,10 @@ spec:
of each node group.
format: int32
type: integer
reason:
description: Reason provides more information about current
State
type: string
state:
description: 'INSERT ADDITIONAL STATUS FIELD - define observed
state of cluster Important: Run "make" to regenerat'
Expand Down
2 changes: 2 additions & 0 deletions ray-operator/apis/ray/v1alpha1/raycluster_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,8 @@ type RayClusterStatus struct {
LastUpdateTime *metav1.Time `json:"lastUpdateTime,omitempty"`
// Service Endpoints
Endpoints map[string]string `json:"endpoints,omitempty"`
// Reason provides more information about current State
Reason string `json:"reason,omitempty"`
}

// RayNodeType the type of a ray node: head/worker
Expand Down
3 changes: 3 additions & 0 deletions ray-operator/config/crd/bases/ray.io_rayclusters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11138,6 +11138,9 @@ spec:
each node group.
format: int32
type: integer
reason:
description: Reason provides more information about current State
type: string
state:
description: 'INSERT ADDITIONAL STATUS FIELD - define observed state
of cluster Important: Run "make" to regenerat'
Expand Down
3 changes: 3 additions & 0 deletions ray-operator/config/crd/bases/ray.io_rayjobs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11696,6 +11696,9 @@ spec:
of each node group.
format: int32
type: integer
reason:
description: Reason provides more information about current State
type: string
state:
description: 'INSERT ADDITIONAL STATUS FIELD - define observed
state of cluster Important: Run "make" to regenerat'
Expand Down
8 changes: 8 additions & 0 deletions ray-operator/config/crd/bases/ray.io_rayservices.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11752,6 +11752,10 @@ spec:
of each node group.
format: int32
type: integer
reason:
description: Reason provides more information about current
State
type: string
state:
description: 'INSERT ADDITIONAL STATUS FIELD - define observed
state of cluster Important: Run "make" to regenerat'
Expand Down Expand Up @@ -11851,6 +11855,10 @@ spec:
of each node group.
format: int32
type: integer
reason:
description: Reason provides more information about current
State
type: string
state:
description: 'INSERT ADDITIONAL STATUS FIELD - define observed
state of cluster Important: Run "make" to regenerat'
Expand Down
8 changes: 8 additions & 0 deletions ray-operator/controllers/ray/raycluster_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,9 @@ func (r *RayClusterReconciler) rayClusterReconcile(request ctrl.Request, instanc
if updateErr := r.updateClusterState(instance, rayiov1alpha1.Failed); updateErr != nil {
r.Log.Error(updateErr, "RayCluster update state error", "cluster name", request.Name)
}
if updateErr := r.updateClusterReason(instance, err.Error()); updateErr != nil {
r.Log.Error(updateErr, "RayCluster update reason error", "cluster name", request.Name)
}
return ctrl.Result{RequeueAfter: DefaultRequeueDuration}, err
}
// update the status if needed
Expand Down Expand Up @@ -979,3 +982,8 @@ func (r *RayClusterReconciler) updateClusterState(instance *rayiov1alpha1.RayClu
instance.Status.State = clusterState
return r.Status().Update(context.Background(), instance)
}

// updateClusterReason stores clusterReason in instance.Status.Reason and then
// writes the instance's status through the reconciler's status writer using a
// background context. It returns whatever error that status update yields.
func (r *RayClusterReconciler) updateClusterReason(instance *rayiov1alpha1.RayCluster, clusterReason string) error {
	instance.Status.Reason = clusterReason

	ctx := context.Background()
	statusWriter := r.Status()
	return statusWriter.Update(ctx, instance)
}
33 changes: 33 additions & 0 deletions ray-operator/controllers/ray/raycluster_controller_fake_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -753,6 +753,39 @@ func TestReconcile_AutoscalerRoleBinding(t *testing.T) {
assert.Nil(t, err, "Fail to get autoscaler RoleBinding after reconciliation")
}

// TestReconcile_UpdateClusterReason checks that updateClusterReason writes the
// supplied reason into the RayCluster status held by the fake client.
func TestReconcile_UpdateClusterReason(t *testing.T) {
	setupTest(t)
	defer tearDown(t)
	newScheme := runtime.NewScheme()
	_ = rayiov1alpha1.AddToScheme(newScheme)

	// Seed the fake client with the shared test cluster.
	fakeClient := clientFake.NewClientBuilder().WithScheme(newScheme).WithRuntimeObjects(testRayCluster).Build()

	namespacedName := types.NamespacedName{
		Name:      instanceName,
		Namespace: namespaceStr,
	}

	// Precondition: the stored cluster has no reason yet.
	cluster := rayiov1alpha1.RayCluster{}
	err := fakeClient.Get(context.Background(), namespacedName, &cluster)
	assert.Nil(t, err, "Fail to get RayCluster")
	assert.Empty(t, cluster.Status.Reason, "Cluster reason should be empty")

	testRayClusterReconciler := &RayClusterReconciler{
		Client:   fakeClient,
		Recorder: &record.FakeRecorder{},
		Scheme:   scheme.Scheme,
		Log:      ctrl.Log.WithName("controllers").WithName("RayCluster"),
	}
	reason := "test reason"

	err = testRayClusterReconciler.updateClusterReason(testRayCluster, reason)
	assert.Nil(t, err, "Fail to update cluster reason")

	// Re-read from the fake client so the assertion covers the persisted
	// status rather than the in-memory object passed to the reconciler.
	err = fakeClient.Get(context.Background(), namespacedName, &cluster)
	assert.Nil(t, err, "Fail to get RayCluster after updating reason")
	// assert.Equal takes (expected, actual); this order keeps the failure
	// output labelled correctly.
	assert.Equal(t, reason, cluster.Status.Reason, "Cluster reason should be updated")
}

func TestUpdateEndpoints(t *testing.T) {
setupTest(t)
defer tearDown(t)
Expand Down

0 comments on commit ac82334

Please sign in to comment.