From 368dd360e01bc92a83caadc3253c0e871385fc48 Mon Sep 17 00:00:00 2001 From: Archit Kulkarni Date: Thu, 29 Dec 2022 16:38:57 -0800 Subject: [PATCH 1/5] Reconcile when cluster pods are Pending Signed-off-by: Archit Kulkarni --- ray-operator/apis/ray/v1alpha1/raycluster_types.go | 1 + ray-operator/controllers/ray/raycluster_controller.go | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/ray-operator/apis/ray/v1alpha1/raycluster_types.go b/ray-operator/apis/ray/v1alpha1/raycluster_types.go index a3ea2a4919..6aa5eecfb2 100644 --- a/ray-operator/apis/ray/v1alpha1/raycluster_types.go +++ b/ray-operator/apis/ray/v1alpha1/raycluster_types.go @@ -101,6 +101,7 @@ const ( Ready ClusterState = "ready" Unhealthy ClusterState = "unhealthy" Failed ClusterState = "failed" + Pending ClusterState = "pending" ) // RayClusterStatus defines the observed state of RayCluster diff --git a/ray-operator/controllers/ray/raycluster_controller.go b/ray-operator/controllers/ray/raycluster_controller.go index 34875b3f66..f011f216aa 100644 --- a/ray-operator/controllers/ray/raycluster_controller.go +++ b/ray-operator/controllers/ray/raycluster_controller.go @@ -233,6 +233,9 @@ func (r *RayClusterReconciler) rayClusterReconcile(request ctrl.Request, instanc r.Log.Error(err, "Update status error", "cluster name", request.Name) } } + if instance.Status.State == rayiov1alpha1.Pending { + return ctrl.Result{RequeueAfter: DefaultRequeueDuration}, nil + } return ctrl.Result{}, nil } @@ -858,6 +861,8 @@ func (r *RayClusterReconciler) updateStatus(instance *rayiov1alpha1.RayCluster) } else { if utils.CheckAllPodsRunnning(runtimePods) { instance.Status.State = rayiov1alpha1.Ready + } else { + instance.Status.State = rayiov1alpha1.Pending } } From 37fec45b0b9bb14f4b4db555c3aae884074e72ef Mon Sep 17 00:00:00 2001 From: Archit Kulkarni Date: Fri, 30 Dec 2022 12:55:48 -0800 Subject: [PATCH 2/5] Revert "Reconcile when cluster pods are Pending" This reverts commit 368dd360e01bc92a83caadc3253c0e871385fc48. 
--- ray-operator/apis/ray/v1alpha1/raycluster_types.go | 1 - ray-operator/controllers/ray/raycluster_controller.go | 5 ----- 2 files changed, 6 deletions(-) diff --git a/ray-operator/apis/ray/v1alpha1/raycluster_types.go b/ray-operator/apis/ray/v1alpha1/raycluster_types.go index 6aa5eecfb2..a3ea2a4919 100644 --- a/ray-operator/apis/ray/v1alpha1/raycluster_types.go +++ b/ray-operator/apis/ray/v1alpha1/raycluster_types.go @@ -101,7 +101,6 @@ const ( Ready ClusterState = "ready" Unhealthy ClusterState = "unhealthy" Failed ClusterState = "failed" - Pending ClusterState = "pending" ) // RayClusterStatus defines the observed state of RayCluster diff --git a/ray-operator/controllers/ray/raycluster_controller.go b/ray-operator/controllers/ray/raycluster_controller.go index f011f216aa..34875b3f66 100644 --- a/ray-operator/controllers/ray/raycluster_controller.go +++ b/ray-operator/controllers/ray/raycluster_controller.go @@ -233,9 +233,6 @@ func (r *RayClusterReconciler) rayClusterReconcile(request ctrl.Request, instanc r.Log.Error(err, "Update status error", "cluster name", request.Name) } } - if instance.Status.State == rayiov1alpha1.Pending { - return ctrl.Result{RequeueAfter: DefaultRequeueDuration}, nil - } return ctrl.Result{}, nil } @@ -861,8 +858,6 @@ func (r *RayClusterReconciler) updateStatus(instance *rayiov1alpha1.RayCluster) } else { if utils.CheckAllPodsRunnning(runtimePods) { instance.Status.State = rayiov1alpha1.Ready - } else { - instance.Status.State = rayiov1alpha1.Pending } } From f4486df462ead45117d34b68f474561da9298457 Mon Sep 17 00:00:00 2001 From: Archit Kulkarni Date: Fri, 30 Dec 2022 13:20:29 -0800 Subject: [PATCH 3/5] Requeue RayCluster reconciliation if "successful" Signed-off-by: Archit Kulkarni --- ray-operator/controllers/ray/raycluster_controller.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ray-operator/controllers/ray/raycluster_controller.go b/ray-operator/controllers/ray/raycluster_controller.go index 
34875b3f66..9ca2b56446 100644 --- a/ray-operator/controllers/ray/raycluster_controller.go +++ b/ray-operator/controllers/ray/raycluster_controller.go @@ -234,7 +234,7 @@ func (r *RayClusterReconciler) rayClusterReconcile(request ctrl.Request, instanc } } - return ctrl.Result{}, nil + return ctrl.Result{RequeueAfter: DefaultRequeueDuration}, nil } func (r *RayClusterReconciler) reconcileIngress(instance *rayiov1alpha1.RayCluster) error { From 251a18f320246b66efa3e9ecaca53613c4656665 Mon Sep 17 00:00:00 2001 From: Archit Kulkarni Date: Fri, 20 Jan 2023 10:56:11 -0800 Subject: [PATCH 4/5] Change unconditional requeue from 2s to 60s Signed-off-by: Archit Kulkarni --- ray-operator/controllers/ray/raycluster_controller.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ray-operator/controllers/ray/raycluster_controller.go b/ray-operator/controllers/ray/raycluster_controller.go index 9ca2b56446..b5666b7a18 100644 --- a/ray-operator/controllers/ray/raycluster_controller.go +++ b/ray-operator/controllers/ray/raycluster_controller.go @@ -234,7 +234,8 @@ func (r *RayClusterReconciler) rayClusterReconcile(request ctrl.Request, instanc } } - return ctrl.Result{RequeueAfter: DefaultRequeueDuration}, nil + // Unconditionally requeue after 60 seconds. 
+ return ctrl.Result{RequeueAfter: 60 * time.Second}, nil } func (r *RayClusterReconciler) reconcileIngress(instance *rayiov1alpha1.RayCluster) error { From 9d50cbd5d39e118114f2b3f2d0a7a9dcf560b8e1 Mon Sep 17 00:00:00 2001 From: Archit Kulkarni Date: Mon, 23 Jan 2023 12:41:42 -0800 Subject: [PATCH 5/5] Use env var with default 300s and log Signed-off-by: Archit Kulkarni --- .../controllers/ray/raycluster_controller.go | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/ray-operator/controllers/ray/raycluster_controller.go b/ray-operator/controllers/ray/raycluster_controller.go index b5666b7a18..7b8fe7f128 100644 --- a/ray-operator/controllers/ray/raycluster_controller.go +++ b/ray-operator/controllers/ray/raycluster_controller.go @@ -3,6 +3,8 @@ package ray import ( "context" "fmt" + "os" + "strconv" "strings" "time" @@ -234,8 +236,17 @@ func (r *RayClusterReconciler) rayClusterReconcile(request ctrl.Request, instanc } } - // Unconditionally requeue after 60 seconds. - return ctrl.Result{RequeueAfter: 60 * time.Second}, nil + // Unconditionally requeue after the number of seconds specified in the + // environment variable RAYCLUSTER_DEFAULT_RECONCILE_LOOP_S. If the + // environment variable is unset or is not a valid integer, requeue after 5 minutes. + var requeueAfterSeconds int + requeueAfterSeconds, err := strconv.Atoi(os.Getenv("RAYCLUSTER_DEFAULT_RECONCILE_LOOP_S")) + if err != nil { + r.Log.Info("RAYCLUSTER_DEFAULT_RECONCILE_LOOP_S is not set, using default value 300s", "cluster name", request.Name) + requeueAfterSeconds = 5 * 60 + } + r.Log.Info("Unconditional requeue after", "cluster name", request.Name, "seconds", requeueAfterSeconds) + return ctrl.Result{RequeueAfter: time.Duration(requeueAfterSeconds) * time.Second}, nil } func (r *RayClusterReconciler) reconcileIngress(instance *rayiov1alpha1.RayCluster) error {