Only scale up zone after all leader zone replicas are ready #164

Merged: 7 commits, Aug 13, 2024
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,8 @@

## main / unreleased

* [FEATURE] Only scale up once all of the leader `StatefulSet` replicas are ready. #164

## v0.17.1

* [ENHANCEMENT] prepare-downscale admission webhook: undo prepare-shutdown calls if adding the `last-downscale` annotation fails. #151
10 changes: 8 additions & 2 deletions README.md
@@ -33,7 +33,11 @@ For each **rollout group**, the operator **guarantees**:

The operator can also optionally coordinate scaling up and down of `StatefulSets` that are part of the same `rollout-group` based on the `grafana.com/rollout-downscale-leader` annotation. When using this feature, the `grafana.com/min-time-between-zones-downscale` label must also be set on each `StatefulSet`.

This can be useful for automating the tedious scaling of stateful services like Mimir ingesters. Making use of this feature requires adding a few annotations and labels to configure how it works.

If the `grafana.com/rollout-leader-ready` annotation is set to `true` on a follower `StatefulSet`, the operator will only scale up the follower once all replicas in the leader `StatefulSet` are `ready`. This ensures that the follower zone does not scale up until the leader zone is completely stable.

Example usage for a multi-AZ ingester group:

- For `ingester-zone-a`, add the following:
- Labels:
@@ -47,7 +51,8 @@ This can be useful for automating the tedious scaling of stateful services like
- `grafana.com/min-time-between-zones-downscale=12h` (change the value here to an appropriate duration)
- `grafana.com/prepare-downscale=true` (to allow the service to be notified when it will be scaled down)
- Annotations:
    - `grafana.com/rollout-downscale-leader=ingester-zone-a` (zone `b` will follow zone `a`, after a delay)
- `grafana.com/rollout-leader-ready=true` (zone `b` will only scale up once all replicas in zone `a` are ready)
- `grafana.com/prepare-downscale-http-path=ingester/prepare-shutdown` (to call a specific endpoint on the service)
- `grafana.com/prepare-downscale-http-port=80` (to call a specific endpoint on the service)
- For `ingester-zone-c`, add the following:
@@ -56,6 +61,7 @@ This can be useful for automating the tedious scaling of stateful services like
- `grafana.com/prepare-downscale=true` (to allow the service to be notified when it will be scaled down)
- Annotations:
- `grafana.com/rollout-downscale-leader=ingester-zone-b` (zone `c` will follow zone `b`, after a delay)
- `grafana.com/rollout-leader-ready=true` (zone `c` will only scale up once all replicas in zone `b` are ready)
- `grafana.com/prepare-downscale-http-path=ingester/prepare-shutdown` (to call a specific endpoint on the service)
- `grafana.com/prepare-downscale-http-port=80` (to call a specific endpoint on the service)
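Putting the labels and annotations above into manifest form, the zone-b `StatefulSet` metadata might look like this (a sketch only: the `StatefulSet` name, `rollout-group` value, and replica count are illustrative, and the keys are taken from the README lists above):

```yaml
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: ingester-zone-b
  labels:
    rollout-group: ingester
    grafana.com/min-time-between-zones-downscale: 12h
    grafana.com/prepare-downscale: "true"
  annotations:
    grafana.com/rollout-downscale-leader: ingester-zone-a
    grafana.com/rollout-leader-ready: "true"
    grafana.com/prepare-downscale-http-path: ingester/prepare-shutdown
    grafana.com/prepare-downscale-http-port: "80"
spec:
  replicas: 3
```

Note that Kubernetes annotation values are strings, so boolean-like and numeric values (`"true"`, `"80"`) must be quoted.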

3 changes: 3 additions & 0 deletions pkg/config/config.go
@@ -28,6 +28,9 @@ const (
// RolloutDownscaleLeaderAnnotationKey is the name of the leader statefulset that should be used to determine
// the number of replicas in a follower statefulset.
RolloutDownscaleLeaderAnnotationKey = "grafana.com/rollout-downscale-leader"
	// RolloutLeaderReadyAnnotationKey controls whether a follower only scales up once the leader's `ready` replicas match its desired replicas.
RolloutLeaderReadyAnnotationKey = "grafana.com/rollout-leader-ready"
RolloutLeaderReadyAnnotationValue = "true"

rolloutMirrorReplicasFromResourceAnnotationKeyPrefix = "grafana.com/rollout-mirror-replicas-from-resource"
// RolloutMirrorReplicasFromResourceNameAnnotationKey -- when set (together with "kind" and optionally "api-version" annotations), rollout-operator sets number of
10 changes: 9 additions & 1 deletion pkg/controller/replicas.go
@@ -21,7 +21,15 @@ func desiredStsReplicas(group string, sts *v1.StatefulSet, all []*v1.StatefulSet

leaderReplicas := *leader.Spec.Replicas
if leaderReplicas > followerReplicas {
// Handle scale-up scenarios
annotations := sts.GetAnnotations()
onlyWhenReady, ok := annotations[config.RolloutLeaderReadyAnnotationKey]
if ok && onlyWhenReady == config.RolloutLeaderReadyAnnotationValue {
// We only scale up once all of the leader pods are ready. Otherwise we do nothing.
if leaderReplicas != leader.Status.ReadyReplicas {
return followerReplicas, nil
}
}
return leaderReplicas, nil
}
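The gating rule above can be distilled into a small pure function (a simplified sketch, not the operator's actual code: it takes the relevant replica counts directly instead of reading annotations, and omits the operator's separate time-based downscale checks):

```go
package main

import "fmt"

// desiredFollowerReplicas mirrors the scale-up gate: when gateOnReady is
// set, a follower only grows to match the leader once every leader
// replica reports ready. Scale-down passes through unconditionally here.
func desiredFollowerReplicas(leaderSpec, leaderReady, follower int32, gateOnReady bool) int32 {
	scalingUp := leaderSpec > follower
	if scalingUp && gateOnReady && leaderReady != leaderSpec {
		return follower // hold steady: the leader is not fully ready yet
	}
	return leaderSpec
}

func main() {
	fmt.Println(desiredFollowerReplicas(4, 3, 2, true))  // leader not ready: follower stays at 2
	fmt.Println(desiredFollowerReplicas(4, 4, 2, true))  // leader fully ready: follower scales to 4
	fmt.Println(desiredFollowerReplicas(4, 3, 2, false)) // gate disabled: scales up immediately
}
```

This matches the behavior exercised by the tests below: an unready leader holds the follower, a fully ready leader releases the scale-up, and a missing annotation (gate disabled) scales up immediately.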

71 changes: 71 additions & 0 deletions pkg/controller/replicas_test.go
@@ -211,6 +211,77 @@ func TestReconcileStsReplicas(t *testing.T) {
require.Equal(t, int32(4), replicas)
})

t.Run("scale up only when all leader replicas are ready", func(t *testing.T) {
sts1 := mockStatefulSet("test-zone-a", withReplicas(4, 3))
sts2 := mockStatefulSet("test-zone-b", withReplicas(2, 2), withAnnotations(map[string]string{
config.RolloutDownscaleLeaderAnnotationKey: "test-zone-a",
config.RolloutLeaderReadyAnnotationKey: config.RolloutLeaderReadyAnnotationValue,
}))
sts3 := mockStatefulSet("test-zone-c", withReplicas(1, 1), withAnnotations(map[string]string{
config.RolloutDownscaleLeaderAnnotationKey: "test-zone-b",
config.RolloutLeaderReadyAnnotationKey: config.RolloutLeaderReadyAnnotationValue,
}))

replicas, err := desiredStsReplicas("test", sts2, []*v1.StatefulSet{sts1, sts2, sts3}, log.NewNopLogger())
require.NoError(t, err)
require.Equal(t, int32(2), replicas, "no change in replicas because leader isn't ready yet")

sts1 = mockStatefulSet("test-zone-a", withReplicas(4, 4))
replicasB, err := desiredStsReplicas("test", sts2, []*v1.StatefulSet{sts1, sts2, sts3}, log.NewNopLogger())
require.NoError(t, err)
require.Equal(t, int32(4), replicasB, "ready to scale zone-b")

sts2 = mockStatefulSet("test-zone-b", withReplicas(replicasB, 2))
replicasC, err := desiredStsReplicas("test", sts3, []*v1.StatefulSet{sts1, sts2, sts3}, log.NewNopLogger())
require.NoError(t, err)
require.Equal(t, int32(1), replicasC, "no change in replicas because zone-b isn't ready yet")

sts2 = mockStatefulSet("test-zone-b", withReplicas(replicasB, replicasB))
replicasC, err = desiredStsReplicas("test", sts3, []*v1.StatefulSet{sts1, sts2, sts3}, log.NewNopLogger())
require.NoError(t, err)
require.Equal(t, int32(4), replicasC, "ready to scale zone-c")
})

t.Run("scale up ignoring ready replicas", func(t *testing.T) {
sts1 := mockStatefulSet("test-zone-a", withReplicas(4, 2))
sts2 := mockStatefulSet("test-zone-b", withReplicas(3, 3), withAnnotations(map[string]string{
config.RolloutDownscaleLeaderAnnotationKey: "test-zone-a",
}))

replicas, err := desiredStsReplicas("test", sts2, []*v1.StatefulSet{sts1, sts2}, log.NewNopLogger())
require.NoError(t, err)
require.Equal(t, int32(4), replicas)
})

t.Run("scaling down always occurs to desired replicas", func(t *testing.T) {
sts1 := mockStatefulSet("test-zone-a", withReplicas(2, 1))
sts2 := mockStatefulSet("test-zone-b", withReplicas(3, 3), withAnnotations(map[string]string{
config.RolloutDownscaleLeaderAnnotationKey: "test-zone-a",
config.RolloutLeaderReadyAnnotationKey: config.RolloutLeaderReadyAnnotationValue,
}))

replicas, err := desiredStsReplicas("test", sts2, []*v1.StatefulSet{sts1, sts2}, log.NewNopLogger())
require.NoError(t, err)
require.Equal(t, int32(2), replicas)
})

t.Run("do not scale up follower if not all leader replicas are ready", func(t *testing.T) {
sts1 := mockStatefulSet("test-zone-a", withReplicas(6, 4))
sts2 := mockStatefulSet("test-zone-b", withReplicas(3, 3), withAnnotations(map[string]string{
config.RolloutDownscaleLeaderAnnotationKey: "test-zone-a",
config.RolloutLeaderReadyAnnotationKey: config.RolloutLeaderReadyAnnotationValue,
}))

replicas, err := desiredStsReplicas("test", sts2, []*v1.StatefulSet{sts1, sts2}, log.NewNopLogger())
require.NoError(t, err)
require.Equal(t, int32(3), replicas, "no change in replicas because leader isn't ready yet")

sts1 = mockStatefulSet("test-zone-a", withReplicas(6, 6))
replicas, err = desiredStsReplicas("test", sts2, []*v1.StatefulSet{sts1, sts2}, log.NewNopLogger())
require.NoError(t, err)
require.Equal(t, int32(6), replicas, "ready to scale zone-b")
})

t.Run("scale down min time error", func(t *testing.T) {
downscale1 := time.Now().UTC().Round(time.Second).Add(-72 * time.Hour)
downscale2 := time.Now().UTC().Round(time.Second).Add(-60 * time.Hour)