From 6c216254b0405f81313bfc2d56174c4d2ea0a379 Mon Sep 17 00:00:00 2001 From: zachaller Date: Tue, 29 Nov 2022 11:04:07 -0600 Subject: [PATCH 01/15] fix(trafficrouter): WIP on not setting weight if not available Signed-off-by: zachaller --- rollout/trafficrouting.go | 26 ++++++++++++++++++++++++++ test/e2e/functional_test.go | 2 +- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/rollout/trafficrouting.go b/rollout/trafficrouting.go index ebc22b9704..1c4603eec3 100644 --- a/rollout/trafficrouting.go +++ b/rollout/trafficrouting.go @@ -102,6 +102,8 @@ func (c *Controller) NewTrafficRoutingReconciler(roCtx *rolloutContext) ([]traff return nil, nil } +//var count int = 0 + func (c *rolloutContext) reconcileTrafficRouting() error { reconcilers, err := c.newTrafficRoutingReconciler(c) // a return here does ensure that all trafficReconcilers are healthy @@ -181,6 +183,11 @@ func (c *rolloutContext) reconcileTrafficRouting() error { atDesiredReplicaCount := replicasetutil.AtDesiredReplicaCountsForCanary(c.rollout, c.newRS, c.stableRS, c.otherRSs, nil) if !atDesiredReplicaCount && !c.rollout.Status.PromoteFull { // Use the previous weight since the new RS is not ready for a new weight + + //canaryPodsReady := replicasetutil.GetAvailableReplicaCountForReplicaSets(c.newRS) + // + // This logic seems flawed when we have a step that sets it to 100% and then pauses but when we go into + // the pause state, the canary becomes not ready for any reason. We should not be setting the weight to 100% for i := *index - 1; i >= 0; i-- { step := c.rollout.Spec.Strategy.Canary.Steps[i] if step.SetWeight != nil { @@ -220,6 +227,25 @@ func (c *rolloutContext) reconcileTrafficRouting() error { return err } + //REMOVE: this is test code to simulate a failure within the canary +//if *index == 8 && count < 10 { + // c.kubeclientset.CoreV1().Pods("smi").DeleteCollection(context.TODO(), metav1.DeleteOptions{}, metav1.ListOptions{ + // LabelSelector: "role=c", + // }) + // count++ +//} + +//If we are in the middle of a rollout we need to check that we are available on the newRS aka the canary +//before we start routing any traffic to it +if canaryHash != stableHash { + if !replicasetutil.IsReplicaSetAvailable(c.newRS) { + // We are in the middle of a rollout, but the newRS is not available yet so let's calculate a traffic router + // weight instead of using user supplied weight to avoid downtime + c.log.Infof("canary service %s is not ready to switch traffic from %s to %s, setting desiredWeight to %d", c.rollout.Spec.Strategy.Canary.CanaryService, stableHash, canaryHash, desiredWeight) + desiredWeight = (100 * c.newRS.Status.AvailableReplicas) / *c.rollout.Spec.Replicas + } + } + err = reconciler.SetWeight(desiredWeight, weightDestinations...) if err != nil { c.recorder.Warnf(c.rollout, record.EventOptions{EventReason: "TrafficRoutingError"}, err.Error()) diff --git a/test/e2e/functional_test.go b/test/e2e/functional_test.go index 8ca5a026ac..71907414e8 100644 --- a/test/e2e/functional_test.go +++ b/test/e2e/functional_test.go @@ -288,7 +288,7 @@ func (s *FunctionalSuite) TestRolloutPDBRestart() { s.Given(). 
HealthyRollout(` --- -apiVersion: policy/v1beta1 +apiVersion: policy/v1 kind: PodDisruptionBudget metadata: name: rollout-pdb-restart From fdf404ce8ccef0f444243635ddf798c5355b8039 Mon Sep 17 00:00:00 2001 From: zachaller Date: Tue, 29 Nov 2022 12:05:40 -0600 Subject: [PATCH 02/15] fix tests Signed-off-by: zachaller --- rollout/trafficrouting.go | 3 ++- rollout/trafficrouting_test.go | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/rollout/trafficrouting.go b/rollout/trafficrouting.go index 1c4603eec3..e2f45dad73 100644 --- a/rollout/trafficrouting.go +++ b/rollout/trafficrouting.go @@ -236,7 +236,8 @@ func (c *rolloutContext) reconcileTrafficRouting() error { //} //If we are in the middle of a rollout we need to check that we are available on the newRS aka the canary - //before we start routing any traffic to it + //before we start routing any traffic to it, this is not perfect because there is still time delay within each + //traffic router where something can go wrong like nodes being killed etc. if canaryHash != stableHash { if !replicasetutil.IsReplicaSetAvailable(c.newRS) { // We are in the middle of a rollout, but the newRS is not available yet so let's calculate a traffic router diff --git a/rollout/trafficrouting_test.go b/rollout/trafficrouting_test.go index 6c1e49f476..d1d81fdff0 100644 --- a/rollout/trafficrouting_test.go +++ b/rollout/trafficrouting_test.go @@ -446,7 +446,7 @@ func TestRolloutUseDynamicWeightOnPromoteFull(t *testing.T) { f.fakeTrafficRouting = newUnmockedFakeTrafficRoutingReconciler() f.fakeTrafficRouting.On("UpdateHash", mock.Anything, mock.Anything, mock.Anything).Return(nil) f.fakeTrafficRouting.On("SetWeight", mock.Anything, mock.Anything).Return(func(desiredWeight int32, additionalDestinations ...v1alpha1.WeightDestination) error { - assert.Equal(t, int32(5), desiredWeight) + assert.Equal(t, int32(50), desiredWeight) //This is 50% here because only 50% of the pods in the canary are available return nil }) f.fakeTrafficRouting.On("SetHeaderRoute", mock.Anything, mock.Anything).Return(nil) From 14481d214f7d7f062a355f66693c4bd0520c1447 Mon Sep 17 00:00:00 2001 From: zachaller Date: Wed, 30 Nov 2022 08:45:37 -0600 Subject: [PATCH 03/15] try bailing vs setting weight Signed-off-by: zachaller --- rollout/trafficrouting.go | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/rollout/trafficrouting.go b/rollout/trafficrouting.go index e2f45dad73..ff1f798640 100644 --- a/rollout/trafficrouting.go +++ b/rollout/trafficrouting.go @@ -228,7 +228,7 @@ func (c *rolloutContext) reconcileTrafficRouting() error { return err } //REMOVE: this is test code to simulate a failure within the canary - //if *index == 8 && count < 10 { + //if *index == 6 && count < 15 { // c.kubeclientset.CoreV1().Pods("smi").DeleteCollection(context.TODO(), metav1.DeleteOptions{}, metav1.ListOptions{ // LabelSelector: "role=c", // }) // count++ //} //If we are in the middle of a rollout we need to check that we are available on the newRS aka the canary //before we start routing any traffic to it, this is not perfect because there is still time delay within each //traffic router where something can go wrong like nodes being killed etc. 
if canaryHash != stableHash { if !replicasetutil.IsReplicaSetAvailable(c.newRS) { // We are in the middle of a rollout, but the newRS is not available yet so let's calculate a traffic router // weight instead of using user supplied weight to avoid downtime - c.log.Infof("canary service %s is not ready to switch traffic from %s to %s, setting desiredWeight to %d", c.rollout.Spec.Strategy.Canary.CanaryService, stableHash, canaryHash, desiredWeight) - desiredWeight = (100 * c.newRS.Status.AvailableReplicas) / *c.rollout.Spec.Replicas + //c.log.Infof("canary service %s is not ready to switch traffic from %s to %s, setting desiredWeight to %d", c.rollout.Spec.Strategy.Canary.CanaryService, stableHash, canaryHash, desiredWeight) + //desiredWeight = (100 * c.newRS.Status.AvailableReplicas) / *c.rollout.Spec.Replicas + c.log.Infof("canary service %s is not ready to switch traffic from %s to %s, delay setting desiredWeight to %d", c.rollout.Spec.Strategy.Canary.CanaryService, stableHash, canaryHash, desiredWeight) + return nil } } From 5d64b747e4f6615edc001ccfab661d806bd240f8 Mon Sep 17 00:00:00 2001 From: zachaller Date: Wed, 30 Nov 2022 13:28:07 -0600 Subject: [PATCH 04/15] work with experiments that do not set any weights Signed-off-by: zachaller --- rollout/service.go | 23 +++++++++---- rollout/trafficrouting.go | 32 +++++++++++-------- .../rollout-alb-experiment-no-setweight.yaml | 1 + test/e2e/aws_test.go | 2 +- 4 files changed, 37 insertions(+), 21 deletions(-) diff --git a/rollout/service.go b/rollout/service.go index a2861be0c2..4ee278861f 100644 --- a/rollout/service.go +++ b/rollout/service.go @@ -4,12 +4,6 @@ import ( "context" "fmt" - appsv1 "k8s.io/api/apps/v1" - corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - patchtypes "k8s.io/apimachinery/pkg/types" - "k8s.io/utils/pointer" - "github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1" "github.com/argoproj/argo-rollouts/rollout/trafficrouting" "github.com/argoproj/argo-rollouts/utils/annotations" @@ -21,6 +15,11 @@ import ( replicasetutil "github.com/argoproj/argo-rollouts/utils/replicaset" rolloututils "github.com/argoproj/argo-rollouts/utils/rollout" serviceutil "github.com/argoproj/argo-rollouts/utils/service" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + patchtypes "k8s.io/apimachinery/pkg/types" + "k8s.io/utils/pointer" ) const ( @@ -68,6 +67,7 @@ func (c rolloutContext) switchServiceSelector(service *corev1.Service, newRollou if err != nil { return err } + fmt.Println("WE HAVE SWITCHED SERVICE SELECTORS") msg := fmt.Sprintf("Switched selector for service '%s' from '%s' to '%s'", service.Name, oldPodHash, newRolloutUniqueLabelValue) c.recorder.Eventf(r, record.EventOptions{EventReason: "SwitchService"}, msg) service.Spec.Selector[v1alpha1.DefaultRolloutUniqueLabelKey] = newRolloutUniqueLabelValue @@ -250,10 +250,21 @@ func (c *rolloutContext) reconcilePingAndPongService() error { return nil } +var countS1 int = 0 + func (c *rolloutContext) reconcileStableAndCanaryService() error { if c.rollout.Spec.Strategy.Canary == nil { return nil } + + if countS1 < 15 && *c.rollout.Status.CurrentStepIndex >= int32(len(c.rollout.Spec.Strategy.Canary.Steps)) { + //c.kubeclientset.CoreV1().Pods("smi").DeleteCollection(context.TODO(), metav1.DeleteOptions{}, metav1.ListOptions{ + // LabelSelector: "role=c", + //}) + //time.Sleep(1 * time.Second) + countS1++ + } + err := c.ensureSVCTargets(c.rollout.Spec.Strategy.Canary.StableService, c.stableRS, 
true) if err != nil { return err diff --git a/rollout/trafficrouting.go b/rollout/trafficrouting.go index ff1f798640..b087840a4c 100644 --- a/rollout/trafficrouting.go +++ b/rollout/trafficrouting.go @@ -238,21 +238,25 @@ func (c *rolloutContext) reconcileTrafficRouting() error { //If we are in the middle of a rollout we need to check that we are available on the newRS aka the canary //before we start routing any traffic to it, this is not perfect because there is still time delay within each //traffic router where something can go wrong like nodes being killed etc. - if canaryHash != stableHash { - if !replicasetutil.IsReplicaSetAvailable(c.newRS) { - // We are in the middle of a rollout, but the newRS is not available yet so let's calculate a traffic router - // weight instead of using user supplied weight to avoid downtime - //c.log.Infof("canary service %s is not ready to switch traffic from %s to %s, setting desiredWeight to %d", c.rollout.Spec.Strategy.Canary.CanaryService, stableHash, canaryHash, desiredWeight) - //desiredWeight = (100 * c.newRS.Status.AvailableReplicas) / *c.rollout.Spec.Replicas - c.log.Infof("canary service %s is not ready to switch traffic from %s to %s, delay setting desiredWeight to %d", c.rollout.Spec.Strategy.Canary.CanaryService, stableHash, canaryHash, desiredWeight) - return nil + if !replicasetutil.IsReplicaSetAvailable(c.newRS) { + c.log.Infof("canary service %s is not ready to switch traffic from %s to %s, delay setting trafficrouter desiredWeight to %d", c.rollout.Spec.Strategy.Canary.CanaryService, stableHash, canaryHash, desiredWeight) + if len(weightDestinations) > 0 { + fmt.Printf("WE SET WEIGHT TO EXPERIMENTS: %d\n", desiredWeight) + desiredWeight = (100 * c.newRS.Status.AvailableReplicas) / *c.rollout.Spec.Replicas + c.log.Infof("rollout has experiments and the canary replicaset is not ready, calculating canary weight as %d", desiredWeight) + err = reconciler.SetWeight(desiredWeight, weightDestinations...) + if err != nil { + c.recorder.Warnf(c.rollout, record.EventOptions{EventReason: "TrafficRoutingError"}, err.Error()) + return err + } + } + } else { + fmt.Printf("WE SET WEIGHT TO HAPPY PATH: %d\n", desiredWeight) + err = reconciler.SetWeight(desiredWeight, weightDestinations...) + if err != nil { + c.recorder.Warnf(c.rollout, record.EventOptions{EventReason: "TrafficRoutingError"}, err.Error()) + return err } - } - - err = reconciler.SetWeight(desiredWeight, weightDestinations...) - if err != nil { - c.recorder.Warnf(c.rollout, record.EventOptions{EventReason: "TrafficRoutingError"}, err.Error()) - return err } if modified, newWeights := calculateWeightStatus(c.rollout, canaryHash, stableHash, desiredWeight, weightDestinations...); modified { diff --git a/test/e2e/alb/rollout-alb-experiment-no-setweight.yaml b/test/e2e/alb/rollout-alb-experiment-no-setweight.yaml index 544da2799d..bca3aac5e9 100644 --- a/test/e2e/alb/rollout-alb-experiment-no-setweight.yaml +++ b/test/e2e/alb/rollout-alb-experiment-no-setweight.yaml @@ -93,6 +93,7 @@ spec: servicePort: 80 steps: - experiment: + duration: 15s templates: - name: experiment-alb-canary specRef: canary diff --git a/test/e2e/aws_test.go b/test/e2e/aws_test.go index a2a4434781..1265095797 100644 --- a/test/e2e/aws_test.go +++ b/test/e2e/aws_test.go @@ -168,7 +168,7 @@ func (s *AWSSuite) TestALBExperimentStepNoSetWeight() { When(). PromoteRollout(). WaitForRolloutStatus("Healthy"). - Sleep(1 * time.Second). 
// stable is currently set first, and then changes made to VirtualServices/DestinationRules + Sleep(2 * time.Second). // stable is currently set first, and then changes made to VirtualServices/DestinationRules Then(). Assert(assertWeights(s, "alb-rollout-canary", "alb-rollout-stable", 0, 100)) } From 18264b9f66f0a407c7318fb893a2006353ce25a3 Mon Sep 17 00:00:00 2001 From: zachaller Date: Wed, 30 Nov 2022 13:58:50 -0600 Subject: [PATCH 05/15] fix test by commenting out code Signed-off-by: zachaller --- rollout/service.go | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/rollout/service.go b/rollout/service.go index 4ee278861f..7f72d06054 100644 --- a/rollout/service.go +++ b/rollout/service.go @@ -3,7 +3,6 @@ package rollout import ( "context" "fmt" - "github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1" "github.com/argoproj/argo-rollouts/rollout/trafficrouting" "github.com/argoproj/argo-rollouts/utils/annotations" @@ -250,20 +249,20 @@ func (c *rolloutContext) reconcilePingAndPongService() error { return nil } -var countS1 int = 0 +//var countS1 int = 0 func (c *rolloutContext) reconcileStableAndCanaryService() error { if c.rollout.Spec.Strategy.Canary == nil { return nil } - if countS1 < 15 && *c.rollout.Status.CurrentStepIndex >= int32(len(c.rollout.Spec.Strategy.Canary.Steps)) { - //c.kubeclientset.CoreV1().Pods("smi").DeleteCollection(context.TODO(), metav1.DeleteOptions{}, metav1.ListOptions{ - // LabelSelector: "role=c", - //}) - //time.Sleep(1 * time.Second) - countS1++ - } + //if countS1 < 15 && *c.rollout.Status.CurrentStepIndex >= int32(len(c.rollout.Spec.Strategy.Canary.Steps)) { + // c.kubeclientset.CoreV1().Pods("smi").DeleteCollection(context.TODO(), metav1.DeleteOptions{}, metav1.ListOptions{ + // LabelSelector: "role=c", + // }) + // time.Sleep(1 * time.Second) + // countS1++ + //} err := c.ensureSVCTargets(c.rollout.Spec.Strategy.Canary.StableService, c.stableRS, true) if err != nil { From c13ea8f235ebd3cca7eb7fc3d074700f5b9cb669 Mon Sep 17 00:00:00 2001 From: zachaller Date: Wed, 30 Nov 2022 14:02:36 -0600 Subject: [PATCH 06/15] lint Signed-off-by: zachaller --- rollout/service.go | 1 + 1 file changed, 1 insertion(+) diff --git a/rollout/service.go b/rollout/service.go index 7f72d06054..ae6c9b91a5 100644 --- a/rollout/service.go +++ b/rollout/service.go @@ -3,6 +3,7 @@ package rollout import ( "context" "fmt" + "github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1" "github.com/argoproj/argo-rollouts/rollout/trafficrouting" "github.com/argoproj/argo-rollouts/utils/annotations" From d963b3438d1866dc3030d7de81cf347f6ce44c1a Mon Sep 17 00:00:00 2001 From: zachaller Date: Wed, 30 Nov 2022 14:38:31 -0600 Subject: [PATCH 07/15] simplify logic Signed-off-by: zachaller --- rollout/trafficrouting.go | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/rollout/trafficrouting.go b/rollout/trafficrouting.go index b087840a4c..31affcd643 100644 --- a/rollout/trafficrouting.go +++ b/rollout/trafficrouting.go @@ -238,18 +238,17 @@ func (c *rolloutContext) reconcileTrafficRouting() error { //If we are in the middle of a rollout we need to check that we are available on the newRS aka the canary //before we start routing any traffic to it, this is not perfect because there is still time delay within each //traffic router where something can go wrong like nodes being killed etc. 
- if !replicasetutil.IsReplicaSetAvailable(c.newRS) { + if !replicasetutil.IsReplicaSetAvailable(c.newRS) && len(weightDestinations) == 0 { c.log.Infof("canary service %s is not ready to switch traffic from %s to %s, delay setting trafficrouter desiredWeight to %d", c.rollout.Spec.Strategy.Canary.CanaryService, stableHash, canaryHash, desiredWeight) - if len(weightDestinations) > 0 { - fmt.Printf("WE SET WEIGHT TO EXPERIMENTS: %d\n", desiredWeight) - desiredWeight = (100 * c.newRS.Status.AvailableReplicas) / *c.rollout.Spec.Replicas - c.log.Infof("rollout has experiments and the canary replicaset is not ready, calculating canary weight as %d", desiredWeight) - err = reconciler.SetWeight(desiredWeight, weightDestinations...) - if err != nil { - c.recorder.Warnf(c.rollout, record.EventOptions{EventReason: "TrafficRoutingError"}, err.Error()) - return err - } - } + //if len(weightDestinations) > 0 { + // fmt.Printf("WE SET WEIGHT TO EXPERIMENTS: %d\n", desiredWeight) + // c.log.Infof("rollout has experiments and the canary replicaset is not ready, calculating canary weight as %d", desiredWeight) + // err = reconciler.SetWeight(desiredWeight, weightDestinations...) + // if err != nil { + // c.recorder.Warnf(c.rollout, record.EventOptions{EventReason: "TrafficRoutingError"}, err.Error()) + // return err + // } + //} } else { fmt.Printf("WE SET WEIGHT TO HAPPY PATH: %d\n", desiredWeight) err = reconciler.SetWeight(desiredWeight, weightDestinations...) if err != nil { c.recorder.Warnf(c.rollout, record.EventOptions{EventReason: "TrafficRoutingError"}, err.Error()) return err } From 9a725ed9e20c0e221da3e685db9392b6db3bf657 Mon Sep 17 00:00:00 2001 From: zachaller Date: Wed, 30 Nov 2022 17:24:05 -0600 Subject: [PATCH 08/15] switch logic Signed-off-by: zachaller --- rollout/service.go | 36 +++++++++++++++++++------------- rollout/trafficrouting.go | 38 ++++------------------------------ rollout/trafficrouting_test.go | 2 +- utils/replicaset/replicaset.go | 5 +++++ 4 files changed, 32 insertions(+), 49 deletions(-) diff --git a/rollout/service.go b/rollout/service.go index ae6c9b91a5..c341f3e996 100644 --- a/rollout/service.go +++ b/rollout/service.go @@ -67,7 +67,6 @@ func (c rolloutContext) switchServiceSelector(service *corev1.Service, newRollou if err != nil { return err } - fmt.Println("WE HAVE SWITCHED SERVICE SELECTORS") msg := fmt.Sprintf("Switched selector for service '%s' from '%s' to '%s'", service.Name, oldPodHash, newRolloutUniqueLabelValue) c.recorder.Eventf(r, record.EventOptions{EventReason: "SwitchService"}, msg) service.Spec.Selector[v1alpha1.DefaultRolloutUniqueLabelKey] = newRolloutUniqueLabelValue @@ -250,21 +249,10 @@ func (c *rolloutContext) reconcilePingAndPongService() error { return nil } -var countS1 int = 0 - func (c *rolloutContext) reconcileStableAndCanaryService() error { if c.rollout.Spec.Strategy.Canary == nil { return nil } - - //if countS1 < 15 && *c.rollout.Status.CurrentStepIndex >= int32(len(c.rollout.Spec.Strategy.Canary.Steps)) { - // c.kubeclientset.CoreV1().Pods("smi").DeleteCollection(context.TODO(), metav1.DeleteOptions{}, metav1.ListOptions{ - // LabelSelector: "role=c", - // }) - // time.Sleep(1 * time.Second) - // countS1++ - //} - err := c.ensureSVCTargets(c.rollout.Spec.Strategy.Canary.StableService, c.stableRS, true) if err != nil { return err @@ -288,13 +276,33 @@ func (c *rolloutContext) ensureSVCTargets(svcName string, rs *appsv1.ReplicaSet, } currSelector := svc.Spec.Selector[v1alpha1.DefaultRolloutUniqueLabelKey] desiredSelector := rs.Labels[v1alpha1.DefaultRolloutUniqueLabelKey] - if currSelector != desiredSelector { + logCtx := c.log.WithField(logutil.ServiceKey, 
svc.Name) + + if _, ok := svc.Annotations[v1alpha1.ManagedByRolloutsKey]; !ok && currSelector != desiredSelector { + // This if block will be entered only when adopting a service that already exists, because the current selector + // will be empty at that point. When we are adopting a service, we want to make sure that the replicaset is fully + // available before we start routing traffic to it, so we do not overload it. + // See PR: https://github.com/argoproj/argo-rollouts/pull/1777 + // ensure ReplicaSet is fully available, otherwise we will point the service to nothing or an underprovisioned ReplicaSet if checkRsAvailability && !replicasetutil.IsReplicaSetAvailable(rs) { - logCtx := c.log.WithField(logutil.ServiceKey, svc.Name) logCtx.Infof("delaying service switch from %s to %s: ReplicaSet not fully available", currSelector, desiredSelector) return nil } + logCtx.Infof("adopting service %s", svc.Name) + err = c.switchServiceSelector(svc, desiredSelector, c.rollout) + if err != nil { + return err + } + } else if currSelector != desiredSelector { + // This if block will be called only at the end of a rollout, when we are at the end we generally will have enough + // capacity to handle the traffic, so we do not need to check the full availability of the ReplicaSet. We do still + // want to make sure we have at least one pod available, so we do not point the service to nothing. + + if checkRsAvailability && !replicasetutil.IsReplicaSetPartiallyAvailable(rs) { + logCtx.Infof("delaying service switch from %s to %s: ReplicaSet has zero availability", currSelector, desiredSelector) + return nil + } err = c.switchServiceSelector(svc, desiredSelector, c.rollout) if err != nil { return err diff --git a/rollout/trafficrouting.go b/rollout/trafficrouting.go index 31affcd643..58c8efd5c4 100644 --- a/rollout/trafficrouting.go +++ b/rollout/trafficrouting.go @@ -183,11 +183,6 @@ func (c *rolloutContext) reconcileTrafficRouting() error { atDesiredReplicaCount := replicasetutil.AtDesiredReplicaCountsForCanary(c.rollout, c.newRS, c.stableRS, c.otherRSs, nil) if !atDesiredReplicaCount && !c.rollout.Status.PromoteFull { // Use the previous weight since the new RS is not ready for a new weight - - //canaryPodsReady := replicasetutil.GetAvailableReplicaCountForReplicaSets(c.newRS) - // - // This logic seems flawed when we have a step that sets it to 100% and then pauses but when we go into - // the pause state, the canary becomes not ready for any reason. We should not be setting the weight to 100% for i := *index - 1; i >= 0; i-- { step := c.rollout.Spec.Strategy.Canary.Steps[i] if step.SetWeight != nil { @@ -227,35 +222,10 @@ func (c *rolloutContext) reconcileTrafficRouting() error { return err } - //REMOVE: this is test code to simulate a failure within the canary - //if *index == 6 && count < 15 { - // c.kubeclientset.CoreV1().Pods("smi").DeleteCollection(context.TODO(), metav1.DeleteOptions{}, metav1.ListOptions{ - // LabelSelector: "role=c", - // }) - // count++ - //} - - //If we are in the middle of a rollout we need to check that we are available on the newRS aka the canary - //before we start routing any traffic to it, this is not perfect because there is still time delay within each - //traffic router where something can go wrong like nodes being killed etc. 
- if !replicasetutil.IsReplicaSetAvailable(c.newRS) && len(weightDestinations) == 0 { - c.log.Infof("canary service %s is not ready to switch traffic from %s to %s, delay setting trafficrouter desiredWeight to %d", c.rollout.Spec.Strategy.Canary.CanaryService, stableHash, canaryHash, desiredWeight) - //if len(weightDestinations) > 0 { - // fmt.Printf("WE SET WEIGHT TO EXPERIMENTS: %d\n", desiredWeight) - // c.log.Infof("rollout has experiments and the canary replicaset is not ready, calculating canary weight as %d", desiredWeight) - // err = reconciler.SetWeight(desiredWeight, weightDestinations...) - // if err != nil { - // c.recorder.Warnf(c.rollout, record.EventOptions{EventReason: "TrafficRoutingError"}, err.Error()) - // return err - // } - //} - } else { - fmt.Printf("WE SET WEIGHT TO HAPPY PATH: %d\n", desiredWeight) - err = reconciler.SetWeight(desiredWeight, weightDestinations...) - if err != nil { - c.recorder.Warnf(c.rollout, record.EventOptions{EventReason: "TrafficRoutingError"}, err.Error()) - return err - } + err = reconciler.SetWeight(desiredWeight, weightDestinations...) + if err != nil { + c.recorder.Warnf(c.rollout, record.EventOptions{EventReason: "TrafficRoutingError"}, err.Error()) + return err } if modified, newWeights := calculateWeightStatus(c.rollout, canaryHash, stableHash, desiredWeight, weightDestinations...); modified { diff --git a/rollout/trafficrouting_test.go b/rollout/trafficrouting_test.go index d1d81fdff0..6c1e49f476 100644 --- a/rollout/trafficrouting_test.go +++ b/rollout/trafficrouting_test.go @@ -446,7 +446,7 @@ func TestRolloutUseDynamicWeightOnPromoteFull(t *testing.T) { f.fakeTrafficRouting = newUnmockedFakeTrafficRoutingReconciler() f.fakeTrafficRouting.On("UpdateHash", mock.Anything, mock.Anything, mock.Anything).Return(nil) f.fakeTrafficRouting.On("SetWeight", mock.Anything, mock.Anything).Return(func(desiredWeight int32, additionalDestinations ...v1alpha1.WeightDestination) error { - assert.Equal(t, int32(50), desiredWeight) //This is 50% here because only 50% of the pods in the canary are available + assert.Equal(t, int32(5), desiredWeight) return nil }) f.fakeTrafficRouting.On("SetHeaderRoute", mock.Anything, mock.Anything).Return(nil) diff --git a/utils/replicaset/replicaset.go b/utils/replicaset/replicaset.go index 4a0b56b5a5..8554cdf94b 100644 --- a/utils/replicaset/replicaset.go +++ b/utils/replicaset/replicaset.go @@ -646,3 +646,8 @@ func IsReplicaSetAvailable(rs *appsv1.ReplicaSet) bool { availableReplicas := rs.Status.AvailableReplicas return replicas != nil && *replicas != 0 && availableReplicas != 0 && *replicas <= availableReplicas } + +// IsReplicaSetPartiallyAvailable returns if a ReplicaSet is scaled up and has at least 1 pod available +func IsReplicaSetPartiallyAvailable(rs *appsv1.ReplicaSet) bool { + return rs.Status.AvailableReplicas > 0 +} From 4aabe8ca40dcc022af11de0321aa3bbd9f8aaedc Mon Sep 17 00:00:00 2001 From: zachaller Date: Thu, 1 Dec 2022 08:38:59 -0600 Subject: [PATCH 09/15] add more comments Signed-off-by: zachaller --- rollout/service.go | 14 +++++--- rollout/service_test.go | 71 +++++++++++++++++++++++++++++++++++++++ rollout/trafficrouting.go | 2 -- 3 files changed, 80 insertions(+), 7 deletions(-) diff --git a/rollout/service.go b/rollout/service.go index c341f3e996..efec6da4b0 100644 --- a/rollout/service.go +++ b/rollout/service.go @@ -265,7 +265,10 @@ func (c *rolloutContext) reconcileStableAndCanaryService() error { } // ensureSVCTargets updates the service with the given name to point to the given 
ReplicaSet, -// but only if that ReplicaSet has full availability. +// but only if that ReplicaSet has full availability. There is still an edge case with this function if +// in the small window of time between a rollout being completed and we try to update the service selector, we lose 100% +// of the pods availability. We will still go and reconcile the traffic router, setting the stable weight to zero. This mainly +// affects dynamic stable scale. func (c *rolloutContext) ensureSVCTargets(svcName string, rs *appsv1.ReplicaSet, checkRsAvailability bool) error { if rs == nil || svcName == "" { return nil @@ -279,7 +282,7 @@ func (c *rolloutContext) ensureSVCTargets(svcName string, rs *appsv1.ReplicaSet, logCtx := c.log.WithField(logutil.ServiceKey, svc.Name) if _, ok := svc.Annotations[v1alpha1.ManagedByRolloutsKey]; !ok && currSelector != desiredSelector { - // This if block will be entered only when adopting a service that already exists, because the current selector + // This if block will be entered only when adopting a service that already exists, because the current annotation // will be empty at that point. When we are adopting a service, we want to make sure that the replicaset is fully // available before we start routing traffic to it, so we do not overload it. // See PR: https://github.com/argoproj/argo-rollouts/pull/1777 @@ -295,9 +298,10 @@ func (c *rolloutContext) ensureSVCTargets(svcName string, rs *appsv1.ReplicaSet, return err } } else if currSelector != desiredSelector { - // This if block will be called only at the end of a rollout, when we are at the end we generally will have enough - // capacity to handle the traffic, so we do not need to check the full availability of the ReplicaSet. We do still - // want to make sure we have at least one pod available, so we do not point the service to nothing. + // This if block will be called only at the beginning and end of a rollout that is adopted, when we are at the end of a rollout + // we generally will have enough capacity to handle the traffic, so we do not need to check the full availability of the + // ReplicaSet. We do still want to make sure we have at least one pod available, so we do not point the service to nothing, but + // losing a pod or two should be tolerable to still switch service selectors. 
if checkRsAvailability && !replicasetutil.IsReplicaSetPartiallyAvailable(rs) { logCtx.Infof("delaying service switch from %s to %s: ReplicaSet has zero availability", currSelector, desiredSelector) diff --git a/rollout/service_test.go b/rollout/service_test.go index 8a402d001c..c620e028a3 100644 --- a/rollout/service_test.go +++ b/rollout/service_test.go @@ -787,6 +787,22 @@ func TestDelayCanaryStableServiceLabelInjection(t *testing.T) { _, stableInjected := stableSvc.Spec.Selector[v1alpha1.DefaultRolloutUniqueLabelKey] assert.False(t, stableInjected) } + { + // ensure we don't update service because new/stable are both partially available on an adoption of service reconcile + ctrl, _, _ := f.newController(noResyncPeriodFunc) + roCtx, err := ctrl.newRolloutContext(ro1) + assert.NoError(t, err) + + roCtx.newRS = newReplicaSetWithStatus(ro1, 3, 1) + roCtx.stableRS = newReplicaSetWithStatus(ro2, 3, 1) + + err = roCtx.reconcileStableAndCanaryService() + assert.NoError(t, err) + _, canaryInjected := canarySvc.Spec.Selector[v1alpha1.DefaultRolloutUniqueLabelKey] + assert.False(t, canaryInjected) + _, stableInjected := stableSvc.Spec.Selector[v1alpha1.DefaultRolloutUniqueLabelKey] + assert.False(t, stableInjected) + } { // next ensure we do update service because new/stable are now available ctrl, _, _ := f.newController(noResyncPeriodFunc) @@ -805,3 +821,58 @@ func TestDelayCanaryStableServiceLabelInjection(t *testing.T) { } } + +// TestDelayCanaryStableServiceDelayOnAdoptedService verifies allow partial readiness of pods when switching labels +// on an adopted services, but that if there is zero readiness we will not switch +func TestDelayCanaryStableServiceDelayOnAdoptedService(t *testing.T) { + ro1 := newCanaryRollout("foo", 3, nil, nil, nil, intstr.FromInt(1), intstr.FromInt(1)) + ro1.Spec.Strategy.Canary.CanaryService = "canary" + ro1.Spec.Strategy.Canary.StableService = "stable" + //Setup services that are already adopted by rollouts + stableSvc := newService("stable", 80, ro1.Spec.Selector.MatchLabels, ro1) + ro2 := bumpVersion(ro1) + canarySvc := newService("canary", 80, ro1.Spec.Selector.MatchLabels, ro2) + + f := newFixture(t) + defer f.Close() + f.kubeobjects = append(f.kubeobjects, canarySvc, stableSvc) + f.serviceLister = append(f.serviceLister, canarySvc, stableSvc) + + { + // first ensure we don't update service because new/stable are both not available + ctrl, _, _ := f.newController(noResyncPeriodFunc) + roCtx, err := ctrl.newRolloutContext(ro1) + assert.NoError(t, err) + + roCtx.newRS = newReplicaSetWithStatus(ro1, 3, 0) + roCtx.stableRS = newReplicaSetWithStatus(ro2, 3, 0) + + err = roCtx.reconcileStableAndCanaryService() + assert.NoError(t, err) + canaryHash2, canaryInjected := canarySvc.Spec.Selector[v1alpha1.DefaultRolloutUniqueLabelKey] + assert.False(t, canaryInjected) + fmt.Println(canaryHash2) + stableHash2, stableInjected := stableSvc.Spec.Selector[v1alpha1.DefaultRolloutUniqueLabelKey] + assert.False(t, stableInjected) + fmt.Println(stableHash2) + } + { + // first ensure we don't update service because new/stable are both not available + ctrl, _, _ := f.newController(noResyncPeriodFunc) + roCtx, err := ctrl.newRolloutContext(ro1) + assert.NoError(t, err) + + roCtx.newRS = newReplicaSetWithStatus(ro1, 3, 1) + roCtx.stableRS = newReplicaSetWithStatus(ro2, 3, 2) + + err = roCtx.reconcileStableAndCanaryService() + assert.NoError(t, err) + canaryHash2, canaryInjected := canarySvc.Spec.Selector[v1alpha1.DefaultRolloutUniqueLabelKey] + assert.True(t, canaryInjected) + 
fmt.Println(canaryHash2) + stableHash2, stableInjected := stableSvc.Spec.Selector[v1alpha1.DefaultRolloutUniqueLabelKey] + assert.True(t, stableInjected) + fmt.Println(stableHash2) + } + +} diff --git a/rollout/trafficrouting.go b/rollout/trafficrouting.go index 58c8efd5c4..ebc22b9704 100644 --- a/rollout/trafficrouting.go +++ b/rollout/trafficrouting.go @@ -102,8 +102,6 @@ func (c *Controller) NewTrafficRoutingReconciler(roCtx *rolloutContext) ([]traff return nil, nil } -//var count int = 0 - func (c *rolloutContext) reconcileTrafficRouting() error { reconcilers, err := c.newTrafficRoutingReconciler(c) // a return here does ensure that all trafficReconcilers are healthy From 7df9ff66c270905e3abb08ba357d396e1c1e3ffc Mon Sep 17 00:00:00 2001 From: zachaller Date: Thu, 1 Dec 2022 08:40:53 -0600 Subject: [PATCH 10/15] add more comments Signed-off-by: zachaller --- rollout/service.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/rollout/service.go b/rollout/service.go index efec6da4b0..fbc11912c5 100644 --- a/rollout/service.go +++ b/rollout/service.go @@ -265,10 +265,10 @@ func (c *rolloutContext) reconcileStableAndCanaryService() error { } // ensureSVCTargets updates the service with the given name to point to the given ReplicaSet, -// but only if that ReplicaSet has full availability. There is still an edge case with this function if -// in the small window of time between a rollout being completed and we try to update the service selector, we lose 100% -// of the pods availability. We will still go and reconcile the traffic router, setting the stable weight to zero. This mainly -// affects dynamic stable scale. +// but only if that ReplicaSet has proper availability. There is still an edge case with this function if +// in the small window of time between a rollout being completed, and we try to update the service selector, we lose 100% +// of the pods availability. We will not switch service selector but still go and reconcile the traffic router, setting the +// stable weight to zero. This really only affects dynamic stable scale. 
func (c *rolloutContext) ensureSVCTargets(svcName string, rs *appsv1.ReplicaSet, checkRsAvailability bool) error { if rs == nil || svcName == "" { return nil From 516adc034aa53ea2236963ba45d54bb587abc3a8 Mon Sep 17 00:00:00 2001 From: zachaller Date: Thu, 1 Dec 2022 09:14:15 -0600 Subject: [PATCH 11/15] add more test Signed-off-by: zachaller --- utils/replicaset/replicaset_test.go | 39 +++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/utils/replicaset/replicaset_test.go b/utils/replicaset/replicaset_test.go index f1a2a80fd7..9b597f87e1 100644 --- a/utils/replicaset/replicaset_test.go +++ b/utils/replicaset/replicaset_test.go @@ -1336,3 +1336,42 @@ func TestIsReplicaSetAvailable(t *testing.T) { assert.False(t, IsReplicaSetAvailable(&rs)) } } + +func TestIsReplicaSetPartiallyAvailable(t *testing.T) { + { + rs := appsv1.ReplicaSet{ + Spec: appsv1.ReplicaSetSpec{ + Replicas: pointer.Int32Ptr(2), + }, + Status: appsv1.ReplicaSetStatus{ + ReadyReplicas: 0, + AvailableReplicas: 0, + }, + } + assert.False(t, IsReplicaSetPartiallyAvailable(&rs)) + } + { + rs := appsv1.ReplicaSet{ + Spec: appsv1.ReplicaSetSpec{ + Replicas: pointer.Int32Ptr(2), + }, + Status: appsv1.ReplicaSetStatus{ + ReadyReplicas: 2, + AvailableReplicas: 1, + }, + } + assert.True(t, IsReplicaSetPartiallyAvailable(&rs)) + } + { + rs := appsv1.ReplicaSet{ + Spec: appsv1.ReplicaSetSpec{ + Replicas: pointer.Int32Ptr(2), + }, + Status: appsv1.ReplicaSetStatus{ + ReadyReplicas: 2, + AvailableReplicas: 2, + }, + } + assert.True(t, IsReplicaSetPartiallyAvailable(&rs)) + } +} From 3e73aae2c103e0443ee024c2ef95aac8ed5aa0f5 Mon Sep 17 00:00:00 2001 From: zachaller Date: Thu, 1 Dec 2022 09:33:31 -0600 Subject: [PATCH 12/15] refactor test Signed-off-by: zachaller --- rollout/service_test.go | 10 +++++----- utils/replicaset/replicaset_test.go | 12 ++++++------ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/rollout/service_test.go b/rollout/service_test.go index c620e028a3..393faf87a0 100644 --- a/rollout/service_test.go +++ b/rollout/service_test.go @@ -838,7 +838,7 @@ func TestDelayCanaryStableServiceDelayOnAdoptedService(t *testing.T) { f.kubeobjects = append(f.kubeobjects, canarySvc, stableSvc) f.serviceLister = append(f.serviceLister, canarySvc, stableSvc) - { + t.Run("AdoptedService No Availability", func(t *testing.T) { // first ensure we don't update service because new/stable are both not available ctrl, _, _ := f.newController(noResyncPeriodFunc) roCtx, err := ctrl.newRolloutContext(ro1) @@ -855,9 +855,9 @@ func TestDelayCanaryStableServiceDelayOnAdoptedService(t *testing.T) { stableHash2, stableInjected := stableSvc.Spec.Selector[v1alpha1.DefaultRolloutUniqueLabelKey] assert.False(t, stableInjected) fmt.Println(stableHash2) - } - { - // first ensure we don't update service because new/stable are both not available + }) + t.Run("AdoptedService Partial Availability", func(t *testing.T) { + // ensure we do change selector on partially available replica sets ctrl, _, _ := f.newController(noResyncPeriodFunc) roCtx, err := ctrl.newRolloutContext(ro1) assert.NoError(t, err) @@ -873,6 +873,6 @@ func TestDelayCanaryStableServiceDelayOnAdoptedService(t *testing.T) { stableHash2, stableInjected := stableSvc.Spec.Selector[v1alpha1.DefaultRolloutUniqueLabelKey] assert.True(t, stableInjected) fmt.Println(stableHash2) - } + }) } diff --git a/utils/replicaset/replicaset_test.go b/utils/replicaset/replicaset_test.go index 9b597f87e1..5a7c7ff8c1 100644 --- a/utils/replicaset/replicaset_test.go +++ 
b/utils/replicaset/replicaset_test.go @@ -1338,7 +1338,7 @@ func TestIsReplicaSetAvailable(t *testing.T) { } func TestIsReplicaSetPartiallyAvailable(t *testing.T) { - { + t.Run("No Availability", func(t *testing.T) { rs := appsv1.ReplicaSet{ Spec: appsv1.ReplicaSetSpec{ Replicas: pointer.Int32Ptr(2), @@ -1349,8 +1349,8 @@ func TestIsReplicaSetPartiallyAvailable(t *testing.T) { }, } assert.False(t, IsReplicaSetPartiallyAvailable(&rs)) - } - { + }) + t.Run("Partial Availability", func(t *testing.T) { rs := appsv1.ReplicaSet{ Spec: appsv1.ReplicaSetSpec{ Replicas: pointer.Int32Ptr(2), @@ -1361,8 +1361,8 @@ func TestIsReplicaSetPartiallyAvailable(t *testing.T) { }, } assert.True(t, IsReplicaSetPartiallyAvailable(&rs)) - } - { + }) + t.Run("Full Availability", func(t *testing.T) { rs := appsv1.ReplicaSet{ Spec: appsv1.ReplicaSetSpec{ Replicas: pointer.Int32Ptr(2), @@ -1373,5 +1373,5 @@ func TestIsReplicaSetPartiallyAvailable(t *testing.T) { }, } assert.True(t, IsReplicaSetPartiallyAvailable(&rs)) - } + }) } From e04123e2253ebdb7c3793a37f0a7913969596987 Mon Sep 17 00:00:00 2001 From: zachaller Date: Mon, 5 Dec 2022 09:51:56 -0600 Subject: [PATCH 13/15] refactor code to reduce duplication Signed-off-by: zachaller --- rollout/service.go | 45 ++++++++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/rollout/service.go b/rollout/service.go index fbc11912c5..3bd9eef516 100644 --- a/rollout/service.go +++ b/rollout/service.go @@ -281,32 +281,31 @@ func (c *rolloutContext) ensureSVCTargets(svcName string, rs *appsv1.ReplicaSet, desiredSelector := rs.Labels[v1alpha1.DefaultRolloutUniqueLabelKey] logCtx := c.log.WithField(logutil.ServiceKey, svc.Name) - if _, ok := svc.Annotations[v1alpha1.ManagedByRolloutsKey]; !ok && currSelector != desiredSelector { - // This if block will be entered only when adopting a service that already exists, because the current annotation - // will be empty at that point. When we are adopting a service, we want to make sure that the replicaset is fully - // available before we start routing traffic to it, so we do not overload it. - // See PR: https://github.com/argoproj/argo-rollouts/pull/1777 + if currSelector != desiredSelector { + if _, ok := svc.Annotations[v1alpha1.ManagedByRolloutsKey]; !ok { + // This if block will be entered only when adopting a service that already exists, because the current annotation + // will be empty at that point. When we are adopting a service, we want to make sure that the replicaset is fully + // available before we start routing traffic to it, so we do not overload it. + // See PR: https://github.com/argoproj/argo-rollouts/pull/1777 - // ensure ReplicaSet is fully available, otherwise we will point the service to nothing or an underprovisioned ReplicaSet - if checkRsAvailability && !replicasetutil.IsReplicaSetAvailable(rs) { - logCtx.Infof("delaying service switch from %s to %s: ReplicaSet not fully available", currSelector, desiredSelector) - return nil - } - logCtx.Infof("adopting service %s", svc.Name) - err = c.switchServiceSelector(svc, desiredSelector, c.rollout) - if err != nil { - return err - } - } else if currSelector != desiredSelector { - // This if block will be called only at the beginning and end of a rollout that is adopted, when we are at the end of a rollout - // we generally will have enough capacity to handle the traffic, so we do not need to check the full availability of the - // ReplicaSet. 
We do still want to make sure we have at least one pod available, so we do not point the service to nothing, but - // losing a pod or two should be tolerable to still switch service selectors. + // ensure ReplicaSet is fully available, otherwise we will point the service to nothing or an underprovisioned ReplicaSet + if checkRsAvailability && !replicasetutil.IsReplicaSetAvailable(rs) { + logCtx.Infof("delaying service switch from %s to %s: ReplicaSet not fully available", currSelector, desiredSelector) + return nil + } + logCtx.Infof("adopting service %s", svc.Name) + } else { + // This if block will be called only at the beginning and end of a rollout that is adopted, when we are at the end of a rollout + // we generally will have enough capacity to handle the traffic, so we do not need to check the full availability of the + // ReplicaSet. We do still want to make sure we have at least one pod available, so we do not point the service to nothing, but + // losing a pod or two should be tolerable to still switch service selectors. - if checkRsAvailability && !replicasetutil.IsReplicaSetPartiallyAvailable(rs) { - logCtx.Infof("delaying service switch from %s to %s: ReplicaSet has zero availability", currSelector, desiredSelector) - return nil + if checkRsAvailability && !replicasetutil.IsReplicaSetPartiallyAvailable(rs) { + logCtx.Infof("delaying service switch from %s to %s: ReplicaSet has zero availability", currSelector, desiredSelector) + return nil + } } + err = c.switchServiceSelector(svc, desiredSelector, c.rollout) if err != nil { return err From 182bf79830c8f2dd168047efdcde6c11f24fc95b Mon Sep 17 00:00:00 2001 From: zachaller Date: Mon, 5 Dec 2022 09:58:05 -0600 Subject: [PATCH 14/15] change comments a bit Signed-off-by: zachaller --- rollout/service.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rollout/service.go b/rollout/service.go index 3bd9eef516..d61d190401 100644 --- a/rollout/service.go +++ b/rollout/service.go @@ -283,7 +283,7 @@ func (c *rolloutContext) ensureSVCTargets(svcName string, rs *appsv1.ReplicaSet, if currSelector != desiredSelector { if _, ok := svc.Annotations[v1alpha1.ManagedByRolloutsKey]; !ok { - // This if block will be entered only when adopting a service that already exists, because the current annotation + // This block will be entered only when adopting a service that already exists, because the current annotation // will be empty at that point. When we are adopting a service, we want to make sure that the replicaset is fully // available before we start routing traffic to it, so we do not overload it. // See PR: https://github.com/argoproj/argo-rollouts/pull/1777 @@ -295,7 +295,7 @@ func (c *rolloutContext) ensureSVCTargets(svcName string, rs *appsv1.ReplicaSet, } logCtx.Infof("adopting service %s", svc.Name) } else { - // This if block will be called only at the beginning and end of a rollout that is adopted, when we are at the end of a rollout + // This block will be entered only at the beginning and end of a rollout that is adopted, when we are at the end of a rollout // we generally will have enough capacity to handle the traffic, so we do not need to check the full availability of the // ReplicaSet. We do still want to make sure we have at least one pod available, so we do not point the service to nothing, but // losing a pod or two should be tolerable to still switch service selectors. 
From a621f058d4a970d076a6b3c7c7b9b628dc70cfe6 Mon Sep 17 00:00:00 2001 From: zachaller Date: Mon, 5 Dec 2022 10:58:41 -0600 Subject: [PATCH 15/15] remove else Signed-off-by: zachaller --- rollout/service.go | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/rollout/service.go b/rollout/service.go index d61d190401..de63f527b3 100644 --- a/rollout/service.go +++ b/rollout/service.go @@ -294,16 +294,14 @@ func (c *rolloutContext) ensureSVCTargets(svcName string, rs *appsv1.ReplicaSet, return nil } logCtx.Infof("adopting service %s", svc.Name) - } else { - // This block will be entered only at the beginning and end of a rollout that is adopted, when we are at the end of a rollout - // we generally will have enough capacity to handle the traffic, so we do not need to check the full availability of the - // ReplicaSet. We do still want to make sure we have at least one pod available, so we do not point the service to nothing, but - // losing a pod or two should be tolerable to still switch service selectors. + } - if checkRsAvailability && !replicasetutil.IsReplicaSetPartiallyAvailable(rs) { - logCtx.Infof("delaying service switch from %s to %s: ReplicaSet has zero availability", currSelector, desiredSelector) - return nil - } + // When we are at the end of a rollout we generally will have enough capacity to handle the traffic, so we do not + // need to check the full availability of the ReplicaSet. We do still want to make sure we have at least one pod + // available, so we do not point the service to nothing, but losing a pod or two should be tolerable to still switch service selectors. + if checkRsAvailability && !replicasetutil.IsReplicaSetPartiallyAvailable(rs) { + logCtx.Infof("delaying service switch from %s to %s: ReplicaSet has zero availability", currSelector, desiredSelector) + return nil } err = c.switchServiceSelector(svc, desiredSelector, c.rollout)
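
The notes below are editorial sketches summarizing where this series lands; they are not part of the committed patches. Patches 01-07 experiment with a traffic-router-side fallback: when the canary ReplicaSet is not fully available, derive the canary weight from actual pod availability rather than the user-supplied step weight. A minimal Go sketch of that formula follows; dynamicCanaryWeight is a hypothetical helper name, not a function in the codebase:

    package main

    import "fmt"

    // dynamicCanaryWeight mirrors the fallback formula from patches 01-07:
    // weight = (100 * availableReplicas) / desiredReplicas.
    func dynamicCanaryWeight(availableReplicas, desiredReplicas int32) int32 {
        if desiredReplicas == 0 {
            return 0 // guard: a rollout scaled to zero should get no canary traffic
        }
        return (100 * availableReplicas) / desiredReplicas
    }

    func main() {
        // Patch 02 changes TestRolloutUseDynamicWeightOnPromoteFull to expect 50
        // on the grounds that only 50% of the canary pods are available,
        // e.g. 5 available out of 10 desired:
        fmt.Println(dynamicCanaryWeight(5, 10)) // prints 50
    }

Patch 03 then tries bailing out before SetWeight instead of computing a weight, and patch 08 drops the traffic-router-side fallback entirely in favor of gating the Service selector switch.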
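
The approach the series actually ships (patches 08-15) moves the gate into ensureSVCTargets. Below is a condensed sketch of the decision logic as it stands after patch 15; shouldSwitchSelector is a hypothetical name, and the real code lives inline in rollout/service.go, where it also patches the Service and records a SwitchService event:

    package rollout

    import (
        appsv1 "k8s.io/api/apps/v1"
        corev1 "k8s.io/api/core/v1"

        "github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1"
        replicasetutil "github.com/argoproj/argo-rollouts/utils/replicaset"
    )

    // shouldSwitchSelector condenses the gating that ensureSVCTargets converges
    // on by patch 15: adopting an unmanaged Service requires full availability;
    // in every case at least one pod must be available before the selector moves.
    func shouldSwitchSelector(svc *corev1.Service, rs *appsv1.ReplicaSet, checkRsAvailability bool) bool {
        currSelector := svc.Spec.Selector[v1alpha1.DefaultRolloutUniqueLabelKey]
        desiredSelector := rs.Labels[v1alpha1.DefaultRolloutUniqueLabelKey]
        if currSelector == desiredSelector {
            return false // selector already points at the desired ReplicaSet
        }
        if _, managed := svc.Annotations[v1alpha1.ManagedByRolloutsKey]; !managed {
            // Adopting a pre-existing Service: require the ReplicaSet to be fully
            // available so the Service is never pointed at an underprovisioned one.
            if checkRsAvailability && !replicasetutil.IsReplicaSetAvailable(rs) {
                return false
            }
        }
        // Never point the Service at zero available pods; once the Service is
        // already managed, losing a pod or two is tolerated.
        return !checkRsAvailability || replicasetutil.IsReplicaSetPartiallyAvailable(rs)
    }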
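
For reference, the two availability predicates that gate relies on, as they stand in utils/replicaset/replicaset.go after patch 08. Lightly reconstructed: the replicas := rs.Spec.Replicas assignment in IsReplicaSetAvailable falls outside the quoted hunk and is inferred from the return statement:

    // IsReplicaSetAvailable returns if a ReplicaSet is scaled up and its
    // available replica count has reached the desired count.
    func IsReplicaSetAvailable(rs *appsv1.ReplicaSet) bool {
        replicas := rs.Spec.Replicas // inferred; this line is outside the quoted hunk
        availableReplicas := rs.Status.AvailableReplicas
        return replicas != nil && *replicas != 0 && availableReplicas != 0 && *replicas <= availableReplicas
    }

    // IsReplicaSetPartiallyAvailable returns if a ReplicaSet is scaled up and has at least 1 pod available
    func IsReplicaSetPartiallyAvailable(rs *appsv1.ReplicaSet) bool {
        return rs.Status.AvailableReplicas > 0
    }

The distinction matters for dynamic stable scale: patch 10's comment records the remaining edge case where availability collapses in the window between rollout completion and the selector update, in which case the selector stays put while the traffic router is still reconciled with a stable weight of zero.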