diff --git a/rollout/bluegreen.go b/rollout/bluegreen.go index 887e022e28..f1bcf7a7bb 100644 --- a/rollout/bluegreen.go +++ b/rollout/bluegreen.go @@ -220,10 +220,9 @@ func (c *rolloutContext) scaleDownOldReplicaSetsForBlueGreen(oldRSs []*appsv1.Re annotationedRSs := int32(0) rolloutReplicas := defaults.GetReplicasOrDefault(c.rollout.Spec.Replicas) for _, targetRS := range oldRSs { - if replicasetutil.IsStillReferenced(c.rollout.Status, targetRS) { - // We should technically never get here because we shouldn't be passing a replicaset list - // which includes referenced ReplicaSets. But we check just in case - c.log.Warnf("Prevented inadvertent scaleDown of RS '%s'", targetRS.Name) + if c.isReplicaSetReferenced(targetRS) { + // We might get here if the user interrupted an update in order to move back to stable. + c.log.Infof("Skip scale down of older RS '%s': still referenced", targetRS.Name) continue } if *targetRS.Spec.Replicas == 0 { diff --git a/rollout/canary.go b/rollout/canary.go index dff4b52d50..b443db507e 100644 --- a/rollout/canary.go +++ b/rollout/canary.go @@ -180,10 +180,9 @@ func (c *rolloutContext) scaleDownOldReplicaSetsForCanary(oldRSs []*appsv1.Repli annotationedRSs := int32(0) for _, targetRS := range oldRSs { - if replicasetutil.IsStillReferenced(c.rollout.Status, targetRS) { - // We should technically never get here because we shouldn't be passing a replicaset list - // which includes referenced ReplicaSets. But we check just in case - c.log.Warnf("Prevented inadvertent scaleDown of RS '%s'", targetRS.Name) + if c.isReplicaSetReferenced(targetRS) { + // We might get here if the user interrupted an update in order to move back to stable. + c.log.Infof("Skip scale down of older RS '%s': still referenced", targetRS.Name) continue } if maxScaleDown <= 0 { @@ -220,15 +219,8 @@ func (c *rolloutContext) scaleDownOldReplicaSetsForCanary(oldRSs []*appsv1.Repli // and doesn't yet have scale down deadline. This happens when a user changes their // mind in the middle of an V1 -> V2 update, and then applies a V3. We are deciding // what to do with the defunct, intermediate V2 ReplicaSet right now. - if !c.replicaSetReferencedByCanaryTraffic(targetRS) { - // It is safe to scale the intermediate RS down, if no traffic is directed to it. - c.log.Infof("scaling down intermediate RS '%s'", targetRS.Name) - } else { - c.log.Infof("Skip scaling down intermediate RS '%s': still referenced by service", targetRS.Name) - // This ReplicaSet is still referenced by the service. It is not safe to scale - // this down. - continue - } + // It is safe to scale the intermediate RS down, since no traffic is directed to it. + c.log.Infof("scaling down intermediate RS '%s'", targetRS.Name) } } if *targetRS.Spec.Replicas == desiredReplicaCount { @@ -248,19 +240,26 @@ func (c *rolloutContext) scaleDownOldReplicaSetsForCanary(oldRSs []*appsv1.Repli return totalScaledDown, nil } -func (c *rolloutContext) replicaSetReferencedByCanaryTraffic(rs *appsv1.ReplicaSet) bool { - rsPodHash := replicasetutil.GetPodTemplateHash(rs) - ro := c.rollout - - if ro.Status.Canary.Weights == nil { - return false - } - - if ro.Status.Canary.Weights.Canary.PodTemplateHash == rsPodHash || ro.Status.Canary.Weights.Stable.PodTemplateHash == rsPodHash { - return true +// isDynamicallyRollingBackToStable returns true if we were in the middle of a canary update with +// dynamic stable scaling, but were interrupted and are now rolling back to the stable RS. This is similar +// to, but different from, aborting.
With abort, desired hash != stable hash and so we know the +// two hashes to balance traffic against. But with dynamically rolling back to stable, the +// desired hash == stable hash, and so we must use the *previous* desired hash and balance traffic +// between previous desired vs. stable hash, in order to safely shift traffic back to stable. +// This function also returns the previous desired hash (where we are weighted to) +func isDynamicallyRollingBackToStable(ro *v1alpha1.Rollout, desiredRS *appsv1.ReplicaSet) (bool, string) { + if rolloututil.IsFullyPromoted(ro) && ro.Spec.Strategy.Canary.TrafficRouting != nil && ro.Spec.Strategy.Canary.DynamicStableScale { + if ro.Status.Canary.Weights != nil { + currSelector := ro.Status.Canary.Weights.Canary.PodTemplateHash + desiredSelector := replicasetutil.GetPodTemplateHash(desiredRS) + if currSelector != desiredSelector { + if desiredRS.Status.AvailableReplicas < *ro.Spec.Replicas { + return true, currSelector + } + } + } } - - return false + return false, "" } // canProceedWithScaleDownAnnotation returns whether or not it is safe to proceed with annotating diff --git a/rollout/canary_test.go b/rollout/canary_test.go index 4adca3fcb9..f136a01e42 100644 --- a/rollout/canary_test.go +++ b/rollout/canary_test.go @@ -8,6 +8,7 @@ import ( "time" "github.com/stretchr/testify/assert" + appsv1 "k8s.io/api/apps/v1" v1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -1890,3 +1891,120 @@ func TestHandleCanaryAbort(t *testing.T) { assert.JSONEq(t, calculatePatch(r1, fmt.Sprintf(expectedPatch, newConditions)), patch) }) } + +func TestIsDynamicallyRollingBackToStable(t *testing.T) { + newRSWithHashAndReplicas := func(hash string, available int32) *appsv1.ReplicaSet { + return &appsv1.ReplicaSet{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + v1alpha1.DefaultRolloutUniqueLabelKey: hash, + }, + }, + Status: v1.ReplicaSetStatus{ + AvailableReplicas: available, + }, + } + } + + testCases := []struct { + name string + status v1alpha1.RolloutStatus + trafficRoutingDisabled bool + dynamicStableScalingDisabled bool + rsHash string + rsAvailableReplicas *int32 // if nil, will set to rollout replicas + trafficWeights *v1alpha1.TrafficWeights + expectedResult bool + }{ + { + name: "desired RS != stable RS", + status: v1alpha1.RolloutStatus{CurrentPodHash: "abc123", StableRS: "def456"}, + rsHash: "", + expectedResult: false, + }, + { + name: "not using traffic routing", + trafficRoutingDisabled: true, + status: v1alpha1.RolloutStatus{CurrentPodHash: "abc123", StableRS: "abc123"}, + rsHash: "", + expectedResult: false, + }, + { + name: "not using dynamicStableScaling", + dynamicStableScalingDisabled: true, + status: v1alpha1.RolloutStatus{CurrentPodHash: "abc123", StableRS: "abc123"}, + rsHash: "", + expectedResult: false, + }, + { + name: "weighted selector == desired RS", + status: v1alpha1.RolloutStatus{ + CurrentPodHash: "abc123", + StableRS: "abc123", + Canary: v1alpha1.CanaryStatus{ + Weights: &v1alpha1.TrafficWeights{ + Canary: v1alpha1.WeightDestination{ + PodTemplateHash: "abc123", + }, + }, + }, + }, + rsHash: "abc123", + expectedResult: false, + }, + { + name: "weighted selector != desired RS, desired not fully available", + status: v1alpha1.RolloutStatus{ + CurrentPodHash: "abc123", + StableRS: "abc123", + Canary: v1alpha1.CanaryStatus{ + Weights: &v1alpha1.TrafficWeights{ + Canary: v1alpha1.WeightDestination{ + PodTemplateHash: "def456", + }, + }, + }, + }, + rsHash: "abc123", + 
rsAvailableReplicas: pointer.Int32(1), + expectedResult: true, + }, + { + name: "weighted selector != desired RS, desired RS is fully available", + status: v1alpha1.RolloutStatus{ + CurrentPodHash: "abc123", + StableRS: "abc123", + Canary: v1alpha1.CanaryStatus{ + Weights: &v1alpha1.TrafficWeights{ + Canary: v1alpha1.WeightDestination{ + PodTemplateHash: "def456", + }, + }, + }, + }, + rsHash: "abc123", + expectedResult: false, + }, + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + ro := newCanaryRollout("test", 10, nil, nil, nil, intstr.FromInt(0), intstr.FromInt(1)) + if !tc.trafficRoutingDisabled { + ro.Spec.Strategy.Canary.TrafficRouting = &v1alpha1.RolloutTrafficRouting{} + } + if !tc.dynamicStableScalingDisabled { + ro.Spec.Strategy.Canary.DynamicStableScale = true + } + ro.Status = tc.status + + desiredRS := newRSWithHashAndReplicas(tc.rsHash, *ro.Spec.Replicas) + if tc.rsAvailableReplicas != nil { + desiredRS.Status.AvailableReplicas = *tc.rsAvailableReplicas + } + + rbToStable, _ := isDynamicallyRollingBackToStable(ro, desiredRS) + + assert.Equal(t, tc.expectedResult, rbToStable) + }) + } +} diff --git a/rollout/replicaset.go b/rollout/replicaset.go index fad23e756e..7d9a71f62a 100644 --- a/rollout/replicaset.go +++ b/rollout/replicaset.go @@ -7,6 +7,7 @@ import ( "time" appsv1 "k8s.io/api/apps/v1" + k8serrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" patchtypes "k8s.io/apimachinery/pkg/types" @@ -15,6 +16,7 @@ import ( "github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1" "github.com/argoproj/argo-rollouts/utils/defaults" replicasetutil "github.com/argoproj/argo-rollouts/utils/replicaset" + serviceutil "github.com/argoproj/argo-rollouts/utils/service" timeutil "github.com/argoproj/argo-rollouts/utils/time" ) @@ -296,3 +298,56 @@ func (c *rolloutContext) scaleDownDelayHelper(rs *appsv1.ReplicaSet, annotatione return annotationedRSs, desiredReplicaCount, nil } + +// isReplicaSetReferenced returns whether the given ReplicaSet is still being referenced by any of +// the current, stable, or blue-green services. Used to determine whether the ReplicaSet can +// safely be scaled to zero, or deleted. +func (c *rolloutContext) isReplicaSetReferenced(rs *appsv1.ReplicaSet) bool { + rsPodHash := replicasetutil.GetPodTemplateHash(rs) + if rsPodHash == "" { + return false + } + ro := c.rollout + referencesToCheck := []string{ + ro.Status.StableRS, + ro.Status.CurrentPodHash, + ro.Status.BlueGreen.ActiveSelector, + ro.Status.BlueGreen.PreviewSelector, + } + if ro.Status.Canary.Weights != nil { + referencesToCheck = append(referencesToCheck, ro.Status.Canary.Weights.Canary.PodTemplateHash, ro.Status.Canary.Weights.Stable.PodTemplateHash) + } + for _, ref := range referencesToCheck { + if ref == rsPodHash { + return true + } + } + + // The above are static, lightweight checks to see if the selectors we record in our status are + // still referencing the ReplicaSet in question. Those checks aren't always enough. Next, we do + // a deeper check to look up the actual service objects, and see if they are still referencing + // the ReplicaSet. If so, we cannot scale it down.
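+ // Which services to inspect depends on the strategy: canary rollouts reference the canary/stable services, while blue-green rollouts reference the active/preview services.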
+ var servicesToCheck []string + if ro.Spec.Strategy.Canary != nil { + servicesToCheck = []string{ro.Spec.Strategy.Canary.CanaryService, ro.Spec.Strategy.Canary.StableService} + } else { + servicesToCheck = []string{ro.Spec.Strategy.BlueGreen.ActiveService, ro.Spec.Strategy.BlueGreen.PreviewService} + } + for _, svcName := range servicesToCheck { + if svcName == "" { + continue + } + svc, err := c.servicesLister.Services(c.rollout.Namespace).Get(svcName) + if err != nil { + if k8serrors.IsNotFound(err) { + // service doesn't exist + continue + } + return true + } + if serviceutil.GetRolloutSelectorLabel(svc) == rsPodHash { + return true + } + } + return false +} diff --git a/rollout/replicaset_test.go b/rollout/replicaset_test.go index 10c1dc0893..3d87cf3132 100644 --- a/rollout/replicaset_test.go +++ b/rollout/replicaset_test.go @@ -8,6 +8,9 @@ import ( "github.com/stretchr/testify/assert" appsv1 "k8s.io/api/apps/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/util/intstr" + k8sinformers "k8s.io/client-go/informers" k8sfake "k8s.io/client-go/kubernetes/fake" core "k8s.io/client-go/testing" "k8s.io/utils/pointer" @@ -353,3 +356,182 @@ func TestReconcileOldReplicaSet(t *testing.T) { }) } } + +func TestIsReplicaSetReferenced(t *testing.T) { + newRSWithPodTemplateHash := func(hash string) *appsv1.ReplicaSet { + return &appsv1.ReplicaSet{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + v1alpha1.DefaultRolloutUniqueLabelKey: hash, + }, + }, + } + } + + testCases := []struct { + name string + status v1alpha1.RolloutStatus + canaryService string + stableService string + activeService string + previewService string + services []runtime.Object + rsHash string + expectedResult bool + }{ + { + name: "empty hash", + status: v1alpha1.RolloutStatus{StableRS: "abc123"}, + rsHash: "", + expectedResult: false, + }, + { + name: "not referenced", + status: v1alpha1.RolloutStatus{StableRS: "abc123"}, + rsHash: "def456", + expectedResult: false, + }, + { + name: "stable rs referenced", + status: v1alpha1.RolloutStatus{StableRS: "abc123"}, + rsHash: "abc123", + expectedResult: true, + }, + { + name: "current rs referenced", + status: v1alpha1.RolloutStatus{CurrentPodHash: "abc123"}, + rsHash: "abc123", + expectedResult: true, + }, + { + name: "active referenced", + status: v1alpha1.RolloutStatus{BlueGreen: v1alpha1.BlueGreenStatus{ActiveSelector: "abc123"}}, + rsHash: "abc123", + expectedResult: true, + }, + { + name: "preview referenced", + status: v1alpha1.RolloutStatus{BlueGreen: v1alpha1.BlueGreenStatus{PreviewSelector: "abc123"}}, + rsHash: "abc123", + expectedResult: true, + }, + { + name: "traffic routed canary current pod hash", + status: v1alpha1.RolloutStatus{Canary: v1alpha1.CanaryStatus{Weights: &v1alpha1.TrafficWeights{ + Canary: v1alpha1.WeightDestination{ + PodTemplateHash: "abc123", + }, + }}}, + rsHash: "abc123", + expectedResult: true, + }, + { + name: "traffic routed canary stable pod hash", + status: v1alpha1.RolloutStatus{Canary: v1alpha1.CanaryStatus{Weights: &v1alpha1.TrafficWeights{ + Stable: v1alpha1.WeightDestination{ + PodTemplateHash: "abc123", + }, + }}}, + rsHash: "abc123", + expectedResult: true, + }, + { + name: "canary service still referenced", + status: v1alpha1.RolloutStatus{ + CurrentPodHash: "abc123", + StableRS: "abc123", + }, + canaryService: "mysvc", + services: []runtime.Object{newService("mysvc", 80, map[string]string{v1alpha1.DefaultRolloutUniqueLabelKey: "def456"}, nil)}, + rsHash:
"def456", + expectedResult: true, + }, + { + name: "stable service still referenced", + status: v1alpha1.RolloutStatus{ + CurrentPodHash: "abc123", + StableRS: "abc123", + }, + stableService: "mysvc", + services: []runtime.Object{newService("mysvc", 80, map[string]string{v1alpha1.DefaultRolloutUniqueLabelKey: "def456"}, nil)}, + rsHash: "def456", + expectedResult: true, + }, + { + name: "active service still referenced", + status: v1alpha1.RolloutStatus{ + CurrentPodHash: "abc123", + StableRS: "abc123", + }, + activeService: "mysvc", + services: []runtime.Object{newService("mysvc", 80, map[string]string{v1alpha1.DefaultRolloutUniqueLabelKey: "def456"}, nil)}, + rsHash: "def456", + expectedResult: true, + }, + { + name: "preview service still referenced", + status: v1alpha1.RolloutStatus{ + CurrentPodHash: "abc123", + StableRS: "abc123", + }, + activeService: "mysvc", + previewService: "mysvc2", + services: []runtime.Object{newService("mysvc2", 80, map[string]string{v1alpha1.DefaultRolloutUniqueLabelKey: "def456"}, nil)}, + rsHash: "def456", + expectedResult: true, + }, + { + name: "service not found", + status: v1alpha1.RolloutStatus{ + CurrentPodHash: "abc123", + StableRS: "abc123", + }, + activeService: "mysvc", + previewService: "mysvc2", + rsHash: "def456", + expectedResult: false, + }, + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + + fake := fake.Clientset{} + k8sfake := k8sfake.NewSimpleClientset(tc.services...) + informers := k8sinformers.NewSharedInformerFactory(k8sfake, 0) + servicesLister := informers.Core().V1().Services().Lister() + stopchan := make(chan struct{}) + defer close(stopchan) + informers.Start(stopchan) + informers.WaitForCacheSync(stopchan) + + var r *v1alpha1.Rollout + if tc.activeService != "" { + r = newBlueGreenRollout("test", 1, nil, tc.activeService, tc.previewService) + } else { + r = newCanaryRollout("test", 1, nil, nil, nil, intstr.FromInt(0), intstr.FromInt(1)) + r.Spec.Strategy.Canary.CanaryService = tc.canaryService + r.Spec.Strategy.Canary.StableService = tc.stableService + } + r.Status = tc.status + + roCtx := &rolloutContext{ + rollout: r, + log: logutil.WithRollout(r), + reconcilerBase: reconcilerBase{ + servicesLister: servicesLister, + argoprojclientset: &fake, + kubeclientset: k8sfake, + recorder: record.NewFakeEventRecorder(), + }, + } + rs := newRSWithPodTemplateHash(tc.rsHash) + stillReferenced := roCtx.isReplicaSetReferenced(rs) + + assert.Equal( + t, + tc.expectedResult, + stillReferenced, + ) + }) + } +} diff --git a/rollout/service.go b/rollout/service.go index f808cb55fc..69739b9315 100644 --- a/rollout/service.go +++ b/rollout/service.go @@ -147,7 +147,7 @@ func (c *rolloutContext) awsVerifyTargetGroups(svc *corev1.Service) error { return nil } - c.targetsVerified = pointer.BoolPtr(false) + c.targetsVerified = pointer.Bool(false) // get endpoints of service endpoints, err := c.kubeclientset.CoreV1().Endpoints(svc.Namespace).Get(ctx, svc.Name, metav1.GetOptions{}) @@ -177,7 +177,7 @@ func (c *rolloutContext) awsVerifyTargetGroups(svc *corev1.Service) error { } c.recorder.Eventf(c.rollout, record.EventOptions{EventReason: conditions.TargetGroupVerifiedReason}, conditions.TargetGroupVerifiedRegistrationMessage, svc.Name, tgb.Spec.TargetGroupARN, verifyRes.EndpointsRegistered) } - c.targetsVerified = pointer.BoolPtr(true) + c.targetsVerified = pointer.Bool(true) return nil } @@ -266,6 +266,17 @@ func (c *rolloutContext) reconcileStableAndCanaryService() error { return nil } + if dynamicallyRollingBackToStable, 
currSelector := isDynamicallyRollingBackToStable(c.rollout, c.newRS); dynamicallyRollingBackToStable { + // User may have interrupted an update in order to go back to stableRS, and is using dynamic + // stable scaling. If that is the case, the stableRS might be undersized and if we blindly + // switch the service selector we could overwhelm stableRS pods. + // If we get here, we detected that the canary service needs to be pointed back to + // stable, but stable is not fully available. Skip the service switch for now. + c.log.Infof("delaying fully promoted service switch of '%s' from %s to %s: ReplicaSet '%s' not fully available", + c.rollout.Spec.Strategy.Canary.CanaryService, currSelector, replicasetutil.GetPodTemplateHash(c.newRS), c.newRS.Name) + return nil + } + err = c.ensureSVCTargets(c.rollout.Spec.Strategy.Canary.CanaryService, c.newRS, true) if err != nil { return err diff --git a/rollout/service_test.go b/rollout/service_test.go index e29ee53b4a..cb15367a3a 100644 --- a/rollout/service_test.go +++ b/rollout/service_test.go @@ -437,26 +437,26 @@ func TestCanaryAWSVerifyTargetGroupsNotYetReady(t *testing.T) { TargetHealthDescriptions: []elbv2types.TargetHealthDescription{ { Target: &elbv2types.TargetDescription{ - Id: pointer.StringPtr("1.2.3.4"), - Port: pointer.Int32Ptr(80), + Id: pointer.String("1.2.3.4"), + Port: pointer.Int32(80), }, }, { Target: &elbv2types.TargetDescription{ - Id: pointer.StringPtr("5.6.7.8"), - Port: pointer.Int32Ptr(80), + Id: pointer.String("5.6.7.8"), + Port: pointer.Int32(80), }, }, { Target: &elbv2types.TargetDescription{ - Id: pointer.StringPtr("2.4.6.8"), // irrelevant - Port: pointer.Int32Ptr(81), // wrong port + Id: pointer.String("2.4.6.8"), // irrelevant + Port: pointer.Int32(81), // wrong port }, }, { Target: &elbv2types.TargetDescription{ - Id: pointer.StringPtr("9.8.7.6"), // irrelevant ip - Port: pointer.Int32Ptr(80), + Id: pointer.String("9.8.7.6"), // irrelevant ip + Port: pointer.Int32(80), }, }, }, @@ -464,8 +464,8 @@ fakeELB.On("DescribeTargetHealth", mock.Anything, mock.Anything).Return(&thOut, nil) r1 := newCanaryRollout("foo", 3, nil, []v1alpha1.CanaryStep{{ - SetWeight: pointer.Int32Ptr(10), - }}, pointer.Int32Ptr(0), intstr.FromString("25%"), intstr.FromString("25%")) + SetWeight: pointer.Int32(10), + }}, pointer.Int32(0), intstr.FromString("25%"), intstr.FromString("25%")) r1.Spec.Strategy.Canary.TrafficRouting = &v1alpha1.RolloutTrafficRouting{ ALB: &v1alpha1.ALBTrafficRouting{ @@ -491,6 +491,7 @@ func TestCanaryAWSVerifyTargetGroupsNotYetReady(t *testing.T) { r2.Status.Message = "" r2.Status.ObservedGeneration = strconv.Itoa(int(r2.Generation)) r2.Status.StableRS = rs2PodHash + r2.Status.CurrentStepIndex = pointer.Int32(1) availableCondition, _ := newAvailableCondition(true) conditions.SetRolloutCondition(&r2.Status, availableCondition) healthyCondition, _ := newHealthyCondition(false) @@ -536,26 +537,26 @@ func TestCanaryAWSVerifyTargetGroupsReady(t *testing.T) { TargetHealthDescriptions: []elbv2types.TargetHealthDescription{ { Target: &elbv2types.TargetDescription{ - Id: pointer.StringPtr("1.2.3.4"), - Port: pointer.Int32Ptr(80), + Id: pointer.String("1.2.3.4"), + Port: pointer.Int32(80), }, }, { Target: &elbv2types.TargetDescription{ - Id: pointer.StringPtr("5.6.7.8"), - Port: pointer.Int32Ptr(80), + Id: pointer.String("5.6.7.8"), + Port: pointer.Int32(80), }, }, { Target: &elbv2types.TargetDescription{ - Id: pointer.StringPtr("2.4.6.8"), // irrelevant - Port:
pointer.Int32Ptr(80), // wrong port + Id: pointer.String("2.4.6.8"), // irrelevant + Port: pointer.Int32(80), // wrong port }, }, { Target: &elbv2types.TargetDescription{ - Id: pointer.StringPtr("9.8.7.6"), // irrelevant ip - Port: pointer.Int32Ptr(80), + Id: pointer.String("9.8.7.6"), // irrelevant ip + Port: pointer.Int32(80), }, }, }, @@ -563,8 +564,8 @@ func TestCanaryAWSVerifyTargetGroupsReady(t *testing.T) { fakeELB.On("DescribeTargetHealth", mock.Anything, mock.Anything).Return(&thOut, nil) r1 := newCanaryRollout("foo", 3, nil, []v1alpha1.CanaryStep{{ - SetWeight: pointer.Int32Ptr(10), - }}, pointer.Int32Ptr(0), intstr.FromString("25%"), intstr.FromString("25%")) + SetWeight: pointer.Int32(10), + }}, pointer.Int32(0), intstr.FromString("25%"), intstr.FromString("25%")) r1.Spec.Strategy.Canary.TrafficRouting = &v1alpha1.RolloutTrafficRouting{ ALB: &v1alpha1.ALBTrafficRouting{ Ingress: "ingress", @@ -589,6 +590,7 @@ func TestCanaryAWSVerifyTargetGroupsReady(t *testing.T) { r2.Status.Message = "" r2.Status.ObservedGeneration = strconv.Itoa(int(r2.Generation)) r2.Status.StableRS = rs2PodHash + r2.Status.CurrentStepIndex = pointer.Int32(1) availableCondition, _ := newAvailableCondition(true) conditions.SetRolloutCondition(&r2.Status, availableCondition) healthyCondition, _ := newHealthyCondition(false) @@ -624,8 +626,8 @@ func TestCanaryAWSVerifyTargetGroupsSkip(t *testing.T) { defer f.Close() r1 := newCanaryRollout("foo", 3, nil, []v1alpha1.CanaryStep{{ - SetWeight: pointer.Int32Ptr(10), - }}, pointer.Int32Ptr(0), intstr.FromString("25%"), intstr.FromString("25%")) + SetWeight: pointer.Int32(10), + }}, pointer.Int32(0), intstr.FromString("25%"), intstr.FromString("25%")) r1.Spec.Strategy.Canary.TrafficRouting = &v1alpha1.RolloutTrafficRouting{ ALB: &v1alpha1.ALBTrafficRouting{ Ingress: "ingress", @@ -652,6 +654,7 @@ func TestCanaryAWSVerifyTargetGroupsSkip(t *testing.T) { r2.Status.Message = "" r2.Status.ObservedGeneration = strconv.Itoa(int(r2.Generation)) r2.Status.StableRS = rs2PodHash + r2.Status.CurrentStepIndex = pointer.Int32(1) availableCondition, _ := newAvailableCondition(true) conditions.SetRolloutCondition(&r2.Status, availableCondition) healthyCondition, _ := newHealthyCondition(false) diff --git a/rollout/trafficrouting.go b/rollout/trafficrouting.go index c7b3bf7055..a87e31a9e8 100644 --- a/rollout/trafficrouting.go +++ b/rollout/trafficrouting.go @@ -163,25 +163,20 @@ func (c *rolloutContext) reconcileTrafficRouting() error { canaryHash = c.newRS.Labels[v1alpha1.DefaultRolloutUniqueLabelKey] } - if rolloututil.IsFullyPromoted(c.rollout) { - // when we are fully promoted. desired canary weight should be 0 + if dynamicallyRollingBackToStable, prevDesiredHash := isDynamicallyRollingBackToStable(c.rollout, c.newRS); dynamicallyRollingBackToStable { + desiredWeight = c.calculateDesiredWeightOnAbortOrStableRollback() + // Since stableRS == desiredRS, we must balance traffic between the + // *previous desired* vs. stable (as opposed to current desired vs. stable). + // The previous desired is remembered in Status.Canary.Weights.Canary.PodTemplateHash. + // See: https://github.com/argoproj/argo-rollouts/issues/3020 + canaryHash = prevDesiredHash + } else if rolloututil.IsFullyPromoted(c.rollout) { err := reconciler.RemoveManagedRoutes() if err != nil { return err } } else if c.pauseContext.IsAborted() { - // when aborted, desired canary weight should immediately be 0 (100% to stable), *unless* - // we are using dynamic stable scaling. 
In that case, we are dynamically decreasing the - // weight to the canary according to the availability of the stable (whatever it can support). - if c.rollout.Spec.Strategy.Canary.DynamicStableScale { - desiredWeight = 100 - ((100 * c.stableRS.Status.AvailableReplicas) / *c.rollout.Spec.Replicas) - if c.rollout.Status.Canary.Weights != nil { - // This ensures that if we are already at a lower weight, then we will not - // increase the weight because stable availability is flapping (e.g. pod restarts) - desiredWeight = minInt(desiredWeight, c.rollout.Status.Canary.Weights.Canary.Weight) - } - } - + desiredWeight = c.calculateDesiredWeightOnAbortOrStableRollback() if (c.rollout.Spec.Strategy.Canary.DynamicStableScale && desiredWeight == 0) || !c.rollout.Spec.Strategy.Canary.DynamicStableScale { // If we are using dynamic stable scale we need to also make sure that desiredWeight=0 aka we are completely // done with aborting before resetting the canary service selectors back to stable @@ -295,6 +290,26 @@ func (c *rolloutContext) reconcileTrafficRouting() error { return nil } +// calculateDesiredWeightOnAbortOrStableRollback returns the desired weight to use when we are either +// aborting, or rolling back to stable RS. +func (c *rolloutContext) calculateDesiredWeightOnAbortOrStableRollback() int32 { + if !c.rollout.Spec.Strategy.Canary.DynamicStableScale { + // When aborting or rolling back to stable RS and dynamicStableScaling is disabled, + // then desired canary weight should immediately be 0 (100% to stable) since we can trust + // that it is fully scaled up + return 0 + } + // When using dynamic stable scaling, we must dynamically decrease the weight to the canary + // according to the availability of the stable (whatever it can support). + desiredWeight := 100 - ((100 * c.stableRS.Status.AvailableReplicas) / *c.rollout.Spec.Replicas) + if c.rollout.Status.Canary.Weights != nil { + // This ensures that if we are already at a lower weight, then we will not + // increase the weight because stable availability is flapping (e.g.
pod restarts) + desiredWeight = minInt(desiredWeight, c.rollout.Status.Canary.Weights.Canary.Weight) + } + return desiredWeight +} + // trafficWeightUpdatedMessage returns a message we emit for the kubernetes event whenever we adjust traffic weights func trafficWeightUpdatedMessage(prev, new *v1alpha1.TrafficWeights) string { var details []string diff --git a/rollout/trafficrouting_test.go b/rollout/trafficrouting_test.go index 78817492f6..a545d9fae6 100644 --- a/rollout/trafficrouting_test.go +++ b/rollout/trafficrouting_test.go @@ -2,6 +2,7 @@ package rollout import ( "errors" + "fmt" "strconv" "testing" "time" @@ -752,8 +753,8 @@ func TestCanaryWithTrafficRoutingAddScaleDownDelay(t *testing.T) { defer f.Close() r1 := newCanaryRollout("foo", 1, nil, []v1alpha1.CanaryStep{{ - SetWeight: pointer.Int32Ptr(10), - }}, pointer.Int32Ptr(0), intstr.FromInt(1), intstr.FromInt(1)) + SetWeight: pointer.Int32(10), + }}, pointer.Int32(0), intstr.FromInt(1), intstr.FromInt(1)) r1.Spec.Strategy.Canary.CanaryService = "canary" r1.Spec.Strategy.Canary.StableService = "stable" r1.Spec.Strategy.Canary.TrafficRouting = &v1alpha1.RolloutTrafficRouting{ @@ -765,6 +766,7 @@ func TestCanaryWithTrafficRoutingAddScaleDownDelay(t *testing.T) { rs2PodHash := rs2.Labels[v1alpha1.DefaultRolloutUniqueLabelKey] r2 = updateCanaryRolloutStatus(r2, rs2PodHash, 2, 1, 2, false) r2.Status.ObservedGeneration = strconv.Itoa(int(r2.Generation)) + r2.Status.CurrentStepIndex = pointer.Int32(1) availableCondition, _ := newAvailableCondition(true) conditions.SetRolloutCondition(&r2.Status, availableCondition) completedCondition, _ := newCompletedCondition(true) @@ -1153,3 +1155,94 @@ func TestRolloutReplicaIsAvailableAndGenerationNotBeModifiedShouldModifyVirtualS }).Once().Return(nil) f.run(getKey(r1, t)) } + +// This makes sure we don't set weight to zero if we are rolling back to stable with DynamicStableScale +func TestDontWeightToZeroWhenDynamicallyRollingBackToStable(t *testing.T) { + f := newFixture(t) + defer f.Close() + + steps := []v1alpha1.CanaryStep{ + { + SetWeight: pointer.Int32(90), + }, + { + Pause: &v1alpha1.RolloutPause{}, + }, + } + r1 := newCanaryRollout("foo", 10, nil, steps, pointer.Int32(1), intstr.FromInt(1), intstr.FromInt(1)) + r1.Spec.Strategy.Canary.DynamicStableScale = true + r1.Spec.Strategy.Canary.TrafficRouting = &v1alpha1.RolloutTrafficRouting{ + SMI: &v1alpha1.SMITrafficRouting{}, + } + r1.Spec.Strategy.Canary.CanaryService = "canary" + r1.Spec.Strategy.Canary.StableService = "stable" + r1.Status.ReadyReplicas = 10 + r1.Status.AvailableReplicas = 10 + r2 := bumpVersion(r1) + + rs1 := newReplicaSetWithStatus(r1, 1, 1) + rs2 := newReplicaSetWithStatus(r2, 9, 9) + + rs1PodHash := rs1.Labels[v1alpha1.DefaultRolloutUniqueLabelKey] + rs2PodHash := rs2.Labels[v1alpha1.DefaultRolloutUniqueLabelKey] + canarySelector := map[string]string{v1alpha1.DefaultRolloutUniqueLabelKey: rs2PodHash} + stableSelector := map[string]string{v1alpha1.DefaultRolloutUniqueLabelKey: rs1PodHash} + canarySvc := newService("canary", 80, canarySelector, r1) + stableSvc := newService("stable", 80, stableSelector, r1) + + // simulate rollback to stable + r2.Spec = r1.Spec + r2.Status.StableRS = rs1PodHash + r2.Status.CurrentPodHash = rs1PodHash // will cause IsFullyPromoted() to be true + r2.Status.Canary.Weights = &v1alpha1.TrafficWeights{ + Canary: v1alpha1.WeightDestination{ + Weight: 10, + ServiceName: "canary", + PodTemplateHash: rs2PodHash, + }, + Stable: v1alpha1.WeightDestination{ + Weight: 90, + ServiceName: "stable", + 
PodTemplateHash: rs1PodHash, + }, + } + + f.kubeobjects = append(f.kubeobjects, rs1, rs2, canarySvc, stableSvc) + f.replicaSetLister = append(f.replicaSetLister, rs1, rs2) + + f.rolloutLister = append(f.rolloutLister, r2) + f.objects = append(f.objects, r2) + + f.expectUpdateReplicaSetAction(rs1) // Updates the revision annotation from 1 to 3 + f.expectUpdateReplicaSetAction(rs1) // repeat of the above (not sure why) + scaleUpIndex := f.expectUpdateReplicaSetAction(rs1) // this one scales the stable RS to 10 + f.expectPatchRolloutAction(r2) + + f.fakeTrafficRouting = newUnmockedFakeTrafficRoutingReconciler() + f.fakeTrafficRouting.On("UpdateHash", mock.Anything, mock.Anything, mock.Anything).Return(func(canaryHash, stableHash string, additionalDestinations ...v1alpha1.WeightDestination) error { + // make sure UpdateHash was called with previous desired hash (not current pod hash) + if canaryHash != rs2PodHash { + return fmt.Errorf("UpdateHash was called with canary hash: %s. Expected: %s", canaryHash, rs2PodHash) + } + if stableHash != rs1PodHash { + return fmt.Errorf("UpdateHash was called with stable hash: %s. Expected: %s", stableHash, rs1PodHash) + } + return nil + }) + f.fakeTrafficRouting.On("SetWeight", mock.Anything, mock.Anything).Return(func(desiredWeight int32, additionalDestinations ...v1alpha1.WeightDestination) error { + // make sure the weight was not changed + if desiredWeight != 10 { + return fmt.Errorf("SetWeight was called with unexpected weight: %d. Expected: 10", desiredWeight) + } + return nil + }) + f.fakeTrafficRouting.On("SetHeaderRoute", mock.Anything, mock.Anything).Return(nil) + f.fakeTrafficRouting.On("RemoveManagedRoutes", mock.Anything, mock.Anything).Return(nil) + f.fakeTrafficRouting.On("VerifyWeight", mock.Anything).Return(pointer.Bool(true), nil) + f.run(getKey(r1, t)) + + // Make sure we scale up stable ReplicaSet to 10 + rs1Updated := f.getUpdatedReplicaSet(scaleUpIndex) + assert.Equal(t, int32(10), *rs1Updated.Spec.Replicas) +} diff --git a/test/e2e/canary_test.go b/test/e2e/canary_test.go index fe22175074..bc5e60b6c3 100644 --- a/test/e2e/canary_test.go +++ b/test/e2e/canary_test.go @@ -13,6 +13,7 @@ import ( corev1 "k8s.io/api/core/v1" "github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1" + rov1 "github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1" "github.com/argoproj/argo-rollouts/test/fixtures" ) @@ -620,3 +621,62 @@ func (s *CanarySuite) TestCanaryDynamicStableScale() { ExpectServiceSelector("dynamic-stable-scale-canary", map[string]string{"app": "dynamic-stable-scale", "rollouts-pod-template-hash": "868d98995b"}, false). ExpectRevisionPodCount("1", 4) } + +// TestCanaryDynamicStableScaleRollbackToStable verifies that when we roll back to stable with +// DynamicStableScale enabled, we do so in a safe manner, without shifting traffic back to stable +// before it can handle it +func (s *CanarySuite) TestCanaryDynamicStableScaleRollbackToStable() { + s.Given(). + RolloutObjects(`@functional/canary-dynamic-stable-scale.yaml`). + When(). + ApplyManifests(). + MarkPodsReady("1", 4). // mark all 4 pods ready + WaitForRolloutStatus("Healthy"). + UpdateSpec(). + MarkPodsReady("2", 1). // mark 1 of 1 canary pods ready + WaitForRolloutStatus("Paused"). + Sleep(2*time.Second). + Then(). + ExpectRevisionPodCount("1", 3). + ExpectRevisionPodCount("2", 1). + When(). + UndoRollout(1). // Roll back to stable (revision 1) + Sleep(2*time.Second). + Then(). + ExpectRevisionPodCount("3", 4).
// Ensure we fully scale up the stable (now revision 3) + ExpectRevisionPodCount("2", 1). // And do not scale down the previous desired (revision 2) + Assert(func(t *fixtures.Then) { + // Make sure canary service is still pointing to the previous desired (revision 2) + rs3 := t.GetReplicaSetByRevision("3") + rs2 := t.GetReplicaSetByRevision("2") + canarySvc, stableSvc := t.GetServices() + assert.Equal(s.T(), rs2.Labels[rov1.DefaultRolloutUniqueLabelKey], canarySvc.Spec.Selector["rollouts-pod-template-hash"]) + assert.Equal(s.T(), rs3.Labels[rov1.DefaultRolloutUniqueLabelKey], stableSvc.Spec.Selector["rollouts-pod-template-hash"]) + + // Ensure we did not touch the weights even though we are "fully promoted" + ro := t.GetRollout() + assert.Equal(s.T(), rs2.Labels[rov1.DefaultRolloutUniqueLabelKey], ro.Status.Canary.Weights.Canary.PodTemplateHash) + assert.Equal(s.T(), int32(25), ro.Status.Canary.Weights.Canary.Weight) + assert.Equal(s.T(), rs3.Labels[rov1.DefaultRolloutUniqueLabelKey], ro.Status.Canary.Weights.Stable.PodTemplateHash) + assert.Equal(s.T(), int32(75), ro.Status.Canary.Weights.Stable.Weight) + }). + When(). + MarkPodsReady("3", 1). // marks the 4th pod of stableRS/newRS (revision 3) ready + WaitForRevisionPodCount("2", 0). // make sure we scale down the previous desired (revision 2) + Then(). + Assert(func(t *fixtures.Then) { + // Make sure canary/stable service is updated to point to revision 3 + rs3 := t.GetReplicaSetByRevision("3") + canarySvc, stableSvc := t.GetServices() + assert.Equal(s.T(), rs3.Labels[rov1.DefaultRolloutUniqueLabelKey], canarySvc.Spec.Selector["rollouts-pod-template-hash"]) + assert.Equal(s.T(), rs3.Labels[rov1.DefaultRolloutUniqueLabelKey], stableSvc.Spec.Selector["rollouts-pod-template-hash"]) + + // Ensure we are 100% back to stable + ro := t.GetRollout() + assert.Equal(s.T(), rs3.Labels[rov1.DefaultRolloutUniqueLabelKey], ro.Status.Canary.Weights.Canary.PodTemplateHash) + assert.Equal(s.T(), int32(0), ro.Status.Canary.Weights.Canary.Weight) + assert.Equal(s.T(), rs3.Labels[rov1.DefaultRolloutUniqueLabelKey], ro.Status.Canary.Weights.Stable.PodTemplateHash) + assert.Equal(s.T(), int32(100), ro.Status.Canary.Weights.Stable.Weight) + + }) +} diff --git a/test/fixtures/common.go b/test/fixtures/common.go index 9e060df865..670d2dd2f4 100644 --- a/test/fixtures/common.go +++ b/test/fixtures/common.go @@ -71,6 +71,7 @@ func (c *Common) CheckError(err error) { } } +// Rollout returns the original rollout manifest used in the test func (c *Common) Rollout() *rov1.Rollout { var ro rov1.Rollout err := runtime.DefaultUnstructuredConverter.FromUnstructured(c.rollout.Object, &ro) @@ -78,6 +79,13 @@ func (c *Common) Rollout() *rov1.Rollout { return &ro } +// GetRollout returns the live rollout object in the cluster +func (c *Common) GetRollout() *rov1.Rollout { + ro, err := c.rolloutClient.ArgoprojV1alpha1().Rollouts(c.namespace).Get(context.TODO(), c.Rollout().GetName(), metav1.GetOptions{}) + c.CheckError(err) + return ro +} + func (c *Common) PrintRollout(name string) { streams := genericclioptions.IOStreams{In: os.Stdin, Out: os.Stdout, ErrOut: os.Stderr} o := options.NewArgoRolloutsOptions(streams) diff --git a/test/fixtures/when.go b/test/fixtures/when.go index 395e614bcf..d9696a7761 100644 --- a/test/fixtures/when.go +++ b/test/fixtures/when.go @@ -25,12 +25,14 @@ import ( "sigs.k8s.io/yaml" "github.com/argoproj/argo-rollouts/pkg/apiclient/rollout" + "github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1" rov1 
"github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1" "github.com/argoproj/argo-rollouts/pkg/kubectl-argo-rollouts/cmd/abort" "github.com/argoproj/argo-rollouts/pkg/kubectl-argo-rollouts/cmd/promote" "github.com/argoproj/argo-rollouts/pkg/kubectl-argo-rollouts/cmd/restart" "github.com/argoproj/argo-rollouts/pkg/kubectl-argo-rollouts/cmd/retry" "github.com/argoproj/argo-rollouts/pkg/kubectl-argo-rollouts/cmd/status" + "github.com/argoproj/argo-rollouts/pkg/kubectl-argo-rollouts/cmd/undo" "github.com/argoproj/argo-rollouts/pkg/kubectl-argo-rollouts/options" "github.com/argoproj/argo-rollouts/pkg/kubectl-argo-rollouts/viewcontroller" rolloututil "github.com/argoproj/argo-rollouts/utils/rollout" @@ -185,6 +187,16 @@ func (w *When) RetryRollout() *When { return w } +func (w *When) UndoRollout(toRevision int64) *When { + if w.rollout == nil { + w.t.Fatal("Rollout not set") + } + _, err := undo.RunUndoRollout(w.dynamicClient.Resource(v1alpha1.RolloutGVR).Namespace(w.namespace), w.kubeClient, w.rollout.GetName(), toRevision) + w.CheckError(err) + w.log.Infof("Undo rollout to %d", toRevision) + return w +} + func (w *When) RestartRollout() *When { if w.rollout == nil { w.t.Fatal("Rollout not set") diff --git a/utils/replicaset/canary.go b/utils/replicaset/canary.go index a751cd2be2..40eadb7848 100755 --- a/utils/replicaset/canary.go +++ b/utils/replicaset/canary.go @@ -41,8 +41,13 @@ func AtDesiredReplicaCountsForCanary(ro *v1alpha1.Rollout, newRS, stableRS *apps return false } } - if GetAvailableReplicaCountForReplicaSets(olderRSs) != int32(0) { - return false + if ro.Spec.Strategy.Canary.TrafficRouting == nil { + // For basic canary, all older ReplicaSets must be scaled to zero since they serve traffic. + // For traffic weighted canary, it's okay if they are still scaled up, since the traffic + // router will prevent them from serving traffic + if GetAvailableReplicaCountForReplicaSets(olderRSs) != int32(0) { + return false + } } return true } diff --git a/utils/replicaset/replicaset.go b/utils/replicaset/replicaset.go index 9aec161b66..b2664afd53 100644 --- a/utils/replicaset/replicaset.go +++ b/utils/replicaset/replicaset.go @@ -592,17 +592,6 @@ func (o ReplicaSetsByRevisionNumber) Less(i, j int) bool { return iRevision < jRevision } -// IsStillReferenced returns if the given ReplicaSet is still being referenced by any of -// the current, stable, blue-green active references. Used to determine if the ReplicaSet can -// safely be scaled to zero, or deleted. 
-func IsStillReferenced(status v1alpha1.RolloutStatus, rs *appsv1.ReplicaSet) bool { - hash := GetPodTemplateHash(rs) - if hash != "" && (hash == status.StableRS || hash == status.CurrentPodHash || hash == status.BlueGreen.ActiveSelector) { - return true - } - return false -} - // HasScaleDownDeadline returns whether or not the given ReplicaSet is annotated with a scale-down delay func HasScaleDownDeadline(rs *appsv1.ReplicaSet) bool { if rs == nil || rs.Annotations == nil { diff --git a/utils/replicaset/replicaset_test.go b/utils/replicaset/replicaset_test.go index 23bf320955..462fa2b835 100644 --- a/utils/replicaset/replicaset_test.go +++ b/utils/replicaset/replicaset_test.go @@ -1078,48 +1078,6 @@ func TestNeedsRestart(t *testing.T) { }) } -func TestIsStillReferenced(t *testing.T) { - newRSWithPodTemplateHash := func(hash string) *appsv1.ReplicaSet { - return &appsv1.ReplicaSet{ - ObjectMeta: metav1.ObjectMeta{ - Labels: map[string]string{ - v1alpha1.DefaultRolloutUniqueLabelKey: hash, - }, - }, - } - } - { - status := v1alpha1.RolloutStatus{StableRS: "abc123"} - rs := &appsv1.ReplicaSet{} - assert.False(t, IsStillReferenced(status, rs)) - } - { - status := v1alpha1.RolloutStatus{StableRS: "abc123"} - rs := newRSWithPodTemplateHash("") - assert.False(t, IsStillReferenced(status, rs)) - } - { - status := v1alpha1.RolloutStatus{StableRS: "abc123"} - rs := newRSWithPodTemplateHash("abc123") - assert.True(t, IsStillReferenced(status, rs)) - } - { - status := v1alpha1.RolloutStatus{CurrentPodHash: "abc123"} - rs := newRSWithPodTemplateHash("abc123") - assert.True(t, IsStillReferenced(status, rs)) - } - { - status := v1alpha1.RolloutStatus{BlueGreen: v1alpha1.BlueGreenStatus{ActiveSelector: "abc123"}} - rs := newRSWithPodTemplateHash("abc123") - assert.True(t, IsStillReferenced(status, rs)) - } - { - status := v1alpha1.RolloutStatus{StableRS: "abc123"} - rs := newRSWithPodTemplateHash("def456") - assert.False(t, IsStillReferenced(status, rs)) - } -} - func TestHasScaleDownDeadline(t *testing.T) { { assert.False(t, HasScaleDownDeadline(nil))
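
A short illustration of the dynamic-weight arithmetic introduced above in calculateDesiredWeightOnAbortOrStableRollback (a minimal, self-contained Go sketch; the helper name desiredWeightOnRollback is hypothetical and not part of the patch):

	package main

	import "fmt"

	// desiredWeightOnRollback mirrors the patch's weight math: keep on the canary
	// whatever portion of traffic the stable ReplicaSet cannot yet absorb, and
	// never increase the canary weight just because stable availability dipped.
	func desiredWeightOnRollback(stableAvailable, specReplicas, currentCanaryWeight int32) int32 {
		w := 100 - ((100 * stableAvailable) / specReplicas)
		if currentCanaryWeight < w {
			w = currentCanaryWeight
		}
		return w
	}

	func main() {
		fmt.Println(desiredWeightOnRollback(4, 10, 90)) // 60: stable can absorb only 40% of traffic
		fmt.Println(desiredWeightOnRollback(9, 10, 5))  // 5: capped by the current canary weight
	}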