diff --git a/rollout/bluegreen.go b/rollout/bluegreen.go
index 887e022e28..f1bcf7a7bb 100644
--- a/rollout/bluegreen.go
+++ b/rollout/bluegreen.go
@@ -220,10 +220,9 @@ func (c *rolloutContext) scaleDownOldReplicaSetsForBlueGreen(oldRSs []*appsv1.Re
 	annotationedRSs := int32(0)
 	rolloutReplicas := defaults.GetReplicasOrDefault(c.rollout.Spec.Replicas)
 	for _, targetRS := range oldRSs {
-		if replicasetutil.IsStillReferenced(c.rollout.Status, targetRS) {
-			// We should technically never get here because we shouldn't be passing a replicaset list
-			// which includes referenced ReplicaSets. But we check just in case
-			c.log.Warnf("Prevented inadvertent scaleDown of RS '%s'", targetRS.Name)
+		if c.isReplicaSetReferenced(targetRS) {
+			// We might get here if the user interrupted an update in order to move back to stable.
+			c.log.Infof("Skip scale down of older RS '%s': still referenced", targetRS.Name)
 			continue
 		}
 		if *targetRS.Spec.Replicas == 0 {
diff --git a/rollout/canary.go b/rollout/canary.go
index dff4b52d50..cd7fcaa1c5 100644
--- a/rollout/canary.go
+++ b/rollout/canary.go
@@ -14,6 +14,7 @@ import (
 	"github.com/argoproj/argo-rollouts/utils/record"
 	replicasetutil "github.com/argoproj/argo-rollouts/utils/replicaset"
 	rolloututil "github.com/argoproj/argo-rollouts/utils/rollout"
+	serviceutil "github.com/argoproj/argo-rollouts/utils/service"
 )
 
 func (c *rolloutContext) rolloutCanary() error {
@@ -180,10 +181,9 @@ func (c *rolloutContext) scaleDownOldReplicaSetsForCanary(oldRSs []*appsv1.Repli
 
 	annotationedRSs := int32(0)
 	for _, targetRS := range oldRSs {
-		if replicasetutil.IsStillReferenced(c.rollout.Status, targetRS) {
-			// We should technically never get here because we shouldn't be passing a replicaset list
-			// which includes referenced ReplicaSets. But we check just in case
-			c.log.Warnf("Prevented inadvertent scaleDown of RS '%s'", targetRS.Name)
+		if c.isReplicaSetReferenced(targetRS) {
+			// We might get here if the user interrupted an update in order to move back to stable.
+			c.log.Infof("Skip scale down of older RS '%s': still referenced", targetRS.Name)
 			continue
 		}
 		if maxScaleDown <= 0 {
@@ -220,15 +220,8 @@ func (c *rolloutContext) scaleDownOldReplicaSetsForCanary(oldRSs []*appsv1.Repli
 			// and doesn't yet have scale down deadline. This happens when a user changes their
 			// mind in the middle of an V1 -> V2 update, and then applies a V3. We are deciding
 			// what to do with the defunct, intermediate V2 ReplicaSet right now.
-			if !c.replicaSetReferencedByCanaryTraffic(targetRS) {
-				// It is safe to scale the intermediate RS down, if no traffic is directed to it.
-				c.log.Infof("scaling down intermediate RS '%s'", targetRS.Name)
-			} else {
-				c.log.Infof("Skip scaling down intermediate RS '%s': still referenced by service", targetRS.Name)
-				// This ReplicaSet is still referenced by the service. It is not safe to scale
-				// this down.
-				continue
-			}
+			// It is safe to scale the intermediate RS down, since no traffic is directed to it.
+			c.log.Infof("scaling down intermediate RS '%s'", targetRS.Name)
 		}
 	}
 	if *targetRS.Spec.Replicas == desiredReplicaCount {
@@ -248,21 +241,61 @@ func (c *rolloutContext) scaleDownOldReplicaSetsForCanary(oldRSs []*appsv1.Repli
 	return totalScaledDown, nil
 }
 
-func (c *rolloutContext) replicaSetReferencedByCanaryTraffic(rs *appsv1.ReplicaSet) bool {
+// isReplicaSetReferenced returns whether the given ReplicaSet is still being referenced by any of
+// the current, stable, or blue-green active services. Used to determine if the ReplicaSet can
+// safely be scaled to zero, or deleted.
+func (c *rolloutContext) isReplicaSetReferenced(rs *appsv1.ReplicaSet) bool {
 	rsPodHash := replicasetutil.GetPodTemplateHash(rs)
 	ro := c.rollout
-
-	if ro.Status.Canary.Weights == nil {
-		return false
+	if rsPodHash != "" && (rsPodHash == ro.Status.StableRS || rsPodHash == ro.Status.CurrentPodHash || rsPodHash == ro.Status.BlueGreen.ActiveSelector) {
+		return true
 	}
-
-	if ro.Status.Canary.Weights.Canary.PodTemplateHash == rsPodHash || ro.Status.Canary.Weights.Stable.PodTemplateHash == rsPodHash {
+	var servicesToCheck []string
+	if ro.Spec.Strategy.Canary != nil {
+		servicesToCheck = []string{ro.Spec.Strategy.Canary.CanaryService, ro.Spec.Strategy.Canary.StableService}
+	} else {
+		servicesToCheck = []string{ro.Spec.Strategy.BlueGreen.ActiveService, ro.Spec.Strategy.BlueGreen.PreviewService}
+	}
+	for _, svcName := range servicesToCheck {
+		if svcName == "" {
+			continue
+		}
+		svc, err := c.servicesLister.Services(c.rollout.Namespace).Get(svcName)
+		if err != nil {
+			return true
+		}
+		if serviceutil.GetRolloutSelectorLabel(svc) == rsPodHash {
+			return true
+		}
+	}
+	if ro.Status.Canary.Weights != nil && (ro.Status.Canary.Weights.Canary.PodTemplateHash == rsPodHash || ro.Status.Canary.Weights.Stable.PodTemplateHash == rsPodHash) {
 		return true
 	}
-
 	return false
 }
 
+// isDynamicallyGoingBackToStable returns true if we were in the middle of a canary update with
+// dynamic stable scaling, but were interrupted and are now going back to the stable RS. This is
+// similar to, but different from, aborting. With abort, desired hash != stable hash, so we know
+// which two hashes to balance traffic against. But when dynamically going back to stable,
+// desired hash == stable hash, so we must use the *previous* desired hash and balance traffic
+// between the previous desired vs. stable hash, in order to safely shift traffic back to stable.
+// This function also returns the previous desired hash (where we are currently weighted).
+func (c *rolloutContext) isDynamicallyGoingBackToStable() (bool, string) {
+	if rolloututil.IsFullyPromoted(c.rollout) && c.rollout.Spec.Strategy.Canary.TrafficRouting != nil && c.rollout.Spec.Strategy.Canary.DynamicStableScale {
+		if c.rollout.Status.Canary.Weights != nil {
+			currSelector := c.rollout.Status.Canary.Weights.Canary.PodTemplateHash
+			desiredSelector := replicasetutil.GetPodTemplateHash(c.newRS)
+			if currSelector != desiredSelector {
+				if c.newRS.Status.AvailableReplicas < *c.rollout.Spec.Replicas {
+					return true, currSelector
+				}
+			}
+		}
+	}
+	return false, ""
+}
+
 // canProceedWithScaleDownAnnotation returns whether or not it is safe to proceed with annotating
 // old replicasets with the scale-down-deadline in the traffic-routed canary strategy.
 // This method only matters with ALB canary + the target group verification feature.
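An aside for reviewers, not part of the patch: the status-hash check at the top of isReplicaSetReferenced carries over the logic of the deleted replicasetutil.IsStillReferenced. Below is a minimal, runnable sketch of just that check, using a hypothetical trimmed-down status struct and made-up hashes; the real method additionally consults the canary/stable and active/preview Services, plus the canary traffic weights.

```go
package main

import "fmt"

// status is a hypothetical stand-in for the v1alpha1.RolloutStatus fields
// consulted by the first check in isReplicaSetReferenced.
type status struct {
	stableRS       string
	currentPodHash string
	activeSelector string
}

// referencedByStatus mirrors the status-hash check: an RS is referenced if its
// pod-template-hash matches any tracked hash; an empty hash never matches.
func referencedByStatus(s status, rsPodHash string) bool {
	return rsPodHash != "" &&
		(rsPodHash == s.stableRS || rsPodHash == s.currentPodHash || rsPodHash == s.activeSelector)
}

func main() {
	s := status{stableRS: "abc123", currentPodHash: "def456"}
	fmt.Println(referencedByStatus(s, "abc123")) // true: still the stable hash
	fmt.Println(referencedByStatus(s, "def456")) // true: still the desired (current) hash
	fmt.Println(referencedByStatus(s, "ghi789")) // false: scale-down candidate, pending service checks
	fmt.Println(referencedByStatus(s, ""))       // false: RS has no pod-template-hash label
}
```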
diff --git a/rollout/service.go b/rollout/service.go
index f808cb55fc..d0335ce260 100644
--- a/rollout/service.go
+++ b/rollout/service.go
@@ -147,7 +147,7 @@ func (c *rolloutContext) awsVerifyTargetGroups(svc *corev1.Service) error {
 		return nil
 	}
 
-	c.targetsVerified = pointer.BoolPtr(false)
+	c.targetsVerified = pointer.Bool(false)
 
 	// get endpoints of service
 	endpoints, err := c.kubeclientset.CoreV1().Endpoints(svc.Namespace).Get(ctx, svc.Name, metav1.GetOptions{})
@@ -177,7 +177,7 @@ func (c *rolloutContext) awsVerifyTargetGroups(svc *corev1.Service) error {
 		}
 		c.recorder.Eventf(c.rollout, record.EventOptions{EventReason: conditions.TargetGroupVerifiedReason}, conditions.TargetGroupVerifiedRegistrationMessage, svc.Name, tgb.Spec.TargetGroupARN, verifyRes.EndpointsRegistered)
 	}
-	c.targetsVerified = pointer.BoolPtr(true)
+	c.targetsVerified = pointer.Bool(true)
 	return nil
 }
 
@@ -266,6 +266,17 @@ func (c *rolloutContext) reconcileStableAndCanaryService() error {
 		return nil
 	}
 
+	if dynamicallyGoingBackToStable, currSelector := c.isDynamicallyGoingBackToStable(); dynamicallyGoingBackToStable {
+		// The user may have interrupted an update in order to go back to the stableRS, and is
+		// using dynamic stable scaling. If that is the case, the stableRS might be undersized,
+		// and if we blindly switched the service selector we could overwhelm the stableRS pods.
+		// If we get here, we detected that the canary service needs to be pointed back to
+		// stable, but stable is not fully available. Skip the service switch for now.
+ c.log.Infof("delaying fully promoted service switch of '%s' from %s to %s: ReplicaSet '%s' not fully available", + c.rollout.Spec.Strategy.Canary.CanaryService, currSelector, replicasetutil.GetPodTemplateHash(c.newRS), c.newRS.Name) + return nil + } + err = c.ensureSVCTargets(c.rollout.Spec.Strategy.Canary.CanaryService, c.newRS, true) if err != nil { return err diff --git a/rollout/trafficrouting.go b/rollout/trafficrouting.go index c7b3bf7055..16fc66a1b5 100644 --- a/rollout/trafficrouting.go +++ b/rollout/trafficrouting.go @@ -163,25 +163,20 @@ func (c *rolloutContext) reconcileTrafficRouting() error { canaryHash = c.newRS.Labels[v1alpha1.DefaultRolloutUniqueLabelKey] } - if rolloututil.IsFullyPromoted(c.rollout) { - // when we are fully promoted. desired canary weight should be 0 + if dynamicallyGoingBackToStable, prevDesiredHash := c.isDynamicallyGoingBackToStable(); dynamicallyGoingBackToStable { + desiredWeight = c.calculateDesiredWeightOnAbortOrStableRollback() + // Since stableRS == desiredRS, we must balance traffic between the + // *previous desired* vs. stable (as opposed to current desired vs. stable). + // The previous desired is remembered in Status.Canary.Weights.Canary.PodTemplateHash. + // See: https://github.com/argoproj/argo-rollouts/issues/3020 + canaryHash = prevDesiredHash + } else if rolloututil.IsFullyPromoted(c.rollout) { err := reconciler.RemoveManagedRoutes() if err != nil { return err } } else if c.pauseContext.IsAborted() { - // when aborted, desired canary weight should immediately be 0 (100% to stable), *unless* - // we are using dynamic stable scaling. In that case, we are dynamically decreasing the - // weight to the canary according to the availability of the stable (whatever it can support). - if c.rollout.Spec.Strategy.Canary.DynamicStableScale { - desiredWeight = 100 - ((100 * c.stableRS.Status.AvailableReplicas) / *c.rollout.Spec.Replicas) - if c.rollout.Status.Canary.Weights != nil { - // This ensures that if we are already at a lower weight, then we will not - // increase the weight because stable availability is flapping (e.g. pod restarts) - desiredWeight = minInt(desiredWeight, c.rollout.Status.Canary.Weights.Canary.Weight) - } - } - + desiredWeight = c.calculateDesiredWeightOnAbortOrStableRollback() if (c.rollout.Spec.Strategy.Canary.DynamicStableScale && desiredWeight == 0) || !c.rollout.Spec.Strategy.Canary.DynamicStableScale { // If we are using dynamic stable scale we need to also make sure that desiredWeight=0 aka we are completely // done with aborting before resetting the canary service selectors back to stable @@ -295,6 +290,26 @@ func (c *rolloutContext) reconcileTrafficRouting() error { return nil } +// calculateDesiredWeightOnAbortOrStableRollback returns the desired weight to use when we are either +// aborting, or rolling back to stable RS. +func (c *rolloutContext) calculateDesiredWeightOnAbortOrStableRollback() int32 { + if !c.rollout.Spec.Strategy.Canary.DynamicStableScale { + // When aborting or rolling back to stable RS and dynamicStableScaling is disabled, + // then desired canary weight should immediately be 0 (100% to stable) since we can trust + // that it is fully scaled up + return 0 + } + // When using dynamic stable scaling, we must dynamically decreasing the weight to the canary + // according to the availability of the stable (whatever it can support). 
+	desiredWeight := 100 - ((100 * c.stableRS.Status.AvailableReplicas) / *c.rollout.Spec.Replicas)
+	if c.rollout.Status.Canary.Weights != nil {
+		// This ensures that if we are already at a lower weight, then we will not
+		// increase the weight because stable availability is flapping (e.g. pod restarts)
+		desiredWeight = minInt(desiredWeight, c.rollout.Status.Canary.Weights.Canary.Weight)
+	}
+	return desiredWeight
+}
+
 // trafficWeightUpdatedMessage returns a message we emit for the kubernetes event whenever we adjust traffic weights
 func trafficWeightUpdatedMessage(prev, new *v1alpha1.TrafficWeights) string {
 	var details []string
diff --git a/utils/replicaset/replicaset.go b/utils/replicaset/replicaset.go
index 9aec161b66..b2664afd53 100644
--- a/utils/replicaset/replicaset.go
+++ b/utils/replicaset/replicaset.go
@@ -592,17 +592,6 @@ func (o ReplicaSetsByRevisionNumber) Less(i, j int) bool {
 	return iRevision < jRevision
 }
 
-// IsStillReferenced returns if the given ReplicaSet is still being referenced by any of
-// the current, stable, blue-green active references. Used to determine if the ReplicaSet can
-// safely be scaled to zero, or deleted.
-func IsStillReferenced(status v1alpha1.RolloutStatus, rs *appsv1.ReplicaSet) bool {
-	hash := GetPodTemplateHash(rs)
-	if hash != "" && (hash == status.StableRS || hash == status.CurrentPodHash || hash == status.BlueGreen.ActiveSelector) {
-		return true
-	}
-	return false
-}
-
 // HasScaleDownDeadline returns whether or not the given ReplicaSet is annotated with a scale-down delay
 func HasScaleDownDeadline(rs *appsv1.ReplicaSet) bool {
 	if rs == nil || rs.Annotations == nil {
diff --git a/utils/replicaset/replicaset_test.go b/utils/replicaset/replicaset_test.go
index 23bf320955..462fa2b835 100644
--- a/utils/replicaset/replicaset_test.go
+++ b/utils/replicaset/replicaset_test.go
@@ -1078,48 +1078,6 @@ func TestNeedsRestart(t *testing.T) {
 	})
 }
 
-func TestIsStillReferenced(t *testing.T) {
-	newRSWithPodTemplateHash := func(hash string) *appsv1.ReplicaSet {
-		return &appsv1.ReplicaSet{
-			ObjectMeta: metav1.ObjectMeta{
-				Labels: map[string]string{
-					v1alpha1.DefaultRolloutUniqueLabelKey: hash,
-				},
-			},
-		}
-	}
-	{
-		status := v1alpha1.RolloutStatus{StableRS: "abc123"}
-		rs := &appsv1.ReplicaSet{}
-		assert.False(t, IsStillReferenced(status, rs))
-	}
-	{
-		status := v1alpha1.RolloutStatus{StableRS: "abc123"}
-		rs := newRSWithPodTemplateHash("")
-		assert.False(t, IsStillReferenced(status, rs))
-	}
-	{
-		status := v1alpha1.RolloutStatus{StableRS: "abc123"}
-		rs := newRSWithPodTemplateHash("abc123")
-		assert.True(t, IsStillReferenced(status, rs))
-	}
-	{
-		status := v1alpha1.RolloutStatus{CurrentPodHash: "abc123"}
-		rs := newRSWithPodTemplateHash("abc123")
-		assert.True(t, IsStillReferenced(status, rs))
-	}
-	{
-		status := v1alpha1.RolloutStatus{BlueGreen: v1alpha1.BlueGreenStatus{ActiveSelector: "abc123"}}
-		rs := newRSWithPodTemplateHash("abc123")
-		assert.True(t, IsStillReferenced(status, rs))
-	}
-	{
-		status := v1alpha1.RolloutStatus{StableRS: "abc123"}
-		rs := newRSWithPodTemplateHash("def456")
-		assert.False(t, IsStillReferenced(status, rs))
-	}
-}
-
 func TestHasScaleDownDeadline(t *testing.T) {
 	{
 		assert.False(t, HasScaleDownDeadline(nil))
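A closing aside for reviewers, not part of the patch: the weight math consolidated into calculateDesiredWeightOnAbortOrStableRollback is easy to sanity-check in isolation. Below is a minimal runnable sketch with hypothetical replica counts; desiredWeight and minInt here are local stand-ins for illustration, not the rollout package's methods.

```go
package main

import "fmt"

// minInt mirrors the helper used in rollout/trafficrouting.go.
func minInt(a, b int32) int32 {
	if a < b {
		return a
	}
	return b
}

// desiredWeight reproduces the arithmetic in calculateDesiredWeightOnAbortOrStableRollback:
// traffic shifts back to stable only as fast as stable pods become available.
func desiredWeight(stableAvailable, specReplicas, prevWeight int32) int32 {
	w := 100 - ((100 * stableAvailable) / specReplicas)
	return minInt(w, prevWeight) // never increase the weight back toward the canary
}

func main() {
	// Stable is scaling 4 -> 10 pods while we roll back; previous canary weight was 100.
	fmt.Println(desiredWeight(4, 10, 100)) // 60: canary still takes 60% of traffic
	fmt.Println(desiredWeight(7, 10, 60))  // 30: weight falls as stable comes up
	fmt.Println(desiredWeight(10, 10, 30)) // 0: fully shifted back to stable
}
```

The minInt clamp implements the comment above it: if stable availability flaps (e.g. pod restarts), the weight may only keep moving toward stable, never back toward the canary.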