fix: rollback to stable with dynamicStableScale could go under maxUnavailable

Signed-off-by: Jesse Suen <[email protected]>
jessesuen committed Oct 4, 2023
1 parent 6f5206f commit 319d0a8
Showing 7 changed files with 140 additions and 93 deletions.
7 changes: 3 additions & 4 deletions rollout/bluegreen.go
@@ -220,10 +220,9 @@ func (c *rolloutContext) scaleDownOldReplicaSetsForBlueGreen(oldRSs []*appsv1.ReplicaSet
     annotationedRSs := int32(0)
     rolloutReplicas := defaults.GetReplicasOrDefault(c.rollout.Spec.Replicas)
     for _, targetRS := range oldRSs {
-        if replicasetutil.IsStillReferenced(c.rollout.Status, targetRS) {
-            // We should technically never get here because we shouldn't be passing a replicaset list
-            // which includes referenced ReplicaSets. But we check just in case
-            c.log.Warnf("Prevented inadvertent scaleDown of RS '%s'", targetRS.Name)
+        if c.isReplicaSetReferenced(targetRS) {
+            // We might get here if user interrupted an update in order to move back to stable.
+            c.log.Infof("Skip scale down of older RS '%s': still referenced", targetRS.Name)
             continue
         }
         if *targetRS.Spec.Replicas == 0 {
73 changes: 53 additions & 20 deletions rollout/canary.go
@@ -14,6 +14,7 @@ import (
     "github.com/argoproj/argo-rollouts/utils/record"
     replicasetutil "github.com/argoproj/argo-rollouts/utils/replicaset"
     rolloututil "github.com/argoproj/argo-rollouts/utils/rollout"
+    serviceutil "github.com/argoproj/argo-rollouts/utils/service"
 )
 
 func (c *rolloutContext) rolloutCanary() error {
@@ -180,10 +181,9 @@ func (c *rolloutContext) scaleDownOldReplicaSetsForCanary(oldRSs []*appsv1.ReplicaSet
 
     annotationedRSs := int32(0)
     for _, targetRS := range oldRSs {
-        if replicasetutil.IsStillReferenced(c.rollout.Status, targetRS) {
-            // We should technically never get here because we shouldn't be passing a replicaset list
-            // which includes referenced ReplicaSets. But we check just in case
-            c.log.Warnf("Prevented inadvertent scaleDown of RS '%s'", targetRS.Name)
+        if c.isReplicaSetReferenced(targetRS) {
+            // We might get here if user interrupted an update in order to move back to stable.
+            c.log.Infof("Skip scale down of older RS '%s': still referenced", targetRS.Name)
             continue
         }
         if maxScaleDown <= 0 {
@@ -220,15 +220,8 @@ func (c *rolloutContext) scaleDownOldReplicaSetsForCanary(oldRSs []*appsv1.ReplicaSet
             // and doesn't yet have scale down deadline. This happens when a user changes their
             // mind in the middle of a V1 -> V2 update, and then applies a V3. We are deciding
             // what to do with the defunct, intermediate V2 ReplicaSet right now.
-            if !c.replicaSetReferencedByCanaryTraffic(targetRS) {
-                // It is safe to scale the intermediate RS down, if no traffic is directed to it.
-                c.log.Infof("scaling down intermediate RS '%s'", targetRS.Name)
-            } else {
-                c.log.Infof("Skip scaling down intermediate RS '%s': still referenced by service", targetRS.Name)
-                // This ReplicaSet is still referenced by the service. It is not safe to scale
-                // this down.
-                continue
-            }
+            // It is safe to scale the intermediate RS down, since no traffic is directed to it.
+            c.log.Infof("scaling down intermediate RS '%s'", targetRS.Name)
         }
     }
     if *targetRS.Spec.Replicas == desiredReplicaCount {
@@ -248,21 +241,61 @@ func (c *rolloutContext) scaleDownOldReplicaSetsForCanary(oldRSs []*appsv1.ReplicaSet
     return totalScaledDown, nil
 }
 
-func (c *rolloutContext) replicaSetReferencedByCanaryTraffic(rs *appsv1.ReplicaSet) bool {
+// isReplicaSetReferenced returns whether the given ReplicaSet is still being referenced by any of
+// the current, stable, or blue-green active services. Used to determine if the ReplicaSet can
+// safely be scaled to zero, or deleted.
+func (c *rolloutContext) isReplicaSetReferenced(rs *appsv1.ReplicaSet) bool {
     rsPodHash := replicasetutil.GetPodTemplateHash(rs)
     ro := c.rollout
 
-    if ro.Status.Canary.Weights == nil {
-        return false
+    if rsPodHash != "" && (rsPodHash == ro.Status.StableRS || rsPodHash == ro.Status.CurrentPodHash || rsPodHash == ro.Status.BlueGreen.ActiveSelector) {
+        return true
     }
-
-    if ro.Status.Canary.Weights.Canary.PodTemplateHash == rsPodHash || ro.Status.Canary.Weights.Stable.PodTemplateHash == rsPodHash {
+    var servicesToCheck []string
+    if ro.Spec.Strategy.Canary != nil {
+        servicesToCheck = []string{ro.Spec.Strategy.Canary.CanaryService, ro.Spec.Strategy.Canary.StableService}
+    } else {
+        servicesToCheck = []string{ro.Spec.Strategy.BlueGreen.ActiveService, ro.Spec.Strategy.BlueGreen.PreviewService}
+    }
+    for _, svcName := range servicesToCheck {
+        if svcName == "" {
+            continue
+        }
+        svc, err := c.servicesLister.Services(c.rollout.Namespace).Get(svcName)
+        if err != nil {
+            return true
+        }
+        if serviceutil.GetRolloutSelectorLabel(svc) == rsPodHash {
+            return true
+        }
+    }
+    if ro.Status.Canary.Weights != nil && (ro.Status.Canary.Weights.Canary.PodTemplateHash == rsPodHash || ro.Status.Canary.Weights.Stable.PodTemplateHash == rsPodHash) {
        return true
     }
 
     return false
 }
 
+// isDynamicallyGoingBackToStable returns true if we were in the middle of a canary update with
+// dynamic stable scaling, but were interrupted and are now going back to the stable RS. This is
+// similar to, but different than, aborting. With abort, desired hash != stable hash, so we know
+// the two hashes to balance traffic against. But when dynamically going back to stable, the
+// desired hash == stable hash, and so we must use the *previous* desired hash and balance traffic
+// between the previous desired vs. stable hash, in order to safely shift traffic back to stable.
+// This function also returns the previous desired hash (where we are currently weighted to).
+func (c *rolloutContext) isDynamicallyGoingBackToStable() (bool, string) {
+    if rolloututil.IsFullyPromoted(c.rollout) && c.rollout.Spec.Strategy.Canary.TrafficRouting != nil && c.rollout.Spec.Strategy.Canary.DynamicStableScale {
+        if c.rollout.Status.Canary.Weights != nil {
+            currSelector := c.rollout.Status.Canary.Weights.Canary.PodTemplateHash
+            desiredSelector := replicasetutil.GetPodTemplateHash(c.newRS)
+            if currSelector != desiredSelector {
+                if c.newRS.Status.AvailableReplicas < *c.rollout.Spec.Replicas {
+                    return true, currSelector
+                }
+            }
+        }
+    }
+    return false, ""
+}
+
 // canProceedWithScaleDownAnnotation returns whether or not it is safe to proceed with annotating
 // old replicasets with the scale-down-deadline in the traffic-routed canary strategy.
 // This method only matters with ALB canary + the target group verification feature.
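For illustration, a minimal self-contained sketch of the referencing check that isReplicaSetReferenced performs (the types and helper below are simplified stand-ins, not the real rollout API; note the real function also treats a failed Service lookup as "referenced", failing safe):

package main

import "fmt"

// status is a simplified stand-in for the relevant RolloutStatus fields.
type status struct {
	stableRS, currentPodHash, activeSelector string
}

// referenced reports whether rsHash is still pointed at by the rollout status
// or by any of the given Service selectors (empty service names are skipped).
func referenced(rsHash string, st status, svcSelectors map[string]string, svcNames []string) bool {
	if rsHash != "" && (rsHash == st.stableRS || rsHash == st.currentPodHash || rsHash == st.activeSelector) {
		return true
	}
	for _, name := range svcNames {
		if name == "" {
			continue
		}
		if svcSelectors[name] == rsHash {
			return true
		}
	}
	return false
}

func main() {
	st := status{stableRS: "v1hash", currentPodHash: "v1hash"}
	selectors := map[string]string{"canary-svc": "v2hash", "stable-svc": "v1hash"}
	// The defunct V2 ReplicaSet is still selected by the canary Service, so it
	// must not be scaled down yet; an unreferenced V3 hash may be scaled down.
	fmt.Println(referenced("v2hash", st, selectors, []string{"canary-svc", "stable-svc"})) // true
	fmt.Println(referenced("v3hash", st, selectors, []string{"canary-svc", "stable-svc"})) // false
}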
42 changes: 42 additions & 0 deletions rollout/canary_test.go
@@ -1890,3 +1890,45 @@ func TestHandleCanaryAbort(t *testing.T) {
         assert.JSONEq(t, calculatePatch(r1, fmt.Sprintf(expectedPatch, newConditions)), patch)
     })
 }
+
+// func TestIsStillReferenced(t *testing.T) {
+// 	newRSWithPodTemplateHash := func(hash string) *appsv1.ReplicaSet {
+// 		return &appsv1.ReplicaSet{
+// 			ObjectMeta: metav1.ObjectMeta{
+// 				Labels: map[string]string{
+// 					v1alpha1.DefaultRolloutUniqueLabelKey: hash,
+// 				},
+// 			},
+// 		}
+// 	}
+// 	{
+// 		status := v1alpha1.RolloutStatus{StableRS: "abc123"}
+// 		rs := &appsv1.ReplicaSet{}
+// 		assert.False(t, IsStillReferenced(status, rs))
+// 	}
+// 	{
+// 		status := v1alpha1.RolloutStatus{StableRS: "abc123"}
+// 		rs := newRSWithPodTemplateHash("")
+// 		assert.False(t, IsStillReferenced(status, rs))
+// 	}
+// 	{
+// 		status := v1alpha1.RolloutStatus{StableRS: "abc123"}
+// 		rs := newRSWithPodTemplateHash("abc123")
+// 		assert.True(t, IsStillReferenced(status, rs))
+// 	}
+// 	{
+// 		status := v1alpha1.RolloutStatus{CurrentPodHash: "abc123"}
+// 		rs := newRSWithPodTemplateHash("abc123")
+// 		assert.True(t, IsStillReferenced(status, rs))
+// 	}
+// 	{
+// 		status := v1alpha1.RolloutStatus{BlueGreen: v1alpha1.BlueGreenStatus{ActiveSelector: "abc123"}}
+// 		rs := newRSWithPodTemplateHash("abc123")
+// 		assert.True(t, IsStillReferenced(status, rs))
+// 	}
+// 	{
+// 		status := v1alpha1.RolloutStatus{StableRS: "abc123"}
+// 		rs := newRSWithPodTemplateHash("def456")
+// 		assert.False(t, IsStillReferenced(status, rs))
+// 	}
+// }
15 changes: 13 additions & 2 deletions rollout/service.go
@@ -147,7 +147,7 @@ func (c *rolloutContext) awsVerifyTargetGroups(svc *corev1.Service) error {
         return nil
     }
 
-    c.targetsVerified = pointer.BoolPtr(false)
+    c.targetsVerified = pointer.Bool(false)
 
     // get endpoints of service
     endpoints, err := c.kubeclientset.CoreV1().Endpoints(svc.Namespace).Get(ctx, svc.Name, metav1.GetOptions{})
@@ -177,7 +177,7 @@ func (c *rolloutContext) awsVerifyTargetGroups(svc *corev1.Service) error {
         }
         c.recorder.Eventf(c.rollout, record.EventOptions{EventReason: conditions.TargetGroupVerifiedReason}, conditions.TargetGroupVerifiedRegistrationMessage, svc.Name, tgb.Spec.TargetGroupARN, verifyRes.EndpointsRegistered)
     }
-    c.targetsVerified = pointer.BoolPtr(true)
+    c.targetsVerified = pointer.Bool(true)
     return nil
 }
 
@@ -266,6 +266,17 @@ func (c *rolloutContext) reconcileStableAndCanaryService() error {
         return nil
     }
 
+    if dynamicallyGoingBackToStable, currSelector := c.isDynamicallyGoingBackToStable(); dynamicallyGoingBackToStable {
+        // User may have interrupted an update in order to go back to the stableRS, and is using
+        // dynamic stable scaling. If that is the case, the stableRS might be undersized and if we
+        // blindly switch the service selector we could overwhelm the stableRS pods.
+        // If we get here, we detected that the canary service needs to be pointed back to
+        // stable, but stable is not fully available. Skip the service switch for now.
+        c.log.Infof("delaying fully promoted service switch of '%s' from %s to %s: ReplicaSet '%s' not fully available",
+            c.rollout.Spec.Strategy.Canary.CanaryService, currSelector, replicasetutil.GetPodTemplateHash(c.newRS), c.newRS.Name)
+        return nil
+    }
+
     err = c.ensureSVCTargets(c.rollout.Spec.Strategy.Canary.CanaryService, c.newRS, true)
     if err != nil {
         return err
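The scenario this guard protects against can be made concrete with a small sketch (simplified stand-in types, not the real rollout context): the rollout is fully promoted (stable hash == desired hash), but the traffic weights still point at the previous desired hash while the stable ReplicaSet is undersized, so the service switch must wait:

package main

import "fmt"

// weights is a simplified stand-in for Status.Canary.Weights.
type weights struct {
	canaryPodTemplateHash string // previous desired hash, remembered in status
}

// rollout is a simplified stand-in for the fields the real check consults.
type rollout struct {
	replicas           int32
	dynamicStableScale bool
	fullyPromoted      bool // stable hash == desired hash
	weights            *weights
	newRSHash          string // pod-template-hash of the desired (new) ReplicaSet
	newRSAvailable     int32
}

// goingBackToStable mirrors the logic of isDynamicallyGoingBackToStable: fully
// promoted with dynamic stable scaling, yet traffic weights still reference a
// different (previous desired) hash while the stable RS lacks full availability.
func goingBackToStable(ro rollout) (bool, string) {
	if ro.fullyPromoted && ro.dynamicStableScale && ro.weights != nil {
		if ro.weights.canaryPodTemplateHash != ro.newRSHash && ro.newRSAvailable < ro.replicas {
			return true, ro.weights.canaryPodTemplateHash
		}
	}
	return false, ""
}

func main() {
	// User interrupted a V1 -> V2 update by re-applying V1: stable == desired == "v1hash",
	// but weights still reference "v2hash" and V1 has only 4 of 10 pods available.
	ro := rollout{
		replicas:           10,
		dynamicStableScale: true,
		fullyPromoted:      true,
		weights:            &weights{canaryPodTemplateHash: "v2hash"},
		newRSHash:          "v1hash",
		newRSAvailable:     4,
	}
	back, prev := goingBackToStable(ro)
	fmt.Println(back, prev) // true v2hash: delay the switch; keep balancing v2hash vs. v1hash
}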
43 changes: 29 additions & 14 deletions rollout/trafficrouting.go
@@ -163,25 +163,20 @@ func (c *rolloutContext) reconcileTrafficRouting() error {
         canaryHash = c.newRS.Labels[v1alpha1.DefaultRolloutUniqueLabelKey]
     }
 
-    if rolloututil.IsFullyPromoted(c.rollout) {
-        // when we are fully promoted. desired canary weight should be 0
+    if dynamicallyGoingBackToStable, prevDesiredHash := c.isDynamicallyGoingBackToStable(); dynamicallyGoingBackToStable {
+        desiredWeight = c.calculateDesiredWeightOnAbortOrStableRollback()
+        // Since stableRS == desiredRS, we must balance traffic between the
+        // *previous desired* vs. stable (as opposed to current desired vs. stable).
+        // The previous desired is remembered in Status.Canary.Weights.Canary.PodTemplateHash.
+        // See: https://github.com/argoproj/argo-rollouts/issues/3020
+        canaryHash = prevDesiredHash
+    } else if rolloututil.IsFullyPromoted(c.rollout) {
         err := reconciler.RemoveManagedRoutes()
         if err != nil {
             return err
         }
     } else if c.pauseContext.IsAborted() {
-        // when aborted, desired canary weight should immediately be 0 (100% to stable), *unless*
-        // we are using dynamic stable scaling. In that case, we are dynamically decreasing the
-        // weight to the canary according to the availability of the stable (whatever it can support).
-        if c.rollout.Spec.Strategy.Canary.DynamicStableScale {
-            desiredWeight = 100 - ((100 * c.stableRS.Status.AvailableReplicas) / *c.rollout.Spec.Replicas)
-            if c.rollout.Status.Canary.Weights != nil {
-                // This ensures that if we are already at a lower weight, then we will not
-                // increase the weight because stable availability is flapping (e.g. pod restarts)
-                desiredWeight = minInt(desiredWeight, c.rollout.Status.Canary.Weights.Canary.Weight)
-            }
-        }
-
+        desiredWeight = c.calculateDesiredWeightOnAbortOrStableRollback()
         if (c.rollout.Spec.Strategy.Canary.DynamicStableScale && desiredWeight == 0) || !c.rollout.Spec.Strategy.Canary.DynamicStableScale {
             // If we are using dynamic stable scale we need to also make sure that desiredWeight=0 aka we are completely
             // done with aborting before resetting the canary service selectors back to stable
@@ -295,6 +290,26 @@ func (c *rolloutContext) reconcileTrafficRouting() error {
     return nil
 }
 
+// calculateDesiredWeightOnAbortOrStableRollback returns the desired weight to use when we are
+// either aborting, or rolling back to the stable RS.
+func (c *rolloutContext) calculateDesiredWeightOnAbortOrStableRollback() int32 {
+    if !c.rollout.Spec.Strategy.Canary.DynamicStableScale {
+        // When aborting or rolling back to the stable RS with dynamicStableScale disabled,
+        // the desired canary weight should immediately be 0 (100% to stable), since we can
+        // trust that the stable RS is fully scaled up
+        return 0
+    }
+    // When using dynamic stable scaling, we must dynamically decrease the weight to the canary
+    // according to the availability of the stable (whatever it can support).
+    desiredWeight := 100 - ((100 * c.stableRS.Status.AvailableReplicas) / *c.rollout.Spec.Replicas)
+    if c.rollout.Status.Canary.Weights != nil {
+        // This ensures that if we are already at a lower weight, then we will not
+        // increase the weight because stable availability is flapping (e.g. pod restarts)
+        desiredWeight = minInt(desiredWeight, c.rollout.Status.Canary.Weights.Canary.Weight)
+    }
+    return desiredWeight
+}
+
 // trafficWeightUpdatedMessage returns a message we emit for the kubernetes event whenever we adjust traffic weights
 func trafficWeightUpdatedMessage(prev, new *v1alpha1.TrafficWeights) string {
     var details []string
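A worked example of the dynamic-scaling formula above (invented values): with spec.replicas=10 and 4 available stable replicas, the desired canary weight is 100 - (100*4)/10 = 60, and the minInt clamp prevents the weight from climbing back up if stable availability flaps. A standalone sketch:

package main

import "fmt"

// minInt mirrors the helper used in trafficrouting.go.
func minInt(a, b int32) int32 {
	if a < b {
		return a
	}
	return b
}

// desiredWeight mirrors calculateDesiredWeightOnAbortOrStableRollback for the
// dynamic-stable-scale case (plain int32 arguments instead of the rollout types).
func desiredWeight(stableAvailable, replicas, prevCanaryWeight int32) int32 {
	w := 100 - ((100 * stableAvailable) / replicas)
	return minInt(w, prevCanaryWeight)
}

func main() {
	// Stable scales up 4 -> 7 -> 10 of 10 replicas; canary weight steps down 60 -> 30 -> 0.
	fmt.Println(desiredWeight(4, 10, 100)) // 60
	fmt.Println(desiredWeight(7, 10, 60))  // 30
	fmt.Println(desiredWeight(10, 10, 30)) // 0
	// If availability flaps back to 7/10, the clamp holds the weight at 0 rather
	// than increasing it back to 30.
	fmt.Println(desiredWeight(7, 10, 0)) // 0
}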
11 changes: 0 additions & 11 deletions utils/replicaset/replicaset.go
@@ -592,17 +592,6 @@ func (o ReplicaSetsByRevisionNumber) Less(i, j int) bool {
     return iRevision < jRevision
 }
 
-// IsStillReferenced returns if the given ReplicaSet is still being referenced by any of
-// the current, stable, blue-green active references. Used to determine if the ReplicaSet can
-// safely be scaled to zero, or deleted.
-func IsStillReferenced(status v1alpha1.RolloutStatus, rs *appsv1.ReplicaSet) bool {
-    hash := GetPodTemplateHash(rs)
-    if hash != "" && (hash == status.StableRS || hash == status.CurrentPodHash || hash == status.BlueGreen.ActiveSelector) {
-        return true
-    }
-    return false
-}
-
 // HasScaleDownDeadline returns whether or not the given ReplicaSet is annotated with a scale-down delay
 func HasScaleDownDeadline(rs *appsv1.ReplicaSet) bool {
     if rs == nil || rs.Annotations == nil {
42 changes: 0 additions & 42 deletions utils/replicaset/replicaset_test.go
@@ -1078,48 +1078,6 @@ func TestNeedsRestart(t *testing.T) {
     })
 }
 
-func TestIsStillReferenced(t *testing.T) {
-    newRSWithPodTemplateHash := func(hash string) *appsv1.ReplicaSet {
-        return &appsv1.ReplicaSet{
-            ObjectMeta: metav1.ObjectMeta{
-                Labels: map[string]string{
-                    v1alpha1.DefaultRolloutUniqueLabelKey: hash,
-                },
-            },
-        }
-    }
-    {
-        status := v1alpha1.RolloutStatus{StableRS: "abc123"}
-        rs := &appsv1.ReplicaSet{}
-        assert.False(t, IsStillReferenced(status, rs))
-    }
-    {
-        status := v1alpha1.RolloutStatus{StableRS: "abc123"}
-        rs := newRSWithPodTemplateHash("")
-        assert.False(t, IsStillReferenced(status, rs))
-    }
-    {
-        status := v1alpha1.RolloutStatus{StableRS: "abc123"}
-        rs := newRSWithPodTemplateHash("abc123")
-        assert.True(t, IsStillReferenced(status, rs))
-    }
-    {
-        status := v1alpha1.RolloutStatus{CurrentPodHash: "abc123"}
-        rs := newRSWithPodTemplateHash("abc123")
-        assert.True(t, IsStillReferenced(status, rs))
-    }
-    {
-        status := v1alpha1.RolloutStatus{BlueGreen: v1alpha1.BlueGreenStatus{ActiveSelector: "abc123"}}
-        rs := newRSWithPodTemplateHash("abc123")
-        assert.True(t, IsStillReferenced(status, rs))
-    }
-    {
-        status := v1alpha1.RolloutStatus{StableRS: "abc123"}
-        rs := newRSWithPodTemplateHash("def456")
-        assert.False(t, IsStillReferenced(status, rs))
-    }
-}
-
 func TestHasScaleDownDeadline(t *testing.T) {
     {
         assert.False(t, HasScaleDownDeadline(nil))
