Skip to content

Commit

Permalink
fix: rollback to stable with dynamicStableScale could overwhelm stabl…
Browse files Browse the repository at this point in the history
…e pods (argoproj#3077)

* fix: rollback to stable with dynamicStableScale could go under maxUnavailable

Signed-off-by: Jesse Suen <[email protected]>

* test: add unit tests

Signed-off-by: Jesse Suen <[email protected]>

* test: add e2e tests

Signed-off-by: Jesse Suen <[email protected]>

* refactor: move isReplicaSetReferenced to replicaset.go

Signed-off-by: Jesse Suen <[email protected]>

---------

Signed-off-by: Jesse Suen <[email protected]>
Signed-off-by: balasoiu <[email protected]>
  • Loading branch information
jessesuen authored and balasoiu committed Oct 27, 2023
1 parent 12c7c8f commit 3fc8d6e
Show file tree
Hide file tree
Showing 15 changed files with 631 additions and 124 deletions.
7 changes: 3 additions & 4 deletions rollout/bluegreen.go
Original file line number Diff line number Diff line change
Expand Up @@ -220,10 +220,9 @@ func (c *rolloutContext) scaleDownOldReplicaSetsForBlueGreen(oldRSs []*appsv1.Re
annotationedRSs := int32(0)
rolloutReplicas := defaults.GetReplicasOrDefault(c.rollout.Spec.Replicas)
for _, targetRS := range oldRSs {
if replicasetutil.IsStillReferenced(c.rollout.Status, targetRS) {
// We should technically never get here because we shouldn't be passing a replicaset list
// which includes referenced ReplicaSets. But we check just in case
c.log.Warnf("Prevented inadvertent scaleDown of RS '%s'", targetRS.Name)
if c.isReplicaSetReferenced(targetRS) {
// We might get here if the user interrupted an update in order to move back to stable.
c.log.Infof("Skip scale down of older RS '%s': still referenced", targetRS.Name)
continue
}
if *targetRS.Spec.Replicas == 0 {
Expand Down
49 changes: 24 additions & 25 deletions rollout/canary.go
Original file line number Diff line number Diff line change
Expand Up @@ -180,10 +180,9 @@ func (c *rolloutContext) scaleDownOldReplicaSetsForCanary(oldRSs []*appsv1.Repli

annotationedRSs := int32(0)
for _, targetRS := range oldRSs {
if replicasetutil.IsStillReferenced(c.rollout.Status, targetRS) {
// We should technically never get here because we shouldn't be passing a replicaset list
// which includes referenced ReplicaSets. But we check just in case
c.log.Warnf("Prevented inadvertent scaleDown of RS '%s'", targetRS.Name)
if c.isReplicaSetReferenced(targetRS) {
// We might get here if the user interrupted an update in order to move back to stable.
c.log.Infof("Skip scale down of older RS '%s': still referenced", targetRS.Name)
continue
}
if maxScaleDown <= 0 {
Expand Down Expand Up @@ -220,15 +219,8 @@ func (c *rolloutContext) scaleDownOldReplicaSetsForCanary(oldRSs []*appsv1.Repli
// and doesn't yet have scale down deadline. This happens when a user changes their
// mind in the middle of an V1 -> V2 update, and then applies a V3. We are deciding
// what to do with the defunct, intermediate V2 ReplicaSet right now.
if !c.replicaSetReferencedByCanaryTraffic(targetRS) {
// It is safe to scale the intermediate RS down, if no traffic is directed to it.
c.log.Infof("scaling down intermediate RS '%s'", targetRS.Name)
} else {
c.log.Infof("Skip scaling down intermediate RS '%s': still referenced by service", targetRS.Name)
// This ReplicaSet is still referenced by the service. It is not safe to scale
// this down.
continue
}
// It is safe to scale the intermediate RS down, since no traffic is directed to it.
c.log.Infof("scaling down intermediate RS '%s'", targetRS.Name)
}
}
if *targetRS.Spec.Replicas == desiredReplicaCount {
Expand All @@ -248,19 +240,26 @@ func (c *rolloutContext) scaleDownOldReplicaSetsForCanary(oldRSs []*appsv1.Repli
return totalScaledDown, nil
}

func (c *rolloutContext) replicaSetReferencedByCanaryTraffic(rs *appsv1.ReplicaSet) bool {
rsPodHash := replicasetutil.GetPodTemplateHash(rs)
ro := c.rollout

if ro.Status.Canary.Weights == nil {
return false
}

if ro.Status.Canary.Weights.Canary.PodTemplateHash == rsPodHash || ro.Status.Canary.Weights.Stable.PodTemplateHash == rsPodHash {
return true
// isDynamicallyRollingBackToStable returns true if we were in the middle of a canary update with
// dynamic stable scaling, but were interrupted and are now rolling back to the stable RS. This is
// similar to, but different from, aborting. With abort, desired hash != stable hash and so we know
// the two hashes to balance traffic against. But with dynamically rolling back to stable, the
// desired hash == stable hash, and so we must use the *previous* desired hash and balance traffic
// between previous desired vs. stable hash, in order to safely shift traffic back to stable.
// This function also returns the previous desired hash (where we are weighted to).
func isDynamicallyRollingBackToStable(ro *v1alpha1.Rollout, desiredRS *appsv1.ReplicaSet) (bool, string) {
if rolloututil.IsFullyPromoted(ro) && ro.Spec.Strategy.Canary.TrafficRouting != nil && ro.Spec.Strategy.Canary.DynamicStableScale {
if ro.Status.Canary.Weights != nil {
currSelector := ro.Status.Canary.Weights.Canary.PodTemplateHash
desiredSelector := replicasetutil.GetPodTemplateHash(desiredRS)
if currSelector != desiredSelector {
if desiredRS.Status.AvailableReplicas < *ro.Spec.Replicas {
return true, currSelector
}
}
}
}

return false
return false, ""
}

// canProceedWithScaleDownAnnotation returns whether or not it is safe to proceed with annotating
Expand Down
118 changes: 118 additions & 0 deletions rollout/canary_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"time"

"github.com/stretchr/testify/assert"
appsv1 "k8s.io/api/apps/v1"
v1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand Down Expand Up @@ -1890,3 +1891,120 @@ func TestHandleCanaryAbort(t *testing.T) {
assert.JSONEq(t, calculatePatch(r1, fmt.Sprintf(expectedPatch, newConditions)), patch)
})
}

// TestIsDynamicallyRollingBackToStable exercises isDynamicallyRollingBackToStable against a
// 10-replica canary rollout. It checks both return values: the boolean verdict and the previous
// desired hash that traffic is still weighted to (expected to be empty whenever the verdict is
// false).
func TestIsDynamicallyRollingBackToStable(t *testing.T) {
	// newRSWithHashAndReplicas builds a minimal ReplicaSet carrying only what the function under
	// test reads: the pod-template-hash label and the available replica count.
	newRSWithHashAndReplicas := func(hash string, available int32) *appsv1.ReplicaSet {
		return &appsv1.ReplicaSet{
			ObjectMeta: metav1.ObjectMeta{
				Labels: map[string]string{
					v1alpha1.DefaultRolloutUniqueLabelKey: hash,
				},
			},
			Status: appsv1.ReplicaSetStatus{
				AvailableReplicas: available,
			},
		}
	}

	testCases := []struct {
		name                         string
		status                       v1alpha1.RolloutStatus
		trafficRoutingDisabled       bool
		dynamicStableScalingDisabled bool
		rsHash                       string
		rsAvailableReplicas          *int32 // if nil, will set to rollout replicas
		expectedResult               bool
		expectedHash                 string // previous desired hash; must be "" when expectedResult is false
	}{
		{
			name:           "desired RS != stable RS",
			status:         v1alpha1.RolloutStatus{CurrentPodHash: "abc123", StableRS: "def456"},
			rsHash:         "",
			expectedResult: false,
		},
		{
			name:                   "not using traffic routing",
			trafficRoutingDisabled: true,
			status:                 v1alpha1.RolloutStatus{CurrentPodHash: "abc123", StableRS: "abc123"},
			rsHash:                 "",
			expectedResult:         false,
		},
		{
			name:                         "not using dynamicStableScaling",
			dynamicStableScalingDisabled: true,
			status:                       v1alpha1.RolloutStatus{CurrentPodHash: "abc123", StableRS: "abc123"},
			rsHash:                       "",
			expectedResult:               false,
		},
		{
			name: "weighted selector == desired RS",
			status: v1alpha1.RolloutStatus{
				CurrentPodHash: "abc123",
				StableRS:       "abc123",
				Canary: v1alpha1.CanaryStatus{
					Weights: &v1alpha1.TrafficWeights{
						Canary: v1alpha1.WeightDestination{
							PodTemplateHash: "abc123",
						},
					},
				},
			},
			rsHash:         "abc123",
			expectedResult: false,
		},
		{
			name: "weighted selector != desired RS, desired not fully available",
			status: v1alpha1.RolloutStatus{
				CurrentPodHash: "abc123",
				StableRS:       "abc123",
				Canary: v1alpha1.CanaryStatus{
					Weights: &v1alpha1.TrafficWeights{
						Canary: v1alpha1.WeightDestination{
							PodTemplateHash: "def456",
						},
					},
				},
			},
			rsHash:              "abc123",
			rsAvailableReplicas: pointer.Int32(1),
			expectedResult:      true,
			expectedHash:        "def456",
		},
		{
			// Once the desired (stable) RS has every replica available there is nothing left to
			// balance against, so we are no longer considered to be rolling back.
			name: "weighted selector != desired RS, desired RS is fully available",
			status: v1alpha1.RolloutStatus{
				CurrentPodHash: "abc123",
				StableRS:       "abc123",
				Canary: v1alpha1.CanaryStatus{
					Weights: &v1alpha1.TrafficWeights{
						Canary: v1alpha1.WeightDestination{
							PodTemplateHash: "def456",
						},
					},
				},
			},
			rsHash:         "abc123",
			expectedResult: false,
		},
	}
	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			ro := newCanaryRollout("test", 10, nil, nil, nil, intstr.FromInt(0), intstr.FromInt(1))
			if !tc.trafficRoutingDisabled {
				ro.Spec.Strategy.Canary.TrafficRouting = &v1alpha1.RolloutTrafficRouting{}
			}
			if !tc.dynamicStableScalingDisabled {
				ro.Spec.Strategy.Canary.DynamicStableScale = true
			}
			ro.Status = tc.status

			// Default to a fully available desired RS (available == rollout replicas) unless the
			// test case pins an explicit count. Falls back to 1 if the rollout has no replica
			// count set, which newCanaryRollout is not expected to produce.
			available := int32(1)
			switch {
			case tc.rsAvailableReplicas != nil:
				available = *tc.rsAvailableReplicas
			case ro.Spec.Replicas != nil:
				available = *ro.Spec.Replicas
			}
			desiredRS := newRSWithHashAndReplicas(tc.rsHash, available)

			rbToStable, prevDesiredHash := isDynamicallyRollingBackToStable(ro, desiredRS)

			assert.Equal(t, tc.expectedResult, rbToStable)
			assert.Equal(t, tc.expectedHash, prevDesiredHash)
		})
	}
}
55 changes: 55 additions & 0 deletions rollout/replicaset.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"time"

appsv1 "k8s.io/api/apps/v1"
k8serrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
patchtypes "k8s.io/apimachinery/pkg/types"
Expand All @@ -15,6 +16,7 @@ import (
"github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1"
"github.com/argoproj/argo-rollouts/utils/defaults"
replicasetutil "github.com/argoproj/argo-rollouts/utils/replicaset"
serviceutil "github.com/argoproj/argo-rollouts/utils/service"
timeutil "github.com/argoproj/argo-rollouts/utils/time"
)

Expand Down Expand Up @@ -296,3 +298,56 @@ func (c *rolloutContext) scaleDownDelayHelper(rs *appsv1.ReplicaSet, annotatione

return annotationedRSs, desiredReplicaCount, nil
}

// isReplicaSetReferenced reports whether the given ReplicaSet is still being referenced by the
// rollout, and therefore must not be scaled to zero or deleted. A ReplicaSet counts as referenced
// when its pod template hash matches:
//   - any selector recorded in the rollout status (stable, current, blue-green active/preview,
//     or the canary/stable traffic weight destinations), or
//   - the rollout selector label of any service named in the rollout's strategy (canary/stable
//     services for canary, active/preview services for blue-green).
//
// A ReplicaSet without a pod template hash is never considered referenced. If a service lookup
// fails for any reason other than the service not existing, the ReplicaSet is conservatively
// reported as referenced.
func (c *rolloutContext) isReplicaSetReferenced(rs *appsv1.ReplicaSet) bool {
	rsPodHash := replicasetutil.GetPodTemplateHash(rs)
	if rsPodHash == "" {
		return false
	}
	ro := c.rollout
	referencesToCheck := []string{
		ro.Status.StableRS,
		ro.Status.CurrentPodHash,
		ro.Status.BlueGreen.ActiveSelector,
		ro.Status.BlueGreen.PreviewSelector,
	}
	if ro.Status.Canary.Weights != nil {
		referencesToCheck = append(referencesToCheck, ro.Status.Canary.Weights.Canary.PodTemplateHash, ro.Status.Canary.Weights.Stable.PodTemplateHash)
	}
	// Unset (empty) references can never match here because rsPodHash is known to be non-empty.
	for _, ref := range referencesToCheck {
		if ref == rsPodHash {
			return true
		}
	}

	// The above are static, lightweight checks to see if the selectors we record in our status are
	// still referencing the ReplicaSet in question. Those checks aren't always enough. Next, we do
	// a deeper check to look up the actual service objects, and see if they are still referencing
	// the ReplicaSet. If so, we cannot scale it down.
	var servicesToCheck []string
	switch {
	case ro.Spec.Strategy.Canary != nil:
		servicesToCheck = []string{ro.Spec.Strategy.Canary.CanaryService, ro.Spec.Strategy.Canary.StableService}
	case ro.Spec.Strategy.BlueGreen != nil:
		servicesToCheck = []string{ro.Spec.Strategy.BlueGreen.ActiveService, ro.Spec.Strategy.BlueGreen.PreviewService}
	}
	// If neither strategy is set there are no services to inspect; fall through and report the
	// ReplicaSet as unreferenced rather than dereferencing a nil strategy.
	for _, svcName := range servicesToCheck {
		if svcName == "" {
			continue
		}
		svc, err := c.servicesLister.Services(ro.Namespace).Get(svcName)
		if err != nil {
			if k8serrors.IsNotFound(err) {
				// service doesn't exist, so it cannot be referencing the ReplicaSet
				continue
			}
			// We could not determine what the service points at. Err on the side of caution and
			// treat the ReplicaSet as referenced so it is not scaled down by mistake.
			c.log.Warnf("Unable to get service '%s' while checking references to RS '%s': %v", svcName, rs.Name, err)
			return true
		}
		if serviceutil.GetRolloutSelectorLabel(svc) == rsPodHash {
			return true
		}
	}
	return false
}
Loading

0 comments on commit 3fc8d6e

Please sign in to comment.