Skip to content

Commit

Permalink
fix: unexpected downtime in rollouts
Browse files Browse the repository at this point in the history
Signed-off-by: Abhishek Bansal <[email protected]>
  • Loading branch information
Abhishek Bansal committed Nov 5, 2024
1 parent 53c4f12 commit 3ef7a60
Show file tree
Hide file tree
Showing 2 changed files with 89 additions and 0 deletions.
22 changes: 22 additions & 0 deletions rollout/trafficrouting.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import (
replicasetutil "github.com/argoproj/argo-rollouts/utils/replicaset"
rolloututil "github.com/argoproj/argo-rollouts/utils/rollout"
"github.com/argoproj/argo-rollouts/utils/weightutil"
appsv1 "k8s.io/api/apps/v1"
)

// NewTrafficRoutingReconciler identifies return the TrafficRouting Plugin that the rollout wants to modify
Expand Down Expand Up @@ -133,6 +134,23 @@ func (c *Controller) NewTrafficRoutingReconciler(roCtx *rolloutContext) ([]traff
return nil, nil
}

func (c *rolloutContext) checkReplicasAvailable(rs *appsv1.ReplicaSet, desiredWeight int32) bool {
if rs == nil {
return false
}
availableReplicas := rs.Status.AvailableReplicas
totalReplicas := *c.rollout.Spec.Replicas

desiredReplicas := (desiredWeight * totalReplicas) / 100
if availableReplicas < desiredReplicas {
c.log.Infof("ReplicaSet '%s' has %d available replicas, waiting for %d", rs.Name, availableReplicas, desiredReplicas)
return false
}

return true

}

// this currently only be used in the canary strategy
func (c *rolloutContext) reconcileTrafficRouting() error {
reconcilers, err := c.newTrafficRoutingReconciler(c)
Expand Down Expand Up @@ -243,6 +261,10 @@ func (c *rolloutContext) reconcileTrafficRouting() error {
desiredWeight = weightutil.MaxTrafficWeight(c.rollout)
}
}

if !c.checkReplicasAvailable(c.stableRS, weightutil.MaxTrafficWeight(c.rollout)-desiredWeight) {
return nil
}
// We need to check for revision > 1 because when we first install the rollout we run step 0 this prevents that.
// There is a bigger fix needed for the reasons on why we run step 0 on rollout install, that needs to be explored.
revision, revisionFound := annotations.GetRevisionAnnotation(c.rollout)
Expand Down
67 changes: 67 additions & 0 deletions rollout/trafficrouting_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1431,3 +1431,70 @@ func TestDontWeightOrHaveManagedRoutesDuringInterruptedUpdate(t *testing.T) {
f.fakeTrafficRouting.AssertCalled(t, "RemoveManagedRoutes", mock.Anything, mock.Anything)

}

// This test verifies that if we are shifting traffic to stable replicaset without the stable replicaset being available proportional to the weight, the traffic shouldn't be switched immediately to the stable replicaset.
func TestCheckReplicaSetAvailable(t *testing.T) {
fix := newFixture(t)
defer fix.Close()

steps := []v1alpha1.CanaryStep{
{
SetWeight: pointer.Int32(60),
},
{
Pause: &v1alpha1.RolloutPause{},
},
}

rollout1 := newCanaryRollout("test-rollout", 10, nil, steps, pointer.Int32(1), intstr.FromInt(1), intstr.FromInt(1))
rollout1.Spec.Strategy.Canary.DynamicStableScale = true
rollout1.Spec.Strategy.Canary.TrafficRouting = &v1alpha1.RolloutTrafficRouting{
SMI: &v1alpha1.SMITrafficRouting{},
}
rollout1.Spec.Strategy.Canary.CanaryService = "canary-service"
rollout1.Spec.Strategy.Canary.StableService = "stable-service"
rollout1.Status.ReadyReplicas = 10
rollout1.Status.AvailableReplicas = 10

rollout2 := bumpVersion(rollout1)

replicaSet1 := newReplicaSetWithStatus(rollout1, 1, 1)
replicaSet2 := newReplicaSetWithStatus(rollout2, 9, 9)

replicaSet1Hash := replicaSet1.Labels[v1alpha1.DefaultRolloutUniqueLabelKey]
replicaSet2Hash := replicaSet2.Labels[v1alpha1.DefaultRolloutUniqueLabelKey]
canarySelector := map[string]string{v1alpha1.DefaultRolloutUniqueLabelKey: replicaSet2Hash}
stableSelector := map[string]string{v1alpha1.DefaultRolloutUniqueLabelKey: replicaSet1Hash}
canarySvc := newService("canary-service", 80, canarySelector, rollout1)
stableSvc := newService("stable-service", 80, stableSelector, rollout1)

rollout2.Spec = rollout1.Spec
rollout2.Status.StableRS = replicaSet1Hash
rollout2.Status.CurrentPodHash = replicaSet1Hash
rollout2.Status.Canary.Weights = &v1alpha1.TrafficWeights{
Canary: v1alpha1.WeightDestination{
Weight: 10,
ServiceName: "canary-service",
PodTemplateHash: replicaSet2Hash,
},
Stable: v1alpha1.WeightDestination{
Weight: 90,
ServiceName: "stable-service",
PodTemplateHash: replicaSet1Hash,
},
}

fix.kubeobjects = append(fix.kubeobjects, replicaSet1, replicaSet2, canarySvc, stableSvc)
fix.replicaSetLister = append(fix.replicaSetLister, replicaSet1, replicaSet2)

fix.rolloutLister = append(fix.rolloutLister, rollout2)
fix.objects = append(fix.objects, rollout2)

fix.expectUpdateReplicaSetAction(replicaSet1)
fix.expectUpdateRolloutAction(rollout2)
fix.expectUpdateReplicaSetAction(replicaSet1)
fix.expectPatchRolloutAction(rollout2)
fix.fakeTrafficRouting = newUnmockedFakeTrafficRoutingReconciler()
fix.fakeTrafficRouting.On("RemoveManagedRoutes", mock.Anything, mock.Anything, mock.Anything).Return(nil)
fix.run(getKey(rollout1, t))
}

0 comments on commit 3ef7a60

Please sign in to comment.