From 3ef7a60ff2af440d2e22cfafd35deb94e1e0ca4a Mon Sep 17 00:00:00 2001 From: Abhishek Bansal Date: Tue, 5 Nov 2024 11:24:53 +0530 Subject: [PATCH] fix: unexpected downtime in rollouts Signed-off-by: Abhishek Bansal --- rollout/trafficrouting.go | 22 +++++++++++ rollout/trafficrouting_test.go | 67 ++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+) diff --git a/rollout/trafficrouting.go b/rollout/trafficrouting.go index eef25e1c1c..76278a79be 100644 --- a/rollout/trafficrouting.go +++ b/rollout/trafficrouting.go @@ -27,6 +27,7 @@ import ( replicasetutil "github.com/argoproj/argo-rollouts/utils/replicaset" rolloututil "github.com/argoproj/argo-rollouts/utils/rollout" "github.com/argoproj/argo-rollouts/utils/weightutil" + appsv1 "k8s.io/api/apps/v1" ) // NewTrafficRoutingReconciler identifies return the TrafficRouting Plugin that the rollout wants to modify @@ -133,6 +134,23 @@ func (c *Controller) NewTrafficRoutingReconciler(roCtx *rolloutContext) ([]traff return nil, nil } +func (c *rolloutContext) checkReplicasAvailable(rs *appsv1.ReplicaSet, desiredWeight int32) bool { + if rs == nil { + return false + } + availableReplicas := rs.Status.AvailableReplicas + totalReplicas := *c.rollout.Spec.Replicas + + desiredReplicas := (desiredWeight * totalReplicas) / 100 + if availableReplicas < desiredReplicas { + c.log.Infof("ReplicaSet '%s' has %d available replicas, waiting for %d", rs.Name, availableReplicas, desiredReplicas) + return false + } + + return true + +} + // this currently only be used in the canary strategy func (c *rolloutContext) reconcileTrafficRouting() error { reconcilers, err := c.newTrafficRoutingReconciler(c) @@ -243,6 +261,10 @@ func (c *rolloutContext) reconcileTrafficRouting() error { desiredWeight = weightutil.MaxTrafficWeight(c.rollout) } } + + if !c.checkReplicasAvailable(c.stableRS, weightutil.MaxTrafficWeight(c.rollout)-desiredWeight) { + return nil + } // We need to check for revision > 1 because when we first install the rollout we run step 0 this prevents that. // There is a bigger fix needed for the reasons on why we run step 0 on rollout install, that needs to be explored. revision, revisionFound := annotations.GetRevisionAnnotation(c.rollout) diff --git a/rollout/trafficrouting_test.go b/rollout/trafficrouting_test.go index 288430f7a6..2b72fc95e3 100644 --- a/rollout/trafficrouting_test.go +++ b/rollout/trafficrouting_test.go @@ -1431,3 +1431,70 @@ func TestDontWeightOrHaveManagedRoutesDuringInterruptedUpdate(t *testing.T) { f.fakeTrafficRouting.AssertCalled(t, "RemoveManagedRoutes", mock.Anything, mock.Anything) } + +// This test verifies that if we are shifting traffic to stable replicaset without the stable replicaset being available proportional to the weight, the traffic shouldn't be switched immediately to the stable replicaset. +func TestCheckReplicaSetAvailable(t *testing.T) { + fix := newFixture(t) + defer fix.Close() + + steps := []v1alpha1.CanaryStep{ + { + SetWeight: pointer.Int32(60), + }, + { + Pause: &v1alpha1.RolloutPause{}, + }, + } + + rollout1 := newCanaryRollout("test-rollout", 10, nil, steps, pointer.Int32(1), intstr.FromInt(1), intstr.FromInt(1)) + rollout1.Spec.Strategy.Canary.DynamicStableScale = true + rollout1.Spec.Strategy.Canary.TrafficRouting = &v1alpha1.RolloutTrafficRouting{ + SMI: &v1alpha1.SMITrafficRouting{}, + } + rollout1.Spec.Strategy.Canary.CanaryService = "canary-service" + rollout1.Spec.Strategy.Canary.StableService = "stable-service" + rollout1.Status.ReadyReplicas = 10 + rollout1.Status.AvailableReplicas = 10 + + rollout2 := bumpVersion(rollout1) + + replicaSet1 := newReplicaSetWithStatus(rollout1, 1, 1) + replicaSet2 := newReplicaSetWithStatus(rollout2, 9, 9) + + replicaSet1Hash := replicaSet1.Labels[v1alpha1.DefaultRolloutUniqueLabelKey] + replicaSet2Hash := replicaSet2.Labels[v1alpha1.DefaultRolloutUniqueLabelKey] + canarySelector := map[string]string{v1alpha1.DefaultRolloutUniqueLabelKey: replicaSet2Hash} + stableSelector := map[string]string{v1alpha1.DefaultRolloutUniqueLabelKey: replicaSet1Hash} + canarySvc := newService("canary-service", 80, canarySelector, rollout1) + stableSvc := newService("stable-service", 80, stableSelector, rollout1) + + rollout2.Spec = rollout1.Spec + rollout2.Status.StableRS = replicaSet1Hash + rollout2.Status.CurrentPodHash = replicaSet1Hash + rollout2.Status.Canary.Weights = &v1alpha1.TrafficWeights{ + Canary: v1alpha1.WeightDestination{ + Weight: 10, + ServiceName: "canary-service", + PodTemplateHash: replicaSet2Hash, + }, + Stable: v1alpha1.WeightDestination{ + Weight: 90, + ServiceName: "stable-service", + PodTemplateHash: replicaSet1Hash, + }, + } + + fix.kubeobjects = append(fix.kubeobjects, replicaSet1, replicaSet2, canarySvc, stableSvc) + fix.replicaSetLister = append(fix.replicaSetLister, replicaSet1, replicaSet2) + + fix.rolloutLister = append(fix.rolloutLister, rollout2) + fix.objects = append(fix.objects, rollout2) + + fix.expectUpdateReplicaSetAction(replicaSet1) + fix.expectUpdateRolloutAction(rollout2) + fix.expectUpdateReplicaSetAction(replicaSet1) + fix.expectPatchRolloutAction(rollout2) + fix.fakeTrafficRouting = newUnmockedFakeTrafficRoutingReconciler() + fix.fakeTrafficRouting.On("RemoveManagedRoutes", mock.Anything, mock.Anything, mock.Anything).Return(nil) + fix.run(getKey(rollout1, t)) +}