Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

autoupdate rollout: honour the maintenance window duration #50745

Merged
merged 4 commits into from
Jan 8, 2025
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 11 additions & 10 deletions lib/autoupdate/rollout/reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -202,11 +202,12 @@ func (r *reconciler) buildRolloutSpec(config *autoupdate.AutoUpdateConfigSpecAge
}

return &autoupdate.AutoUpdateAgentRolloutSpec{
StartVersion: version.GetStartVersion(),
TargetVersion: version.GetTargetVersion(),
Schedule: version.GetSchedule(),
AutoupdateMode: mode,
Strategy: strategy,
StartVersion: version.GetStartVersion(),
TargetVersion: version.GetTargetVersion(),
Schedule: version.GetSchedule(),
AutoupdateMode: mode,
Strategy: strategy,
MaintenanceWindowDuration: config.GetMaintenanceWindowDuration(),
}, nil

}
Expand Down Expand Up @@ -318,7 +319,7 @@ func (r *reconciler) computeStatus(
}
status.Groups = groups

err = r.progressRollout(ctx, newSpec.GetStrategy(), status, now)
err = r.progressRollout(ctx, newSpec, status, now)
// Failing to progress the update is not a hard failure.
// We want to update the status even if something went wrong to surface the failed reconciliation and potential errors to the user.
if err != nil {
Expand All @@ -334,13 +335,13 @@ func (r *reconciler) computeStatus(
// groups are updated in place.
// If an error is returned, the groups should still be upserted, depending on the strategy,
// failing to update a group might not be fatal (other groups can still progress independently).
func (r *reconciler) progressRollout(ctx context.Context, strategyName string, status *autoupdate.AutoUpdateAgentRolloutStatus, now time.Time) error {
func (r *reconciler) progressRollout(ctx context.Context, spec *autoupdate.AutoUpdateAgentRolloutSpec, status *autoupdate.AutoUpdateAgentRolloutStatus, now time.Time) error {
for _, strategy := range r.rolloutStrategies {
if strategy.name() == strategyName {
return strategy.progressRollout(ctx, status, now)
if strategy.name() == spec.GetStrategy() {
return strategy.progressRollout(ctx, spec, status, now)
}
}
return trace.NotImplemented("rollout strategy %q not implemented", strategyName)
return trace.NotImplemented("rollout strategy %q not implemented", spec.GetStrategy())
}

// makeGroupStatus creates the autoupdate_agent_rollout.status.groups based on the autoupdate_config.
Expand Down
2 changes: 1 addition & 1 deletion lib/autoupdate/rollout/reconciler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -714,7 +714,7 @@ func (f *fakeRolloutStrategy) name() string {
return f.strategyName
}

func (f *fakeRolloutStrategy) progressRollout(ctx context.Context, status *autoupdate.AutoUpdateAgentRolloutStatus, now time.Time) error {
func (f *fakeRolloutStrategy) progressRollout(ctx context.Context, spec *autoupdate.AutoUpdateAgentRolloutSpec, status *autoupdate.AutoUpdateAgentRolloutStatus, now time.Time) error {
f.calls++
return nil
}
Expand Down
19 changes: 14 additions & 5 deletions lib/autoupdate/rollout/strategy.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,28 +40,37 @@ const (
// This interface allows us to inject dummy strategies for simpler testing.
type rolloutStrategy interface {
name() string
progressRollout(context.Context, *autoupdate.AutoUpdateAgentRolloutStatus, time.Time) error
// progressRollout takes the new rollout spec, existing rollout status and current time.
// It updates the status resource in-place to progress the rollout to the next step if possible/needed.
progressRollout(context.Context, *autoupdate.AutoUpdateAgentRolloutSpec, *autoupdate.AutoUpdateAgentRolloutStatus, time.Time) error
}

func inWindow(group *autoupdate.AutoUpdateAgentRolloutStatusGroup, now time.Time) (bool, error) {
// inWindow checks if the time is in the grou's maintenance window.
// The maintenance window is a semi-open interval: [windowStart; windowEnd).
hugoShaka marked this conversation as resolved.
Show resolved Hide resolved
func inWindow(group *autoupdate.AutoUpdateAgentRolloutStatusGroup, now time.Time, duration time.Duration) (bool, error) {
dayOK, err := canUpdateToday(group.ConfigDays, now)
if err != nil {
return false, trace.Wrap(err, "checking the day of the week")
}
if !dayOK {
return false, nil
}
return int(group.ConfigStartHour) == now.Hour(), nil

// We compute the theoretical window start and end
windowStart := now.Truncate(24 * time.Hour).Add(time.Duration(group.ConfigStartHour) * time.Hour)
bl-nero marked this conversation as resolved.
Show resolved Hide resolved
windowEnd := windowStart.Add(duration)

return !now.Before(windowStart) && now.Before(windowEnd), nil
}

// rolloutChangedInWindow checks if the rollout got created after the theoretical group start time
func rolloutChangedInWindow(group *autoupdate.AutoUpdateAgentRolloutStatusGroup, now, rolloutStart time.Time) (bool, error) {
func rolloutChangedInWindow(group *autoupdate.AutoUpdateAgentRolloutStatusGroup, now, rolloutStart time.Time, duration time.Duration) (bool, error) {
// If the rollout is older than 24h, we know it did not change during the window
if now.Sub(rolloutStart) > 24*time.Hour {
return false, nil
}
// Else we check if the rollout happened in the group window.
return inWindow(group, rolloutStart)
return inWindow(group, rolloutStart, duration)
}

func canUpdateToday(allowedDays []string, now time.Time) (bool, error) {
Expand Down
9 changes: 5 additions & 4 deletions lib/autoupdate/rollout/strategy_haltonerror.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ const (
updateReasonPreviousGroupsNotDone = "previous_groups_not_done"
updateReasonUpdateComplete = "update_complete"
updateReasonUpdateInProgress = "update_in_progress"
haltOnErrorWindowDuration = time.Hour
)

type haltOnErrorStrategy struct {
Expand All @@ -54,7 +55,7 @@ func newHaltOnErrorStrategy(log *slog.Logger) (rolloutStrategy, error) {
}, nil
}

func (h *haltOnErrorStrategy) progressRollout(ctx context.Context, status *autoupdate.AutoUpdateAgentRolloutStatus, now time.Time) error {
func (h *haltOnErrorStrategy) progressRollout(ctx context.Context, _ *autoupdate.AutoUpdateAgentRolloutSpec, status *autoupdate.AutoUpdateAgentRolloutStatus, now time.Time) error {
bl-nero marked this conversation as resolved.
Show resolved Hide resolved
// We process every group in order, all the previous groups must be in the DONE state
// for the next group to become active. Even if some early groups are not DONE,
// later groups might be ACTIVE and need to transition to DONE, so we cannot
Expand All @@ -81,7 +82,7 @@ func (h *haltOnErrorStrategy) progressRollout(ctx context.Context, status *autou
}

// Check if the rollout got created after the theoretical group start time
rolloutChangedDuringWindow, err := rolloutChangedInWindow(group, now, status.StartTime.AsTime())
rolloutChangedDuringWindow, err := rolloutChangedInWindow(group, now, status.StartTime.AsTime(), haltOnErrorWindowDuration)
if err != nil {
setGroupState(group, group.State, updateReasonReconcilerError, now)
return err
Expand Down Expand Up @@ -149,14 +150,14 @@ func canStartHaltOnError(group, previousGroup *autoupdate.AutoUpdateAgentRollout
}
}

return inWindow(group, now)
return inWindow(group, now, haltOnErrorWindowDuration)
}

func isDoneHaltOnError(group *autoupdate.AutoUpdateAgentRolloutStatusGroup, now time.Time) (bool, string) {
// Currently we don't implement status reporting from groups/agents.
// So we just wait 60 minutes and consider the maintenance done.
// This will change as we introduce agent status report and aggregated agent counts.
if group.StartTime.AsTime().Add(time.Hour).Before(now) {
if group.StartTime.AsTime().Add(haltOnErrorWindowDuration).Before(now) {
return true, updateReasonUpdateComplete
}
return false, updateReasonUpdateInProgress
Expand Down
2 changes: 1 addition & 1 deletion lib/autoupdate/rollout/strategy_haltonerror_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -500,7 +500,7 @@ func Test_progressGroupsHaltOnError(t *testing.T) {
State: 0,
StartTime: tt.rolloutStartTime,
}
err := strategy.progressRollout(ctx, status, clock.Now())
err := strategy.progressRollout(ctx, nil, status, clock.Now())
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is calling progressRollout with a nil spec expected to happen in practice? Should this error?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's a single "real" call to this interface and it always sets the spec. Strategies can expect the spec to be always set.

The nil spec in the test is me taking a shortcut because I know that this specific strategy doesn't rely on the spec.

require.NoError(t, err)
// We use require.Equal instead of Elements match because group order matters.
// It's not super important for time-based, but is crucial for halt-on-error.
Expand Down
76 changes: 57 additions & 19 deletions lib/autoupdate/rollout/strategy_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,56 +95,94 @@ func Test_canUpdateToday(t *testing.T) {

func Test_inWindow(t *testing.T) {
tests := []struct {
name string
group *autoupdate.AutoUpdateAgentRolloutStatusGroup
now time.Time
want bool
wantErr require.ErrorAssertionFunc
name string
group *autoupdate.AutoUpdateAgentRolloutStatusGroup
now time.Time
duration time.Duration
want bool
wantErr require.ErrorAssertionFunc
}{
{
name: "out of window",
group: &autoupdate.AutoUpdateAgentRolloutStatusGroup{
ConfigDays: everyWeekdayButSunday,
ConfigStartHour: matchingStartHour,
},
now: testSunday,
want: false,
wantErr: require.NoError,
now: testSunday,
duration: time.Hour,
want: false,
wantErr: require.NoError,
},
{
name: "inside window, wrong hour",
group: &autoupdate.AutoUpdateAgentRolloutStatusGroup{
ConfigDays: everyWeekday,
ConfigStartHour: nonMatchingStartHour,
},
now: testSunday,
want: false,
wantErr: require.NoError,
now: testSunday,
duration: time.Hour,
want: false,
wantErr: require.NoError,
},
{
name: "inside window, correct hour",
group: &autoupdate.AutoUpdateAgentRolloutStatusGroup{
ConfigDays: everyWeekday,
ConfigStartHour: matchingStartHour,
},
now: testSunday,
want: true,
wantErr: require.NoError,
now: testSunday,
duration: time.Hour,
want: true,
wantErr: require.NoError,
},
{
name: "invalid weekdays",
group: &autoupdate.AutoUpdateAgentRolloutStatusGroup{
ConfigDays: []string{"HelloThereGeneralKenobi"},
ConfigStartHour: matchingStartHour,
},
now: testSunday,
want: false,
wantErr: require.Error,
now: testSunday,
duration: time.Hour,
want: false,
wantErr: require.Error,
},
{
name: "short window",
group: &autoupdate.AutoUpdateAgentRolloutStatusGroup{
ConfigDays: everyWeekday,
ConfigStartHour: matchingStartHour,
},
now: testSunday,
duration: time.Second,
want: false,
wantErr: require.NoError,
},
{
name: "window start time is included",
group: &autoupdate.AutoUpdateAgentRolloutStatusGroup{
ConfigDays: everyWeekday,
ConfigStartHour: matchingStartHour,
},
now: testSunday.Truncate(24 * time.Hour).Add(time.Duration(matchingStartHour) * time.Hour),
duration: time.Hour,
want: true,
wantErr: require.NoError,
},
{
name: "window end time is not included",
group: &autoupdate.AutoUpdateAgentRolloutStatusGroup{
ConfigDays: everyWeekday,
ConfigStartHour: matchingStartHour,
},
now: testSunday.Truncate(24 * time.Hour).Add(time.Duration(matchingStartHour+1) * time.Hour),
duration: time.Hour,
want: false,
wantErr: require.NoError,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got, err := inWindow(tt.group, tt.now)
got, err := inWindow(tt.group, tt.now, tt.duration)
tt.wantErr(t, err)
require.Equal(t, tt.want, got)
})
Expand Down Expand Up @@ -205,7 +243,7 @@ func Test_rolloutChangedInWindow(t *testing.T) {
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
// Test execution.
result, err := rolloutChangedInWindow(group, tt.now, tt.rolloutStart)
result, err := rolloutChangedInWindow(group, tt.now, tt.rolloutStart, time.Hour)
require.NoError(t, err)
require.Equal(t, tt.want, result)
})
Expand Down
14 changes: 10 additions & 4 deletions lib/autoupdate/rollout/strategy_timebased.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,13 @@ func newTimeBasedStrategy(log *slog.Logger) (rolloutStrategy, error) {
}, nil
}

func (h *timeBasedStrategy) progressRollout(ctx context.Context, status *autoupdate.AutoUpdateAgentRolloutStatus, now time.Time) error {
func (h *timeBasedStrategy) progressRollout(ctx context.Context, spec *autoupdate.AutoUpdateAgentRolloutSpec, status *autoupdate.AutoUpdateAgentRolloutStatus, now time.Time) error {
windowDuration := spec.GetMaintenanceWindowDuration().AsDuration()
// Backward compatibility for resources previously created without duration.
if windowDuration == 0 {
windowDuration = haltOnErrorWindowDuration
}

// We always process every group regardless of the order.
var errs []error
for _, group := range status.Groups {
Expand All @@ -61,7 +67,7 @@ func (h *timeBasedStrategy) progressRollout(ctx context.Context, status *autoupd
// We start any group unstarted group in window.
// Done groups can transition back to active if they enter their maintenance window again.
// Some agents might have missed the previous windows and might expected to try again.
shouldBeActive, err := inWindow(group, now)
shouldBeActive, err := inWindow(group, now, windowDuration)
if err != nil {
// In time-based rollouts, groups are not dependent.
// Failing to transition a group should affect other groups.
Expand All @@ -72,7 +78,7 @@ func (h *timeBasedStrategy) progressRollout(ctx context.Context, status *autoupd
}

// Check if the rollout got created after the theoretical group start time
rolloutChangedDuringWindow, err := rolloutChangedInWindow(group, now, status.StartTime.AsTime())
rolloutChangedDuringWindow, err := rolloutChangedInWindow(group, now, status.StartTime.AsTime(), windowDuration)
if err != nil {
setGroupState(group, group.State, updateReasonReconcilerError, now)
errs = append(errs, err)
Expand All @@ -93,7 +99,7 @@ func (h *timeBasedStrategy) progressRollout(ctx context.Context, status *autoupd
case autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE:
// The group is currently being updated. We check if the maintenance
// is over and if we should transition it to the done state
shouldBeActive, err := inWindow(group, now)
shouldBeActive, err := inWindow(group, now, windowDuration)
if err != nil {
// In time-based rollouts, groups are not dependent.
// Failing to transition a group should affect other groups.
Expand Down
8 changes: 7 additions & 1 deletion lib/autoupdate/rollout/strategy_timebased_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (

"github.com/jonboulle/clockwork"
"github.com/stretchr/testify/require"
"google.golang.org/protobuf/types/known/durationpb"
"google.golang.org/protobuf/types/known/timestamppb"

"github.com/gravitational/teleport/api/gen/proto/go/teleport/autoupdate/v1"
Expand Down Expand Up @@ -325,14 +326,19 @@ func Test_progressGroupsTimeBased(t *testing.T) {
},
},
}

spec := &autoupdate.AutoUpdateAgentRolloutSpec{
MaintenanceWindowDuration: durationpb.New(time.Hour),
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
status := &autoupdate.AutoUpdateAgentRolloutStatus{
Groups: tt.initialState,
State: 0,
StartTime: tt.rolloutStartTime,
}
err := strategy.progressRollout(ctx, status, clock.Now())
err := strategy.progressRollout(ctx, spec, status, clock.Now())
require.NoError(t, err)
// We use require.Equal instead of Elements match because group order matters.
// It's not super important for time-based, but is crucial for halt-on-error.
Expand Down
Loading