Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🌱 Extend MS ScalingUp and Remediationg conditions to include preflight check errors #11390

Merged
merged 2 commits into from
Nov 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions controlplane/kubeadm/internal/controllers/remediation.go
Original file line number Diff line number Diff line change
Expand Up @@ -315,9 +315,10 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")

v1beta2conditions.Set(machineToBeRemediated, metav1.Condition{
Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: controlplanev1.KubeadmControlPlaneMachineRemediationMachineDeletedV1Beta2Reason,
Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: controlplanev1.KubeadmControlPlaneMachineRemediationMachineDeletedV1Beta2Reason,
Message: "Machine deletionTimestamp set",
})

// Prepare the info for tracking the remediation progress into the RemediationInProgressAnnotation.
Expand Down
52 changes: 27 additions & 25 deletions controlplane/kubeadm/internal/controllers/remediation_test.go

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions controlplane/kubeadm/internal/controllers/status_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -514,7 +514,7 @@ func Test_setRemediatingCondition(t *testing.T) {
healthCheckSucceeded := clusterv1.Condition{Type: clusterv1.MachineHealthCheckSucceededV1Beta2Condition, Status: corev1.ConditionTrue}
healthCheckNotSucceeded := clusterv1.Condition{Type: clusterv1.MachineHealthCheckSucceededV1Beta2Condition, Status: corev1.ConditionFalse}
ownerRemediated := clusterv1.Condition{Type: clusterv1.MachineOwnerRemediatedCondition, Status: corev1.ConditionFalse}
ownerRemediatedV1Beta2 := metav1.Condition{Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition, Status: metav1.ConditionFalse, Message: "Remediation in progress"}
ownerRemediatedV1Beta2 := metav1.Condition{Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition, Status: metav1.ConditionFalse, Reason: controlplanev1.KubeadmControlPlaneMachineRemediationMachineDeletedV1Beta2Reason, Message: "Machine deletionTimestamp set"}

tests := []struct {
name string
Expand Down Expand Up @@ -550,7 +550,7 @@ func Test_setRemediatingCondition(t *testing.T) {
Type: controlplanev1.KubeadmControlPlaneRemediatingV1Beta2Condition,
Status: metav1.ConditionTrue,
Reason: controlplanev1.KubeadmControlPlaneRemediatingV1Beta2Reason,
Message: "Remediation in progress from Machine m3",
Message: "Machine deletionTimestamp set from Machine m3",
},
},
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1502,7 +1502,7 @@ func TestSetRemediatingCondition(t *testing.T) {
healthCheckSucceeded := clusterv1.Condition{Type: clusterv1.MachineHealthCheckSucceededV1Beta2Condition, Status: corev1.ConditionTrue}
healthCheckNotSucceeded := clusterv1.Condition{Type: clusterv1.MachineHealthCheckSucceededV1Beta2Condition, Status: corev1.ConditionFalse}
ownerRemediated := clusterv1.Condition{Type: clusterv1.MachineOwnerRemediatedCondition, Status: corev1.ConditionFalse}
ownerRemediatedV1Beta2 := metav1.Condition{Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition, Status: metav1.ConditionFalse, Message: "Remediation in progress"}
ownerRemediatedV1Beta2 := metav1.Condition{Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition, Status: metav1.ConditionFalse, Reason: clusterv1.MachineSetMachineRemediationMachineDeletedV1Beta2Reason, Message: "Machine deletionTimestamp set"}

tests := []struct {
name string
Expand Down Expand Up @@ -1550,7 +1550,7 @@ func TestSetRemediatingCondition(t *testing.T) {
Type: clusterv1.ClusterRemediatingV1Beta2Condition,
Status: metav1.ConditionTrue,
Reason: clusterv1.ClusterRemediatingV1Beta2Reason,
Message: "Remediation in progress from Machine m3",
Message: "Machine deletionTimestamp set from Machine m3",
},
},
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -884,7 +884,7 @@ func Test_setRemediatingCondition(t *testing.T) {
healthCheckSucceeded := clusterv1.Condition{Type: clusterv1.MachineHealthCheckSucceededV1Beta2Condition, Status: corev1.ConditionTrue}
healthCheckNotSucceeded := clusterv1.Condition{Type: clusterv1.MachineHealthCheckSucceededV1Beta2Condition, Status: corev1.ConditionFalse}
ownerRemediated := clusterv1.Condition{Type: clusterv1.MachineOwnerRemediatedCondition, Status: corev1.ConditionFalse}
ownerRemediatedV1Beta2 := metav1.Condition{Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition, Status: metav1.ConditionFalse, Message: "Remediation in progress"}
ownerRemediatedV1Beta2 := metav1.Condition{Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition, Status: metav1.ConditionFalse, Reason: clusterv1.MachineSetMachineRemediationMachineDeletedV1Beta2Reason, Message: "Machine deletionTimestamp set"}

tests := []struct {
name string
Expand Down Expand Up @@ -932,7 +932,7 @@ func Test_setRemediatingCondition(t *testing.T) {
Type: clusterv1.MachineDeploymentRemediatingV1Beta2Condition,
Status: metav1.ConditionTrue,
Reason: clusterv1.MachineDeploymentRemediatingV1Beta2Reason,
Message: "Remediation in progress from Machine m3",
Message: "Machine deletionTimestamp set from Machine m3",
},
},
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -477,9 +477,10 @@ func (r *Reconciler) patchUnhealthyTargets(ctx context.Context, logger logr.Logg

if ownerRemediatedCondition := v1beta2conditions.Get(t.Machine, clusterv1.MachineOwnerRemediatedV1Beta2Condition); ownerRemediatedCondition == nil || ownerRemediatedCondition.Status == metav1.ConditionTrue {
v1beta2conditions.Set(t.Machine, metav1.Condition{
Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.MachineOwnerRemediatedWaitingForRemediationV1Beta2Reason,
Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.MachineOwnerRemediatedWaitingForRemediationV1Beta2Reason,
Message: "Waiting for remediation",
})
}
}
Expand Down
11 changes: 8 additions & 3 deletions internal/controllers/machineset/machineset_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,8 @@ type scope struct {
infrastructureObjectNotFound bool
getAndAdoptMachinesForMachineSetSucceeded bool
owningMachineDeployment *clusterv1.MachineDeployment
remediationPreflightCheckErrMessage string
scaleUpPreflightCheckErrMessage string
}

type machineSetReconcileFunc func(ctx context.Context, s *scope) (ctrl.Result, error)
Expand Down Expand Up @@ -602,6 +604,7 @@ func (r *Reconciler) syncReplicas(ctx context.Context, s *scope) (ctrl.Result, e
// If the error is not nil use that as the message for the condition.
preflightCheckErrMessage = err.Error()
}
s.scaleUpPreflightCheckErrMessage = preflightCheckErrMessage
conditions.MarkFalse(ms, clusterv1.MachinesCreatedCondition, clusterv1.PreflightCheckFailedReason, clusterv1.ConditionSeverityError, preflightCheckErrMessage)
return result, err
}
Expand Down Expand Up @@ -1350,6 +1353,7 @@ func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, s *scope) (
if preflightChecksFailed {
// PreflightChecks did not pass. Update the MachineOwnerRemediated condition on the unhealthy Machines with
// WaitingForRemediationReason reason.
s.remediationPreflightCheckErrMessage = preflightCheckErrMessage
if err := patchMachineConditions(ctx, r.Client, machinesToRemediate, metav1.Condition{
Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition,
Status: metav1.ConditionFalse,
Expand All @@ -1375,9 +1379,10 @@ func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, s *scope) (
// Instead if we set the condition but the deletion does not go through on next reconcile either the
// condition will be fixed/updated or the Machine deletion will be retried.
if err := patchMachineConditions(ctx, r.Client, machinesToRemediate, metav1.Condition{
Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.MachineSetMachineRemediationMachineDeletedV1Beta2Reason,
Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.MachineSetMachineRemediationMachineDeletedV1Beta2Reason,
Message: "Machine deletionTimestamp set",
}, &clusterv1.Condition{
Type: clusterv1.MachineOwnerRemediatedCondition,
Status: corev1.ConditionTrue,
Expand Down
29 changes: 19 additions & 10 deletions internal/controllers/machineset/machineset_controller_status.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package machineset
import (
"context"
"fmt"
"slices"
"sort"
"strings"
"time"
Expand Down Expand Up @@ -47,7 +48,7 @@ func (r *Reconciler) updateStatus(ctx context.Context, s *scope) {
// Conditions

// Update the ScalingUp and ScalingDown condition.
setScalingUpCondition(ctx, s.machineSet, s.machines, s.bootstrapObjectNotFound, s.infrastructureObjectNotFound, s.getAndAdoptMachinesForMachineSetSucceeded)
setScalingUpCondition(ctx, s.machineSet, s.machines, s.bootstrapObjectNotFound, s.infrastructureObjectNotFound, s.getAndAdoptMachinesForMachineSetSucceeded, s.scaleUpPreflightCheckErrMessage)
setScalingDownCondition(ctx, s.machineSet, s.machines, s.getAndAdoptMachinesForMachineSetSucceeded)

// MachinesReady condition: aggregate the Machine's Ready condition.
Expand All @@ -59,7 +60,7 @@ func (r *Reconciler) updateStatus(ctx context.Context, s *scope) {
machines := collections.FromMachines(s.machines...)
machinesToBeRemediated := machines.Filter(collections.IsUnhealthyAndOwnerRemediated)
unhealthyMachines := machines.Filter(collections.IsUnhealthy)
setRemediatingCondition(ctx, s.machineSet, machinesToBeRemediated, unhealthyMachines, s.getAndAdoptMachinesForMachineSetSucceeded)
setRemediatingCondition(ctx, s.machineSet, machinesToBeRemediated, unhealthyMachines, s.getAndAdoptMachinesForMachineSetSucceeded, s.remediationPreflightCheckErrMessage)

setDeletingCondition(ctx, s.machineSet, s.machines, s.getAndAdoptMachinesForMachineSetSucceeded)
}
Expand Down Expand Up @@ -92,7 +93,7 @@ func setReplicas(_ context.Context, ms *clusterv1.MachineSet, machines []*cluste
ms.Status.V1Beta2.UpToDateReplicas = ptr.To(upToDateReplicas)
}

func setScalingUpCondition(_ context.Context, ms *clusterv1.MachineSet, machines []*clusterv1.Machine, bootstrapObjectNotFound, infrastructureObjectNotFound, getAndAdoptMachinesForMachineSetSucceeded bool) {
func setScalingUpCondition(_ context.Context, ms *clusterv1.MachineSet, machines []*clusterv1.Machine, bootstrapObjectNotFound, infrastructureObjectNotFound, getAndAdoptMachinesForMachineSetSucceeded bool, scaleUpPreflightCheckErrMessage string) {
// If we got unexpected errors in listing the machines (this should never happen), surface them.
if !getAndAdoptMachinesForMachineSetSucceeded {
v1beta2conditions.Set(ms, metav1.Condition{
Expand Down Expand Up @@ -125,7 +126,7 @@ func setScalingUpCondition(_ context.Context, ms *clusterv1.MachineSet, machines
if currentReplicas >= desiredReplicas {
var message string
if missingReferencesMessage != "" {
message = fmt.Sprintf("Scaling up would be blocked %s", missingReferencesMessage)
message = fmt.Sprintf("Scaling up would be blocked because %s", missingReferencesMessage)
}
v1beta2conditions.Set(ms, metav1.Condition{
Type: clusterv1.MachineSetScalingUpV1Beta2Condition,
Expand All @@ -138,8 +139,11 @@ func setScalingUpCondition(_ context.Context, ms *clusterv1.MachineSet, machines

// Scaling up.
message := fmt.Sprintf("Scaling up from %d to %d replicas", currentReplicas, desiredReplicas)
if missingReferencesMessage != "" {
message += fmt.Sprintf(" is blocked %s", missingReferencesMessage)
if missingReferencesMessage != "" || scaleUpPreflightCheckErrMessage != "" {
blockMessages := slices.DeleteFunc([]string{missingReferencesMessage, scaleUpPreflightCheckErrMessage}, func(s string) bool {
return s == ""
})
message += fmt.Sprintf(" is blocked because %s", strings.Join(blockMessages, " and "))
}
v1beta2conditions.Set(ms, metav1.Condition{
Type: clusterv1.MachineSetScalingUpV1Beta2Condition,
Expand Down Expand Up @@ -281,7 +285,7 @@ func setMachinesUpToDateCondition(ctx context.Context, machineSet *clusterv1.Mac
v1beta2conditions.Set(machineSet, *upToDateCondition)
}

func setRemediatingCondition(ctx context.Context, machineSet *clusterv1.MachineSet, machinesToBeRemediated, unhealthyMachines collections.Machines, getAndAdoptMachinesForMachineSetSucceeded bool) {
func setRemediatingCondition(ctx context.Context, machineSet *clusterv1.MachineSet, machinesToBeRemediated, unhealthyMachines collections.Machines, getAndAdoptMachinesForMachineSetSucceeded bool, remediationPreflightCheckErrMessage string) {
if !getAndAdoptMachinesForMachineSetSucceeded {
v1beta2conditions.Set(machineSet, metav1.Condition{
Type: clusterv1.MachineSetRemediatingV1Beta2Condition,
Expand Down Expand Up @@ -320,11 +324,16 @@ func setRemediatingCondition(ctx context.Context, machineSet *clusterv1.MachineS
return
}

msg := remediatingCondition.Message
if remediationPreflightCheckErrMessage != "" {
msg = fmt.Sprintf("Triggering further remediations is blocked because %s; %s", remediationPreflightCheckErrMessage, remediatingCondition.Message)
}

v1beta2conditions.Set(machineSet, metav1.Condition{
Type: remediatingCondition.Type,
Status: metav1.ConditionTrue,
Reason: clusterv1.MachineSetRemediatingV1Beta2Reason,
Message: remediatingCondition.Message,
Message: msg,
})
}

Expand Down Expand Up @@ -383,10 +392,10 @@ func calculateMissingReferencesMessage(ms *clusterv1.MachineSet, bootstrapTempla
}

if len(missingObjects) == 1 {
return fmt.Sprintf("because %s does not exist", missingObjects[0])
return fmt.Sprintf("%s does not exist", missingObjects[0])
}

return fmt.Sprintf("because %s do not exist", strings.Join(missingObjects, " and "))
return fmt.Sprintf("%s do not exist", strings.Join(missingObjects, " and "))
}

func aggregateStaleMachines(machines []*clusterv1.Machine) string {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,7 @@ func Test_setScalingUpCondition(t *testing.T) {
bootstrapObjectNotFound bool
infrastructureObjectNotFound bool
getAndAdoptMachinesForMachineSetSucceeded bool
scaleUpPreflightCheckErrMessage string
expectCondition metav1.Condition
}{
{
Expand Down Expand Up @@ -299,6 +300,27 @@ func Test_setScalingUpCondition(t *testing.T) {
Message: "Scaling up from 0 to 3 replicas is blocked because DockerMachineTemplate does not exist",
},
},
{
name: "scaling up and blocked by bootstrap and infrastructure object and preflight checks",
ms: scalingUpMachineSetWith3Replicas,
bootstrapObjectNotFound: true,
infrastructureObjectNotFound: true,
getAndAdoptMachinesForMachineSetSucceeded: true,
// This preflight check error can happen when a MachineSet is scaling up while the control plane
// already has a newer Kubernetes version.
scaleUpPreflightCheckErrMessage: "MachineSet version (1.25.5) and ControlPlane version (1.26.2) " +
"do not conform to kubeadm version skew policy as kubeadm only supports joining with the same " +
"major+minor version as the control plane (\"KubeadmVersionSkew\" preflight check failed)",
expectCondition: metav1.Condition{
Type: clusterv1.MachineSetScalingUpV1Beta2Condition,
Status: metav1.ConditionTrue,
Reason: clusterv1.MachineSetScalingUpV1Beta2Reason,
Message: "Scaling up from 0 to 3 replicas is blocked because KubeadmBootstrapTemplate and DockerMachineTemplate " +
"do not exist and MachineSet version (1.25.5) and ControlPlane version (1.26.2) " +
"do not conform to kubeadm version skew policy as kubeadm only supports joining with the same " +
"major+minor version as the control plane (\"KubeadmVersionSkew\" preflight check failed)",
},
},
{
name: "deleting",
ms: deletingMachineSetWith3Replicas,
Expand All @@ -317,7 +339,7 @@ func Test_setScalingUpCondition(t *testing.T) {
t.Run(tt.name, func(t *testing.T) {
g := NewWithT(t)

setScalingUpCondition(ctx, tt.ms, tt.machines, tt.bootstrapObjectNotFound, tt.infrastructureObjectNotFound, tt.getAndAdoptMachinesForMachineSetSucceeded)
setScalingUpCondition(ctx, tt.ms, tt.machines, tt.bootstrapObjectNotFound, tt.infrastructureObjectNotFound, tt.getAndAdoptMachinesForMachineSetSucceeded, tt.scaleUpPreflightCheckErrMessage)

condition := v1beta2conditions.Get(tt.ms, clusterv1.MachineSetScalingUpV1Beta2Condition)
g.Expect(condition).ToNot(BeNil())
Expand Down Expand Up @@ -758,13 +780,15 @@ func Test_setRemediatingCondition(t *testing.T) {
healthCheckSucceeded := clusterv1.Condition{Type: clusterv1.MachineHealthCheckSucceededV1Beta2Condition, Status: corev1.ConditionTrue}
healthCheckNotSucceeded := clusterv1.Condition{Type: clusterv1.MachineHealthCheckSucceededV1Beta2Condition, Status: corev1.ConditionFalse}
ownerRemediated := clusterv1.Condition{Type: clusterv1.MachineOwnerRemediatedCondition, Status: corev1.ConditionFalse}
ownerRemediatedV1Beta2 := metav1.Condition{Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition, Status: metav1.ConditionFalse, Message: "Remediation in progress"}
ownerRemediatedV1Beta2 := metav1.Condition{Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition, Status: metav1.ConditionFalse, Reason: clusterv1.MachineSetMachineRemediationMachineDeletedV1Beta2Reason, Message: "Machine deletionTimestamp set"}
ownerRemediatedWaitingForRemediationV1Beta2 := metav1.Condition{Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition, Status: metav1.ConditionFalse, Reason: clusterv1.MachineOwnerRemediatedWaitingForRemediationV1Beta2Reason, Message: "Waiting for remediation"}

tests := []struct {
name string
machineSet *clusterv1.MachineSet
machines []*clusterv1.Machine
getAndAdoptMachinesForMachineSetSucceeded bool
remediationPreflightCheckErrMessage string
expectCondition metav1.Condition
}{
{
Expand Down Expand Up @@ -806,7 +830,27 @@ func Test_setRemediatingCondition(t *testing.T) {
Type: clusterv1.MachineSetRemediatingV1Beta2Condition,
Status: metav1.ConditionTrue,
Reason: clusterv1.MachineSetRemediatingV1Beta2Reason,
Message: "Remediation in progress from Machine m3",
Message: "Machine deletionTimestamp set from Machine m3",
},
},
{
name: "With machines to be remediated by MS and preflight check error",
machineSet: &clusterv1.MachineSet{},
machines: []*clusterv1.Machine{
fakeMachine("m1", withConditions(healthCheckSucceeded)), // Healthy machine
fakeMachine("m2", withConditions(healthCheckNotSucceeded)), // Unhealthy machine, not yet marked for remediation
fakeMachine("m3", withConditions(healthCheckNotSucceeded, ownerRemediated), withV1Beta2Condition(ownerRemediatedV1Beta2)),
fakeMachine("m4", withConditions(healthCheckNotSucceeded, ownerRemediated), withV1Beta2Condition(ownerRemediatedWaitingForRemediationV1Beta2)),
},
getAndAdoptMachinesForMachineSetSucceeded: true,
// This preflight check error can happen when a Machine becomes unhealthy while the control plane is upgrading.
remediationPreflightCheckErrMessage: "KubeadmControlPlane ns1/cp1 is upgrading (\"ControlPlaneIsStable\" preflight check failed)",
expectCondition: metav1.Condition{
Type: clusterv1.MachineSetRemediatingV1Beta2Condition,
Status: metav1.ConditionTrue,
Reason: clusterv1.MachineSetRemediatingV1Beta2Reason,
Message: "Triggering further remediations is blocked because KubeadmControlPlane ns1/cp1 is upgrading (\"ControlPlaneIsStable\" preflight check failed); " +
"Machine deletionTimestamp set from Machine m3; Waiting for remediation from Machine m4",
},
},
{
Expand Down Expand Up @@ -852,7 +896,7 @@ func Test_setRemediatingCondition(t *testing.T) {
machinesToBeRemediated = machines.Filter(collections.IsUnhealthyAndOwnerRemediated)
unHealthyMachines = machines.Filter(collections.IsUnhealthy)
}
setRemediatingCondition(ctx, tt.machineSet, machinesToBeRemediated, unHealthyMachines, tt.getAndAdoptMachinesForMachineSetSucceeded)
setRemediatingCondition(ctx, tt.machineSet, machinesToBeRemediated, unHealthyMachines, tt.getAndAdoptMachinesForMachineSetSucceeded, tt.remediationPreflightCheckErrMessage)

condition := v1beta2conditions.Get(tt.machineSet, clusterv1.MachineSetRemediatingV1Beta2Condition)
g.Expect(condition).ToNot(BeNil())
Expand Down
Loading