From cac790e26eff2775852871b5c307ee822d122cf9 Mon Sep 17 00:00:00 2001 From: Theo Barber-Bany Date: Tue, 30 Apr 2024 17:41:08 +0200 Subject: [PATCH] Implement paused condition for machine healthcheck controller This change updates the machine health check controller to set the paused condition based on if the cluster or machine health check is paused. --- .../machinehealthcheck_controller.go | 18 +- .../machinehealthcheck_controller_test.go | 233 ++++++++++-------- 2 files changed, 135 insertions(+), 116 deletions(-) diff --git a/internal/controllers/machinehealthcheck/machinehealthcheck_controller.go b/internal/controllers/machinehealthcheck/machinehealthcheck_controller.go index 08879720d839..d614aadf4e1e 100644 --- a/internal/controllers/machinehealthcheck/machinehealthcheck_controller.go +++ b/internal/controllers/machinehealthcheck/machinehealthcheck_controller.go @@ -92,14 +92,14 @@ func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, opt handler.EnqueueRequestsFromMapFunc(r.machineToMachineHealthCheck), ). WithOptions(options). - WithEventFilter(predicates.ResourceNotPausedAndHasFilterLabel(ctrl.LoggerFrom(ctx), r.WatchFilterValue)). + WithEventFilter(predicates.ResourceHasFilterLabel(ctrl.LoggerFrom(ctx), r.WatchFilterValue)). Watches( &clusterv1.Cluster{}, handler.EnqueueRequestsFromMapFunc(r.clusterToMachineHealthCheck), builder.WithPredicates( // TODO: should this wait for Cluster.Status.InfrastructureReady similar to Infra Machine resources? predicates.All(ctrl.LoggerFrom(ctx), - predicates.ClusterUnpaused(ctrl.LoggerFrom(ctx)), + predicates.ClusterCreateUpdateEvent(ctrl.LoggerFrom(ctx)), predicates.ResourceHasFilterLabel(ctrl.LoggerFrom(ctx), r.WatchFilterValue), ), ), @@ -139,12 +139,6 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Re return ctrl.Result{}, err } - // Return early if the object or Cluster is paused. - if annotations.IsPaused(cluster, m) { - log.Info("Reconciliation is paused for this object") - return ctrl.Result{}, nil - } - // Initialize the patch helper patchHelper, err := patch.NewHelper(m, r.Client) if err != nil { @@ -163,6 +157,14 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Re } }() + // Return early if the object or Cluster is paused. + if annotations.IsPaused(cluster, m) { + log.Info("Reconciliation is paused for this object") + conditions.MarkTrue(m, clusterv1.PausedCondition) + return ctrl.Result{}, nil + } + conditions.MarkFalse(m, clusterv1.PausedCondition, clusterv1.ResourceNotPausedReason, clusterv1.ConditionSeverityInfo, "Resource is operating as expected") + // Reconcile labels. if m.Labels == nil { m.Labels = make(map[string]string) diff --git a/internal/controllers/machinehealthcheck/machinehealthcheck_controller_test.go b/internal/controllers/machinehealthcheck/machinehealthcheck_controller_test.go index 0f876aa98371..51328c1fdd96 100644 --- a/internal/controllers/machinehealthcheck/machinehealthcheck_controller_test.go +++ b/internal/controllers/machinehealthcheck/machinehealthcheck_controller_test.go @@ -64,6 +64,20 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { } }() + conditionsRemediationAllowedNotPaused := clusterv1.Conditions{ + { + Type: clusterv1.RemediationAllowedCondition, + Status: corev1.ConditionTrue, + }, + { + Type: clusterv1.PausedCondition, + Status: corev1.ConditionFalse, + Reason: clusterv1.ResourceNotPausedReason, + Severity: clusterv1.ConditionSeverityInfo, + Message: "Resource is operating as expected", + }, + } + t.Run("it should ensure the correct cluster-name label when no existing labels exist", func(t *testing.T) { g := NewWithT(t) cluster := createCluster(g, ns.Name) @@ -235,12 +249,7 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { RemediationsAllowed: 2, ObservedGeneration: 1, Targets: targetMachines, - Conditions: clusterv1.Conditions{ - { - Type: clusterv1.RemediationAllowedCondition, - Status: corev1.ConditionTrue, - }, - }, + Conditions: conditionsRemediationAllowedNotPaused, })) }) @@ -288,12 +297,7 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { RemediationsAllowed: 2, ObservedGeneration: 1, Targets: targetMachines, - Conditions: clusterv1.Conditions{ - { - Type: clusterv1.RemediationAllowedCondition, - Status: corev1.ConditionTrue, - }, - }, + Conditions: conditionsRemediationAllowedNotPaused, })) }) @@ -336,12 +340,7 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { RemediationsAllowed: 2, ObservedGeneration: 1, Targets: targetMachines, - Conditions: clusterv1.Conditions{ - { - Type: clusterv1.RemediationAllowedCondition, - Status: corev1.ConditionTrue, - }, - }, + Conditions: conditionsRemediationAllowedNotPaused, })) }) @@ -393,12 +392,7 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { RemediationsAllowed: 2, ObservedGeneration: 1, Targets: targetMachines, - Conditions: clusterv1.Conditions{ - { - Type: clusterv1.RemediationAllowedCondition, - Status: corev1.ConditionTrue, - }, - }, + Conditions: conditionsRemediationAllowedNotPaused, })) }) @@ -451,12 +445,7 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { RemediationsAllowed: 2, ObservedGeneration: 1, Targets: targetMachines, - Conditions: clusterv1.Conditions{ - { - Type: clusterv1.RemediationAllowedCondition, - Status: corev1.ConditionTrue, - }, - }, + Conditions: conditionsRemediationAllowedNotPaused, })) }) @@ -509,12 +498,7 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { RemediationsAllowed: 2, ObservedGeneration: 1, Targets: targetMachines, - Conditions: clusterv1.Conditions{ - { - Type: clusterv1.RemediationAllowedCondition, - Status: corev1.ConditionTrue, - }, - }, + Conditions: conditionsRemediationAllowedNotPaused, })) }) @@ -576,6 +560,13 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { Reason: clusterv1.TooManyUnhealthyReason, Message: "Remediation is not allowed, the number of not started or unhealthy machines exceeds maxUnhealthy (total: 3, unhealthy: 2, maxUnhealthy: 40%)", }, + { + Type: clusterv1.PausedCondition, + Status: corev1.ConditionFalse, + Reason: clusterv1.ResourceNotPausedReason, + Severity: clusterv1.ConditionSeverityInfo, + Message: "Resource is operating as expected", + }, }, })) @@ -666,12 +657,7 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { RemediationsAllowed: 2, ObservedGeneration: 1, Targets: targetMachines, - Conditions: clusterv1.Conditions{ - { - Type: clusterv1.RemediationAllowedCondition, - Status: corev1.ConditionTrue, - }, - }, + Conditions: conditionsRemediationAllowedNotPaused, })) }) @@ -733,6 +719,13 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { Reason: clusterv1.TooManyUnhealthyReason, Message: "Remediation is not allowed, the number of not started or unhealthy machines does not fall within the range (total: 3, unhealthy: 2, unhealthyRange: [3-5])", }, + { + Type: clusterv1.PausedCondition, + Status: corev1.ConditionFalse, + Reason: clusterv1.ResourceNotPausedReason, + Severity: clusterv1.ConditionSeverityInfo, + Message: "Resource is operating as expected", + }, }, })) @@ -830,12 +823,7 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { RemediationsAllowed: 2, ObservedGeneration: 1, Targets: targetMachines, - Conditions: clusterv1.Conditions{ - { - Type: clusterv1.RemediationAllowedCondition, - Status: corev1.ConditionTrue, - }, - }, + Conditions: conditionsRemediationAllowedNotPaused, })) // Calculate how many Machines have health check succeeded = false. @@ -929,12 +917,7 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { RemediationsAllowed: 2, ObservedGeneration: 1, Targets: targetMachines, - Conditions: clusterv1.Conditions{ - { - Type: clusterv1.RemediationAllowedCondition, - Status: corev1.ConditionTrue, - }, - }, + Conditions: conditionsRemediationAllowedNotPaused, })) // Calculate how many Machines have health check succeeded = false. @@ -1025,12 +1008,7 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { RemediationsAllowed: 2, ObservedGeneration: 1, Targets: targetMachines, - Conditions: clusterv1.Conditions{ - { - Type: clusterv1.RemediationAllowedCondition, - Status: corev1.ConditionTrue, - }, - }, + Conditions: conditionsRemediationAllowedNotPaused, })) // Calculate how many Machines have health check succeeded = false. @@ -1109,12 +1087,7 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { RemediationsAllowed: 1, ObservedGeneration: 1, Targets: targetMachines, - Conditions: clusterv1.Conditions{ - { - Type: clusterv1.RemediationAllowedCondition, - Status: corev1.ConditionTrue, - }, - }, + Conditions: conditionsRemediationAllowedNotPaused, })) // Transition the node to unhealthy. @@ -1146,6 +1119,13 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { Type: clusterv1.RemediationAllowedCondition, Status: corev1.ConditionTrue, }, + { + Type: clusterv1.PausedCondition, + Status: corev1.ConditionFalse, + Reason: clusterv1.ResourceNotPausedReason, + Severity: clusterv1.ConditionSeverityInfo, + Message: "Resource is operating as expected", + }, }, })) @@ -1376,12 +1356,7 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { CurrentHealthy: 1, ObservedGeneration: 1, Targets: targetMachines, - Conditions: clusterv1.Conditions{ - { - Type: clusterv1.RemediationAllowedCondition, - Status: corev1.ConditionTrue, - }, - }, + Conditions: conditionsRemediationAllowedNotPaused, })) // Pause the machine @@ -1416,12 +1391,7 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { RemediationsAllowed: 0, ObservedGeneration: 1, Targets: targetMachines, - Conditions: clusterv1.Conditions{ - { - Type: clusterv1.RemediationAllowedCondition, - Status: corev1.ConditionTrue, - }, - }, + Conditions: conditionsRemediationAllowedNotPaused, })) // Calculate how many Machines have health check succeeded = false. @@ -1528,12 +1498,7 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { RemediationsAllowed: 1, ObservedGeneration: 1, Targets: targetMachines, - Conditions: clusterv1.Conditions{ - { - Type: clusterv1.RemediationAllowedCondition, - Status: corev1.ConditionTrue, - }, - }, + Conditions: conditionsRemediationAllowedNotPaused, })) // Transition the node to unhealthy. @@ -1561,12 +1526,7 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { RemediationsAllowed: 0, ObservedGeneration: 1, Targets: targetMachines, - Conditions: clusterv1.Conditions{ - { - Type: clusterv1.RemediationAllowedCondition, - Status: corev1.ConditionTrue, - }, - }, + Conditions: conditionsRemediationAllowedNotPaused, })) // Calculate how many Machines have health check succeeded = false. @@ -1676,12 +1636,7 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { RemediationsAllowed: 1, ObservedGeneration: 1, Targets: targetMachines, - Conditions: clusterv1.Conditions{ - { - Type: clusterv1.RemediationAllowedCondition, - Status: corev1.ConditionTrue, - }, - }, + Conditions: conditionsRemediationAllowedNotPaused, })) // Transition the node to unhealthy. @@ -1709,12 +1664,7 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { RemediationsAllowed: 0, ObservedGeneration: 1, Targets: targetMachines, - Conditions: clusterv1.Conditions{ - { - Type: clusterv1.RemediationAllowedCondition, - Status: corev1.ConditionTrue, - }, - }, + Conditions: conditionsRemediationAllowedNotPaused, })) // Calculate how many Machines have health check succeeded = false. @@ -1760,12 +1710,7 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { RemediationsAllowed: 1, ObservedGeneration: 1, Targets: targetMachines, - Conditions: clusterv1.Conditions{ - { - Type: clusterv1.RemediationAllowedCondition, - Status: corev1.ConditionTrue, - }, - }, + Conditions: conditionsRemediationAllowedNotPaused, })) // Calculate how many Machines have health check succeeded = false. @@ -1805,6 +1750,78 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { return obj }, timeout, 100*time.Millisecond).Should(BeNil()) }) + + t.Run("When the machine health check is paused, the condition is set to true", func(t *testing.T) { + g := NewWithT(t) + cluster := createCluster(g, ns.Name) + + mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name) + g.Expect(env.Create(ctx, mhc)).To(Succeed()) + defer func(do ...client.Object) { + g.Expect(env.Cleanup(ctx, do...)).To(Succeed()) + }(cluster, mhc) + + // Healthy nodes and machines. + _, machines, cleanup := createMachinesWithNodes(g, cluster, + count(1), + firstMachineAsControlPlane(), + createNodeRefForMachine(true), + nodeStatus(corev1.ConditionTrue), + machineLabels(mhc.Spec.Selector.MatchLabels), + ) + defer cleanup() + targetMachines := make([]string, len(machines)) + for i, m := range machines { + targetMachines[i] = m.Name + } + sort.Strings(targetMachines) + + // Make sure the status matches. + g.Eventually(func() *clusterv1.MachineHealthCheckStatus { + err := env.Get(ctx, util.ObjectKey(mhc), mhc) + if err != nil { + return nil + } + return &mhc.Status + }).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{ + ExpectedMachines: 1, + CurrentHealthy: 1, + ObservedGeneration: 1, + RemediationsAllowed: 1, + Targets: targetMachines, + Conditions: conditionsRemediationAllowedNotPaused, + })) + + // Transition the machine health check to paused. + g.Expect(env.Get(ctx, util.ObjectKey(cluster), cluster)).To(Succeed()) + cluster.Spec.Paused = true + g.Expect(env.Update(ctx, cluster)).To(Succeed()) + + // Make sure the status matches. + g.Eventually(func() *clusterv1.MachineHealthCheckStatus { + err := env.Get(ctx, util.ObjectKey(mhc), mhc) + if err != nil { + return nil + } + return &mhc.Status + }).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{ + ExpectedMachines: 1, + CurrentHealthy: 1, + RemediationsAllowed: 1, + ObservedGeneration: 1, + Targets: targetMachines, + Conditions: clusterv1.Conditions{ + { + Type: clusterv1.RemediationAllowedCondition, + Status: corev1.ConditionTrue, + }, + { + Type: clusterv1.PausedCondition, + Status: corev1.ConditionTrue, + }, + }, + })) + }) } func TestClusterToMachineHealthCheck(t *testing.T) {