From 7c8ba1dc604c9805eb923f8ed9ff2cc6ef8d9e43 Mon Sep 17 00:00:00 2001 From: fabriziopandini Date: Wed, 6 Nov 2024 22:36:50 +0100 Subject: [PATCH] Add machine UpToDate condition to KCP # Conflicts: # controlplane/kubeadm/api/v1beta1/v1beta2_condition_consts.go --- api/v1beta1/machine_types.go | 8 + .../kubeadm/internal/control_plane.go | 90 +++---- .../kubeadm/internal/control_plane_test.go | 77 +++++- .../internal/controllers/controller.go | 96 +++++--- .../internal/controllers/controller_test.go | 225 +++++++++++++++++- .../internal/controllers/remediation.go | 2 +- .../kubeadm/internal/controllers/scale.go | 2 +- .../kubeadm/internal/controllers/status.go | 11 +- .../internal/controllers/status_test.go | 4 +- controlplane/kubeadm/internal/filters.go | 54 +++-- controlplane/kubeadm/internal/filters_test.go | 172 +++++++++++++ 11 files changed, 624 insertions(+), 117 deletions(-) diff --git a/api/v1beta1/machine_types.go b/api/v1beta1/machine_types.go index c6827e6cd6af..3a3038370e5e 100644 --- a/api/v1beta1/machine_types.go +++ b/api/v1beta1/machine_types.go @@ -123,7 +123,15 @@ const ( const ( // MachineUpToDateV1Beta2Condition is true if the Machine spec matches the spec of the Machine's owner resource, e.g. KubeadmControlPlane or MachineDeployment. // The Machine's owner (e.g. MachineDeployment) is authoritative to set their owned Machine's UpToDate conditions based on its current spec. + // NOTE: The Machine's owner might use this condition to surface also other use cases when Machine is considered not up to date, e.g. when MachineDeployment spec.rolloutAfter + // is expired and the Machine needs to be rolled out. MachineUpToDateV1Beta2Condition = "UpToDate" + + // MachineUpToDateV1Beta2Reason surface when a Machine spec matches the spec of the Machine's owner resource, e.g. KubeadmControlPlane or MachineDeployment. + MachineUpToDateV1Beta2Reason = "UpToDate" + + // MachineNotUpToDateV1Beta2Reason surface when a Machine spec does not match the spec of the Machine's owner resource, e.g. KubeadmControlPlane or MachineDeployment. + MachineNotUpToDateV1Beta2Reason = "NotUpToDate" ) // Machine's BootstrapConfigReady condition and corresponding reasons that will be used in v1Beta2 API version. diff --git a/controlplane/kubeadm/internal/control_plane.go b/controlplane/kubeadm/internal/control_plane.go index 7d624645305a..697c66a6bf57 100644 --- a/controlplane/kubeadm/internal/control_plane.go +++ b/controlplane/kubeadm/internal/control_plane.go @@ -44,6 +44,10 @@ type ControlPlane struct { Machines collections.Machines machinesPatchHelpers map[string]*patch.Helper + machinesNotUptoDate collections.Machines + machinesNotUptoDateLogMessages map[string][]string + machinesNotUptoDateConditionMessages map[string][]string + // reconciliationTime is the time of the current reconciliation, and should be used for all "now" calculations reconciliationTime metav1.Time @@ -97,15 +101,35 @@ func NewControlPlane(ctx context.Context, managementCluster ManagementCluster, c patchHelpers[machine.Name] = patchHelper } + // Select machines that should be rolled out because of an outdated configuration or because rolloutAfter/Before expired. 
+ reconciliationTime := metav1.Now() + machinesNotUptoDate := make(collections.Machines, len(ownedMachines)) + machinesNotUptoDateLogMessages := map[string][]string{} + machinesNotUptoDateConditionMessages := map[string][]string{} + for _, m := range ownedMachines { + upToDate, logMessages, conditionMessages, err := UpToDate(m, kcp, &reconciliationTime, infraObjects, kubeadmConfigs) + if err != nil { + return nil, err + } + if !upToDate { + machinesNotUptoDate.Insert(m) + machinesNotUptoDateLogMessages[m.Name] = logMessages + machinesNotUptoDateConditionMessages[m.Name] = conditionMessages + } + } + return &ControlPlane{ - KCP: kcp, - Cluster: cluster, - Machines: ownedMachines, - machinesPatchHelpers: patchHelpers, - KubeadmConfigs: kubeadmConfigs, - InfraResources: infraObjects, - reconciliationTime: metav1.Now(), - managementCluster: managementCluster, + KCP: kcp, + Cluster: cluster, + Machines: ownedMachines, + machinesPatchHelpers: patchHelpers, + machinesNotUptoDate: machinesNotUptoDate, + machinesNotUptoDateLogMessages: machinesNotUptoDateLogMessages, + machinesNotUptoDateConditionMessages: machinesNotUptoDateConditionMessages, + KubeadmConfigs: kubeadmConfigs, + InfraResources: infraObjects, + reconciliationTime: reconciliationTime, + managementCluster: managementCluster, }, nil } @@ -152,16 +176,12 @@ func (c *ControlPlane) FailureDomainWithMostMachines(ctx context.Context, machin return failuredomains.PickMost(ctx, c.Cluster.Status.FailureDomains.FilterControlPlane(), c.Machines, machines) } -// NextFailureDomainForScaleUp returns the failure domain with the fewest number of up-to-date machines. +// NextFailureDomainForScaleUp returns the failure domain with the fewest number of up-to-date, not deleted machines. func (c *ControlPlane) NextFailureDomainForScaleUp(ctx context.Context) (*string, error) { if len(c.Cluster.Status.FailureDomains.FilterControlPlane()) == 0 { return nil, nil } - upToDateMachines, err := c.UpToDateMachines() - if err != nil { - return nil, errors.Wrapf(err, "failed to determine next failure domain for scale up") - } - return failuredomains.PickFewest(ctx, c.FailureDomains().FilterControlPlane(), upToDateMachines), nil + return failuredomains.PickFewest(ctx, c.FailureDomains().FilterControlPlane(), c.UpToDateMachines().Filter(collections.Not(collections.HasDeletionTimestamp))), nil } // InitialControlPlaneConfig returns a new KubeadmConfigSpec that is to be used for an initializing control plane. @@ -198,40 +218,21 @@ func (c *ControlPlane) GetKubeadmConfig(machineName string) (*bootstrapv1.Kubead } // MachinesNeedingRollout return a list of machines that need to be rolled out. -func (c *ControlPlane) MachinesNeedingRollout() (collections.Machines, map[string]string, error) { - // Ignore machines to be deleted. - machines := c.Machines.Filter(collections.Not(collections.HasDeletionTimestamp)) +func (c *ControlPlane) MachinesNeedingRollout() (collections.Machines, map[string][]string) { + // Note: Machines already deleted are dropped because they will be replaced by new machines after deletion completes. + return c.machinesNotUptoDate.Filter(collections.Not(collections.HasDeletionTimestamp)), c.machinesNotUptoDateLogMessages +} - // Return machines if they are scheduled for rollout or if with an outdated configuration. 
- machinesNeedingRollout := make(collections.Machines, len(machines)) - rolloutReasons := map[string]string{} - for _, m := range machines { - reason, needsRollout, err := NeedsRollout(&c.reconciliationTime, c.KCP.Spec.RolloutAfter, c.KCP.Spec.RolloutBefore, c.InfraResources, c.KubeadmConfigs, c.KCP, m) - if err != nil { - return nil, nil, err - } - if needsRollout { - machinesNeedingRollout.Insert(m) - rolloutReasons[m.Name] = reason - } - } - return machinesNeedingRollout, rolloutReasons, nil +// NotUpToDateMachines return a list of machines that are not up to date with the control +// plane's configuration. +func (c *ControlPlane) NotUpToDateMachines() (collections.Machines, map[string][]string) { + return c.machinesNotUptoDate, c.machinesNotUptoDateConditionMessages } // UpToDateMachines returns the machines that are up to date with the control -// plane's configuration and therefore do not require rollout. -func (c *ControlPlane) UpToDateMachines() (collections.Machines, error) { - upToDateMachines := make(collections.Machines, len(c.Machines)) - for _, m := range c.Machines { - _, needsRollout, err := NeedsRollout(&c.reconciliationTime, c.KCP.Spec.RolloutAfter, c.KCP.Spec.RolloutBefore, c.InfraResources, c.KubeadmConfigs, c.KCP, m) - if err != nil { - return nil, err - } - if !needsRollout { - upToDateMachines.Insert(m) - } - } - return upToDateMachines, nil +// plane's configuration. +func (c *ControlPlane) UpToDateMachines() collections.Machines { + return c.Machines.Difference(c.machinesNotUptoDate) } // getInfraResources fetches the external infrastructure resource for each machine in the collection and returns a map of machine.Name -> infraResource. @@ -316,6 +317,7 @@ func (c *ControlPlane) PatchMachines(ctx context.Context) error { controlplanev1.MachineEtcdPodHealthyCondition, controlplanev1.MachineEtcdMemberHealthyCondition, }}, patch.WithOwnedV1Beta2Conditions{Conditions: []string{ + clusterv1.MachineUpToDateV1Beta2Condition, controlplanev1.KubeadmControlPlaneMachineAPIServerPodHealthyV1Beta2Condition, controlplanev1.KubeadmControlPlaneMachineControllerManagerPodHealthyV1Beta2Condition, controlplanev1.KubeadmControlPlaneMachineSchedulerPodHealthyV1Beta2Condition, diff --git a/controlplane/kubeadm/internal/control_plane_test.go b/controlplane/kubeadm/internal/control_plane_test.go index 792bdd360c89..820397035133 100644 --- a/controlplane/kubeadm/internal/control_plane_test.go +++ b/controlplane/kubeadm/internal/control_plane_test.go @@ -22,6 +22,7 @@ import ( . 
"github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/utils/ptr" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1" @@ -30,8 +31,6 @@ import ( ) func TestControlPlane(t *testing.T) { - g := NewWithT(t) - t.Run("Failure domains", func(t *testing.T) { controlPlane := &ControlPlane{ KCP: &controlplanev1.KubeadmControlPlane{}, @@ -53,14 +52,88 @@ func TestControlPlane(t *testing.T) { } t.Run("With all machines in known failure domain, should return the FD with most number of machines", func(*testing.T) { + g := NewWithT(t) g.Expect(*controlPlane.FailureDomainWithMostMachines(ctx, controlPlane.Machines)).To(Equal("two")) }) t.Run("With some machines in non defined failure domains", func(*testing.T) { + g := NewWithT(t) controlPlane.Machines.Insert(machine("machine-5", withFailureDomain("unknown"))) g.Expect(*controlPlane.FailureDomainWithMostMachines(ctx, controlPlane.Machines)).To(Equal("unknown")) }) }) + + t.Run("MachinesUpToDate", func(t *testing.T) { + g := NewWithT(t) + cluster := &clusterv1.Cluster{ + Status: clusterv1.ClusterStatus{ + FailureDomains: clusterv1.FailureDomains{ + "one": failureDomain(true), + "two": failureDomain(true), + "three": failureDomain(true), + }, + }, + } + kcp := &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + Version: "v1.31.0", + }, + } + machines := collections.Machines{ + "machine-1": &clusterv1.Machine{ + ObjectMeta: metav1.ObjectMeta{Name: "m1"}, + Spec: clusterv1.MachineSpec{ + Version: ptr.To("v1.31.0"), + FailureDomain: ptr.To("one"), + InfrastructureRef: corev1.ObjectReference{Kind: "GenericInfrastructureMachine", APIVersion: "infrastructure.cluster.x-k8s.io/v1beta1", Name: "m1"}, + }}, + "machine-2": &clusterv1.Machine{ + ObjectMeta: metav1.ObjectMeta{Name: "m2"}, + Spec: clusterv1.MachineSpec{ + Version: ptr.To("v1.29.0"), // not up-to-date + FailureDomain: ptr.To("two"), + InfrastructureRef: corev1.ObjectReference{Kind: "GenericInfrastructureMachine", APIVersion: "infrastructure.cluster.x-k8s.io/v1beta1", Name: "m2"}, + }}, + "machine-3": &clusterv1.Machine{ + ObjectMeta: metav1.ObjectMeta{Name: "m3", DeletionTimestamp: ptr.To(metav1.Now())}, // deleted + Spec: clusterv1.MachineSpec{ + Version: ptr.To("v1.29.3"), // not up-to-date + FailureDomain: ptr.To("three"), + InfrastructureRef: corev1.ObjectReference{Kind: "GenericInfrastructureMachine", APIVersion: "infrastructure.cluster.x-k8s.io/v1beta1", Name: "m3"}, + }}, + "machine-4": &clusterv1.Machine{ + ObjectMeta: metav1.ObjectMeta{Name: "m4", DeletionTimestamp: ptr.To(metav1.Now())}, // deleted + Spec: clusterv1.MachineSpec{ + Version: ptr.To("v1.31.0"), + FailureDomain: ptr.To("two"), + InfrastructureRef: corev1.ObjectReference{Kind: "GenericInfrastructureMachine", APIVersion: "infrastructure.cluster.x-k8s.io/v1beta1", Name: "m1"}, + }}, + } + controlPlane, err := NewControlPlane(ctx, nil, env.GetClient(), cluster, kcp, machines) + g.Expect(err).NotTo(HaveOccurred()) + + g.Expect(controlPlane.Machines).To(HaveLen(4)) + + machinesNotUptoDate, machinesNotUptoDateConditionMessages := controlPlane.NotUpToDateMachines() + g.Expect(machinesNotUptoDate.Names()).To(ConsistOf("m2", "m3")) + g.Expect(machinesNotUptoDateConditionMessages).To(HaveLen(2)) + g.Expect(machinesNotUptoDateConditionMessages).To(HaveKeyWithValue("m2", []string{"Version v1.29.0, v1.31.0 required"})) + 
g.Expect(machinesNotUptoDateConditionMessages).To(HaveKeyWithValue("m3", []string{"Version v1.29.3, v1.31.0 required"})) + + machinesNeedingRollout, machinesNotUptoDateLogMessages := controlPlane.MachinesNeedingRollout() + g.Expect(machinesNeedingRollout.Names()).To(ConsistOf("m2")) + g.Expect(machinesNotUptoDateLogMessages).To(HaveLen(2)) + g.Expect(machinesNotUptoDateLogMessages).To(HaveKeyWithValue("m2", []string{"Machine version \"v1.29.0\" is not equal to KCP version \"v1.31.0\""})) + g.Expect(machinesNotUptoDateLogMessages).To(HaveKeyWithValue("m3", []string{"Machine version \"v1.29.3\" is not equal to KCP version \"v1.31.0\""})) + + upToDateMachines := controlPlane.UpToDateMachines() + g.Expect(upToDateMachines).To(HaveLen(2)) + g.Expect(upToDateMachines.Names()).To(ConsistOf("m1", "m4")) + + fd, err := controlPlane.NextFailureDomainForScaleUp(ctx) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(fd).To(Equal(ptr.To("two"))) // deleted up-to-date machines should not be counted when picking the next failure domain for scale up + }) } func TestHasMachinesToBeRemediated(t *testing.T) { diff --git a/controlplane/kubeadm/internal/controllers/controller.go b/controlplane/kubeadm/internal/controllers/controller.go index f1ece2c75a90..414647e2cf7d 100644 --- a/controlplane/kubeadm/internal/controllers/controller.go +++ b/controlplane/kubeadm/internal/controllers/controller.go @@ -29,6 +29,7 @@ import ( apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" kerrors "k8s.io/apimachinery/pkg/util/errors" + "k8s.io/apimachinery/pkg/util/sets" "k8s.io/client-go/tools/record" "k8s.io/klog/v2" "k8s.io/utils/ptr" @@ -374,7 +375,7 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, controlPl // Wait for the cluster infrastructure to be ready before creating machines if !controlPlane.Cluster.Status.InfrastructureReady { - // Note: in future we might want to move this inside reconcileControlPlaneConditions. + // Note: in future we might want to move this inside reconcileControlPlaneAndMachinesConditions. v1beta2conditions.Set(controlPlane.KCP, metav1.Condition{ Type: controlplanev1.KubeadmControlPlaneEtcdClusterHealthyV1Beta2Condition, Status: metav1.ConditionUnknown, @@ -400,7 +401,7 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, controlPl // If ControlPlaneEndpoint is not set, return early if !controlPlane.Cluster.Spec.ControlPlaneEndpoint.IsValid() { - // Note: in future we might want to move this inside reconcileControlPlaneConditions. + // Note: in future we might want to move this inside reconcileControlPlaneAndMachinesConditions. v1beta2conditions.Set(controlPlane.KCP, metav1.Condition{ Type: controlplanev1.KubeadmControlPlaneEtcdClusterHealthyV1Beta2Condition, Status: metav1.ConditionUnknown, @@ -437,7 +438,7 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, controlPl // Updates conditions reporting the status of static pods and the status of the etcd cluster. // NOTE: Conditions reporting KCP operation progress like e.g. Resized or SpecUpToDate are inlined with the rest of the execution. - if err := r.reconcileControlPlaneConditions(ctx, controlPlane); err != nil { + if err := r.reconcileControlPlaneAndMachinesConditions(ctx, controlPlane); err != nil { return ctrl.Result{}, err } @@ -460,17 +461,14 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, controlPl } // Control plane machines rollout due to configuration changes (e.g. 
upgrades) takes precedence over other operations. - machinesNeedingRollout, rolloutReasons, err := controlPlane.MachinesNeedingRollout() - if err != nil { - return ctrl.Result{}, err - } + machinesNeedingRollout, machinesNeedingRolloutLogMessages := controlPlane.MachinesNeedingRollout() switch { case len(machinesNeedingRollout) > 0: - var reasons []string - for _, rolloutReason := range rolloutReasons { - reasons = append(reasons, rolloutReason) + var allMessages []string + for machine, messages := range machinesNeedingRolloutLogMessages { + allMessages = append(allMessages, fmt.Sprintf("Machine %s needs rollout: %s", machine, strings.Join(messages, ","))) } - log.Info(fmt.Sprintf("Rolling out Control Plane machines: %s", strings.Join(reasons, ",")), "machinesNeedingRollout", machinesNeedingRollout.Names()) + log.Info(fmt.Sprintf("Rolling out Control Plane machines: %s", strings.Join(allMessages, ",")), "machinesNeedingRollout", machinesNeedingRollout.Names()) conditions.MarkFalse(controlPlane.KCP, controlplanev1.MachinesSpecUpToDateCondition, controlplanev1.RollingUpdateInProgressReason, clusterv1.ConditionSeverityWarning, "Rolling %d replicas with outdated spec (%d replicas up to date)", len(machinesNeedingRollout), len(controlPlane.Machines)-len(machinesNeedingRollout)) return r.upgradeControlPlane(ctx, controlPlane, machinesNeedingRollout) default: @@ -607,7 +605,7 @@ func (r *KubeadmControlPlaneReconciler) reconcileDelete(ctx context.Context, con // Updates conditions reporting the status of static pods and the status of the etcd cluster. // NOTE: Ignoring failures given that we are deleting - if err := r.reconcileControlPlaneConditions(ctx, controlPlane); err != nil { + if err := r.reconcileControlPlaneAndMachinesConditions(ctx, controlPlane); err != nil { log.Error(err, "Failed to reconcile conditions") } @@ -817,7 +815,7 @@ func (r *KubeadmControlPlaneReconciler) syncMachines(ctx context.Context, contro controlPlane.Machines[machineName] = updatedMachine // Since the machine is updated, re-create the patch helper so that any subsequent // Patch calls use the correct base machine object to calculate the diffs. - // Example: reconcileControlPlaneConditions patches the machine objects in a subsequent call + // Example: reconcileControlPlaneAndMachinesConditions patches the machine objects in a subsequent call // and, it should use the updated machine to calculate the diff. // Note: If the patchHelpers are not re-computed based on the new updated machines, subsequent // Patch calls will fail because the patch will be calculated based on an outdated machine and will error @@ -874,9 +872,18 @@ func (r *KubeadmControlPlaneReconciler) syncMachines(ctx context.Context, contro return nil } -// reconcileControlPlaneConditions is responsible of reconciling conditions reporting the status of static pods and -// the status of the etcd cluster. -func (r *KubeadmControlPlaneReconciler) reconcileControlPlaneConditions(ctx context.Context, controlPlane *internal.ControlPlane) (reterr error) { +// reconcileControlPlaneAndMachinesConditions is responsible of reconciling conditions reporting the status of static pods and +// the status of the etcd cluster both on the KubeadmControlPlane and on machines. +// It also reconciles the UpToDate condition on Machines, so we can update them with a single patch operation. 
+func (r *KubeadmControlPlaneReconciler) reconcileControlPlaneAndMachinesConditions(ctx context.Context, controlPlane *internal.ControlPlane) (reterr error) { + defer func() { + // Patch machines with the updated conditions. + reterr = kerrors.NewAggregate([]error{reterr, controlPlane.PatchMachines(ctx)}) + }() + + // Always reconcile machine's UpToDate condition + reconcileMachineUpToDateCondition(ctx, controlPlane) + // If the cluster is not yet initialized, there is no way to connect to the workload cluster and fetch information // for updating conditions. Return early. // We additionally check for the Available condition. The Available condition is set at the same time @@ -889,28 +896,19 @@ func (r *KubeadmControlPlaneReconciler) reconcileControlPlaneConditions(ctx cont controlPlaneInitialized := conditions.Get(controlPlane.KCP, controlplanev1.AvailableCondition) if !controlPlane.KCP.Status.Initialized || controlPlaneInitialized == nil || controlPlaneInitialized.Status != corev1.ConditionTrue { - v1beta2conditions.Set(controlPlane.KCP, metav1.Condition{ - Type: controlplanev1.KubeadmControlPlaneEtcdClusterHealthyV1Beta2Condition, - Status: metav1.ConditionUnknown, - Reason: controlplanev1.KubeadmControlPlaneEtcdClusterInspectionFailedV1Beta2Reason, - Message: "Waiting for Cluster control plane to be initialized", - }) - - v1beta2conditions.Set(controlPlane.KCP, metav1.Condition{ - Type: controlplanev1.KubeadmControlPlaneControlPlaneComponentsHealthyV1Beta2Condition, - Status: metav1.ConditionUnknown, - Reason: controlplanev1.KubeadmControlPlaneControlPlaneComponentsInspectionFailedV1Beta2Reason, - Message: "Waiting for Cluster control plane to be initialized", + // Overwrite conditions to InspectionFailed. + setConditionsToUnknown(setConditionsToUnknownInput{ + ControlPlane: controlPlane, + Overwrite: true, + EtcdClusterHealthyReason: controlplanev1.KubeadmControlPlaneEtcdClusterInspectionFailedV1Beta2Reason, + ControlPlaneComponentsHealthyReason: controlplanev1.KubeadmControlPlaneControlPlaneComponentsInspectionFailedV1Beta2Reason, + StaticPodReason: controlplanev1.KubeadmControlPlaneMachinePodInspectionFailedV1Beta2Reason, + EtcdMemberHealthyReason: controlplanev1.KubeadmControlPlaneMachineEtcdMemberInspectionFailedV1Beta2Reason, + Message: "Waiting for Cluster control plane to be initialized", }) - return nil } - defer func() { - // Patch machines with the updated conditions. - reterr = kerrors.NewAggregate([]error{reterr, controlPlane.PatchMachines(ctx)}) - }() - // Remote conditions grace period is counted from the later of last probe success and control plane initialized. lastProbeSuccessTime := r.ClusterCache.GetLastProbeSuccessTimestamp(ctx, client.ObjectKeyFromObject(controlPlane.Cluster)) if time.Since(maxTime(lastProbeSuccessTime, controlPlaneInitialized.LastTransitionTime.Time)) > r.RemoteConditionsGracePeriod { @@ -968,6 +966,34 @@ func (r *KubeadmControlPlaneReconciler) reconcileControlPlaneConditions(ctx cont return nil } +func reconcileMachineUpToDateCondition(_ context.Context, controlPlane *internal.ControlPlane) { + machinesNotUptoDate, machinesNotUptoDateConditionMessages := controlPlane.NotUpToDateMachines() + machinesNotUptoDateNames := sets.New(machinesNotUptoDate.Names()...) 
+ + for _, machine := range controlPlane.Machines { + if machinesNotUptoDateNames.Has(machine.Name) { + message := "" + if reasons, ok := machinesNotUptoDateConditionMessages[machine.Name]; ok { + message = strings.Join(reasons, "; ") + } + + v1beta2conditions.Set(machine, metav1.Condition{ + Type: clusterv1.MachineUpToDateV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineNotUpToDateV1Beta2Reason, + Message: message, + }) + + continue + } + v1beta2conditions.Set(machine, metav1.Condition{ + Type: clusterv1.MachineUpToDateV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineUpToDateV1Beta2Reason, + }) + } +} + type setConditionsToUnknownInput struct { ControlPlane *internal.ControlPlane Overwrite bool @@ -1050,7 +1076,7 @@ func maxTime(t1, t2 time.Time) time.Time { // reconcileEtcdMembers ensures the number of etcd members is in sync with the number of machines/nodes. // This is usually required after a machine deletion. // -// NOTE: this func uses KCP conditions, it is required to call reconcileControlPlaneConditions before this. +// NOTE: this func uses KCP conditions, it is required to call reconcileControlPlaneAndMachinesConditions before this. func (r *KubeadmControlPlaneReconciler) reconcileEtcdMembers(ctx context.Context, controlPlane *internal.ControlPlane) error { log := ctrl.LoggerFrom(ctx) diff --git a/controlplane/kubeadm/internal/controllers/controller_test.go b/controlplane/kubeadm/internal/controllers/controller_test.go index fe1e893e84fb..5bb3c12a87f1 100644 --- a/controlplane/kubeadm/internal/controllers/controller_test.go +++ b/controlplane/kubeadm/internal/controllers/controller_test.go @@ -1842,7 +1842,7 @@ func TestKubeadmControlPlaneReconciler_syncMachines(t *testing.T) { g.Expect(updatedDeletingMachine.Spec).Should(BeComparableTo(deletingMachine.Spec)) } -func TestKubeadmControlPlaneReconciler_reconcileControlPlaneConditions(t *testing.T) { +func TestKubeadmControlPlaneReconciler_reconcileControlPlaneAndMachinesConditions(t *testing.T) { now := time.Now() defaultMachine1 := clusterv1.Machine{ @@ -1850,7 +1850,14 @@ func TestKubeadmControlPlaneReconciler_reconcileControlPlaneConditions(t *testin Name: "machine1-test", Namespace: metav1.NamespaceDefault, }, + Spec: clusterv1.MachineSpec{ + Version: ptr.To("v1.31.0"), + InfrastructureRef: corev1.ObjectReference{Kind: "GenericInfrastructureMachine", APIVersion: "infrastructure.cluster.x-k8s.io/v1beta1", Name: "m1"}, + }, } + defaultMachine1NotUpToDate := defaultMachine1.DeepCopy() + defaultMachine1NotUpToDate.Spec.Version = ptr.To("v1.30.0") + defaultMachine2 := clusterv1.Machine{ ObjectMeta: metav1.ObjectMeta{ Name: "machine2-test", @@ -1870,6 +1877,9 @@ func TestKubeadmControlPlaneReconciler_reconcileControlPlaneConditions(t *testin Name: "kcp-test", Namespace: metav1.NamespaceDefault, }, + Spec: controlplanev1.KubeadmControlPlaneSpec{ + Version: "v1.31.0", + }, Status: controlplanev1.KubeadmControlPlaneStatus{ Initialized: true, Conditions: clusterv1.Conditions{ @@ -1912,6 +1922,9 @@ func TestKubeadmControlPlaneReconciler_reconcileControlPlaneConditions(t *testin }) return kcp }(), + Machines: map[string]*clusterv1.Machine{ + defaultMachine1.Name: defaultMachine1.DeepCopy(), + }, }, expectKCPConditions: []metav1.Condition{ { @@ -1932,6 +1945,194 @@ func TestKubeadmControlPlaneReconciler_reconcileControlPlaneConditions(t *testin Message: "Waiting for Cluster control plane to be initialized", }, }, + expectMachineConditions: []metav1.Condition{ + { + Type: 
controlplanev1.KubeadmControlPlaneMachineAPIServerPodHealthyV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: controlplanev1.KubeadmControlPlaneMachinePodInspectionFailedV1Beta2Reason, + Message: "Waiting for Cluster control plane to be initialized", + }, + { + Type: controlplanev1.KubeadmControlPlaneMachineControllerManagerPodHealthyV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: controlplanev1.KubeadmControlPlaneMachinePodInspectionFailedV1Beta2Reason, + Message: "Waiting for Cluster control plane to be initialized", + }, + { + Type: controlplanev1.KubeadmControlPlaneMachineSchedulerPodHealthyV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: controlplanev1.KubeadmControlPlaneMachinePodInspectionFailedV1Beta2Reason, + Message: "Waiting for Cluster control plane to be initialized", + }, + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: controlplanev1.KubeadmControlPlaneMachinePodInspectionFailedV1Beta2Reason, + Message: "Waiting for Cluster control plane to be initialized", + }, + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: controlplanev1.KubeadmControlPlaneMachineEtcdMemberInspectionFailedV1Beta2Reason, + Message: "Waiting for Cluster control plane to be initialized", + }, + { + Type: clusterv1.MachineUpToDateV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineUpToDateV1Beta2Reason, + }, + }, + }, + { + name: "Machines up to date", + controlPlane: func() *internal.ControlPlane { + controlPlane, err := internal.NewControlPlane(ctx, nil, env.GetClient(), defaultCluster, defaultKCP, collections.FromMachines( + defaultMachine1.DeepCopy(), + )) + if err != nil { + panic(err) + } + return controlPlane + }(), + managementCluster: &fakeManagementCluster{ + Workload: &fakeWorkloadCluster{ + Workload: &internal.Workload{ + Client: fake.NewClientBuilder().Build(), + }, + }, + }, + expectKCPConditions: []metav1.Condition{ + { + Type: controlplanev1.KubeadmControlPlaneInitializedV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: controlplanev1.KubeadmControlPlaneInitializedV1Beta2Reason, + }, + { + Type: controlplanev1.KubeadmControlPlaneEtcdClusterHealthyV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: controlplanev1.KubeadmControlPlaneEtcdClusterHealthUnknownV1Beta2Reason, + Message: "Following Machines are reporting etcd member unknown: machine1-test", + }, + { + Type: controlplanev1.KubeadmControlPlaneControlPlaneComponentsHealthyV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: controlplanev1.KubeadmControlPlaneControlPlaneComponentsHealthUnknownV1Beta2Reason, + Message: "Following Machines are reporting control plane unknown: machine1-test", + }, + }, + expectMachineConditions: []metav1.Condition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineAPIServerPodHealthyV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: controlplanev1.KubeadmControlPlaneMachinePodInspectionFailedV1Beta2Reason, + Message: "Node does not exist", + }, + { + Type: controlplanev1.KubeadmControlPlaneMachineControllerManagerPodHealthyV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: controlplanev1.KubeadmControlPlaneMachinePodInspectionFailedV1Beta2Reason, + Message: "Node does not exist", + }, + { + Type: controlplanev1.KubeadmControlPlaneMachineSchedulerPodHealthyV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: 
controlplanev1.KubeadmControlPlaneMachinePodInspectionFailedV1Beta2Reason, + Message: "Node does not exist", + }, + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: controlplanev1.KubeadmControlPlaneMachinePodInspectionFailedV1Beta2Reason, + Message: "Node does not exist", + }, + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: controlplanev1.KubeadmControlPlaneMachineEtcdMemberInspectionFailedV1Beta2Reason, + Message: "Node does not exist", + }, + { + Type: clusterv1.MachineUpToDateV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineUpToDateV1Beta2Reason, + }, + }, + }, + { + name: "Machines not up to date", + controlPlane: func() *internal.ControlPlane { + controlPlane, err := internal.NewControlPlane(ctx, nil, env.GetClient(), defaultCluster, defaultKCP, collections.FromMachines( + defaultMachine1NotUpToDate.DeepCopy(), + )) + if err != nil { + panic(err) + } + return controlPlane + }(), + managementCluster: &fakeManagementCluster{ + Workload: &fakeWorkloadCluster{ + Workload: &internal.Workload{ + Client: fake.NewClientBuilder().Build(), + }, + }, + }, + expectKCPConditions: []metav1.Condition{ + { + Type: controlplanev1.KubeadmControlPlaneInitializedV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: controlplanev1.KubeadmControlPlaneInitializedV1Beta2Reason, + }, + { + Type: controlplanev1.KubeadmControlPlaneEtcdClusterHealthyV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: controlplanev1.KubeadmControlPlaneEtcdClusterHealthUnknownV1Beta2Reason, + Message: "Following Machines are reporting etcd member unknown: machine1-test", + }, + { + Type: controlplanev1.KubeadmControlPlaneControlPlaneComponentsHealthyV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: controlplanev1.KubeadmControlPlaneControlPlaneComponentsHealthUnknownV1Beta2Reason, + Message: "Following Machines are reporting control plane unknown: machine1-test", + }, + }, + expectMachineConditions: []metav1.Condition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineAPIServerPodHealthyV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: controlplanev1.KubeadmControlPlaneMachinePodInspectionFailedV1Beta2Reason, + Message: "Node does not exist", + }, + { + Type: controlplanev1.KubeadmControlPlaneMachineControllerManagerPodHealthyV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: controlplanev1.KubeadmControlPlaneMachinePodInspectionFailedV1Beta2Reason, + Message: "Node does not exist", + }, + { + Type: controlplanev1.KubeadmControlPlaneMachineSchedulerPodHealthyV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: controlplanev1.KubeadmControlPlaneMachinePodInspectionFailedV1Beta2Reason, + Message: "Node does not exist", + }, + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: controlplanev1.KubeadmControlPlaneMachinePodInspectionFailedV1Beta2Reason, + Message: "Node does not exist", + }, + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: controlplanev1.KubeadmControlPlaneMachineEtcdMemberInspectionFailedV1Beta2Reason, + Message: "Node does not exist", + }, + { + Type: clusterv1.MachineUpToDateV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineNotUpToDateV1Beta2Reason, + Message: "Version 
v1.30.0, v1.31.0 required", + }, + }, }, { name: "connection down, preserve conditions as they have been set before (remote conditions grace period not passed yet)", @@ -2014,6 +2215,11 @@ func TestKubeadmControlPlaneReconciler_reconcileControlPlaneConditions(t *testin Status: metav1.ConditionTrue, Reason: controlplanev1.KubeadmControlPlaneMachinePodRunningV1Beta2Reason, }, + { + Type: clusterv1.MachineUpToDateV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineUpToDateV1Beta2Reason, + }, }, }, { @@ -2098,6 +2304,11 @@ func TestKubeadmControlPlaneReconciler_reconcileControlPlaneConditions(t *testin Reason: controlplanev1.KubeadmControlPlaneMachineEtcdMemberConnectionDownV1Beta2Reason, Message: fmt.Sprintf("Last successful probe at %s", now.Add(-3*time.Minute).Format(time.RFC3339)), }, + { + Type: clusterv1.MachineUpToDateV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineUpToDateV1Beta2Reason, + }, }, }, { @@ -2204,6 +2415,11 @@ func TestKubeadmControlPlaneReconciler_reconcileControlPlaneConditions(t *testin Reason: controlplanev1.KubeadmControlPlaneMachineEtcdMemberConnectionDownV1Beta2Reason, Message: fmt.Sprintf("Last successful probe at %s", now.Add(-6*time.Minute).Format(time.RFC3339)), }, + { + Type: clusterv1.MachineUpToDateV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineUpToDateV1Beta2Reason, + }, }, }, { @@ -2271,6 +2487,11 @@ func TestKubeadmControlPlaneReconciler_reconcileControlPlaneConditions(t *testin Reason: controlplanev1.KubeadmControlPlaneMachineEtcdMemberInspectionFailedV1Beta2Reason, Message: "Please check controller logs for errors", }, + { + Type: clusterv1.MachineUpToDateV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineUpToDateV1Beta2Reason, + }, }, }, { @@ -2338,7 +2559,7 @@ func TestKubeadmControlPlaneReconciler_reconcileControlPlaneConditions(t *testin } tc.controlPlane.SetPatchHelpers(patchHelpers) - err := r.reconcileControlPlaneConditions(ctx, tc.controlPlane) + err := r.reconcileControlPlaneAndMachinesConditions(ctx, tc.controlPlane) if tc.expectErr != "" { g.Expect(err).To(HaveOccurred()) g.Expect(err.Error()).To(Equal(tc.expectErr)) diff --git a/controlplane/kubeadm/internal/controllers/remediation.go b/controlplane/kubeadm/internal/controllers/remediation.go index 409f946c5b47..0aa0063fea8d 100644 --- a/controlplane/kubeadm/internal/controllers/remediation.go +++ b/controlplane/kubeadm/internal/controllers/remediation.go @@ -462,7 +462,7 @@ func (r *KubeadmControlPlaneReconciler) checkRetryLimits(log logr.Logger, machin // - etc. // // NOTE: this func assumes the list of members in sync with the list of machines/nodes, it is required to call reconcileEtcdMembers -// as well as reconcileControlPlaneConditions before this. +// as well as reconcileControlPlaneAndMachinesConditions before this. func (r *KubeadmControlPlaneReconciler) canSafelyRemoveEtcdMember(ctx context.Context, controlPlane *internal.ControlPlane, machineToBeRemediated *clusterv1.Machine) (bool, error) { log := ctrl.LoggerFrom(ctx) diff --git a/controlplane/kubeadm/internal/controllers/scale.go b/controlplane/kubeadm/internal/controllers/scale.go index 7f366e19c8b5..aa3a8190408b 100644 --- a/controlplane/kubeadm/internal/controllers/scale.go +++ b/controlplane/kubeadm/internal/controllers/scale.go @@ -157,7 +157,7 @@ func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane( // - All the health conditions on the control plane machines are true. 
// If the control plane is not passing preflight checks, it requeue. // -// NOTE: this func uses KCP conditions, it is required to call reconcileControlPlaneConditions before this. +// NOTE: this func uses KCP conditions, it is required to call reconcileControlPlaneAndMachinesConditions before this. func (r *KubeadmControlPlaneReconciler) preflightChecks(ctx context.Context, controlPlane *internal.ControlPlane, excludeFor ...*clusterv1.Machine) (ctrl.Result, error) { //nolint:unparam logger := ctrl.LoggerFrom(ctx) diff --git a/controlplane/kubeadm/internal/controllers/status.go b/controlplane/kubeadm/internal/controllers/status.go index 2327d19f2cc5..1480a6691f9f 100644 --- a/controlplane/kubeadm/internal/controllers/status.go +++ b/controlplane/kubeadm/internal/controllers/status.go @@ -45,10 +45,7 @@ func (r *KubeadmControlPlaneReconciler) updateStatus(ctx context.Context, contro // This is necessary for CRDs including scale subresources. controlPlane.KCP.Status.Selector = selector.String() - upToDateMachines, err := controlPlane.UpToDateMachines() - if err != nil { - return errors.Wrapf(err, "failed to update status") - } + upToDateMachines := controlPlane.UpToDateMachines() controlPlane.KCP.Status.UpdatedReplicas = int32(len(upToDateMachines)) replicas := int32(len(controlPlane.Machines)) @@ -153,11 +150,11 @@ func (r *KubeadmControlPlaneReconciler) updateV1Beta2Status(ctx context.Context, return } - // Note: some of the status is set on reconcileControlPlaneConditions (EtcdClusterHealthy, ControlPlaneComponentsHealthy conditions), + // Note: some of the status is set on reconcileControlPlaneAndMachinesConditions (EtcdClusterHealthy, ControlPlaneComponentsHealthy conditions), // reconcileClusterCertificates (CertificatesAvailable condition), and also in the defer patch at the end of // the main reconcile loop (status.ObservedGeneration) etc - // Note: KCP also sets status on machines in reconcileUnhealthyMachines and reconcileControlPlaneConditions; if for + // Note: KCP also sets status on machines in reconcileUnhealthyMachines and reconcileControlPlaneAndMachinesConditions; if for // any reason those functions are not called before, e.g. an error, this func relies on existing Machine's condition. 
setReplicas(ctx, controlPlane.KCP, controlPlane.Machines) @@ -502,7 +499,7 @@ func aggregateUnhealthyMachines(machines collections.Machines) string { } else { message += " are " } - message += "not healthy (not to be remediated by KCP)" + message += "not healthy (not to be remediated by KubeadmControlPlane)" return message } diff --git a/controlplane/kubeadm/internal/controllers/status_test.go b/controlplane/kubeadm/internal/controllers/status_test.go index aee7b88a53d4..1ffadfda241f 100644 --- a/controlplane/kubeadm/internal/controllers/status_test.go +++ b/controlplane/kubeadm/internal/controllers/status_test.go @@ -565,7 +565,7 @@ func Test_setRemediatingCondition(t *testing.T) { Type: controlplanev1.KubeadmControlPlaneRemediatingV1Beta2Condition, Status: metav1.ConditionFalse, Reason: controlplanev1.KubeadmControlPlaneNotRemediatingV1Beta2Reason, - Message: "Machine m2 is not healthy (not to be remediated by KCP)", + Message: "Machine m2 is not healthy (not to be remediated by KubeadmControlPlane)", }, }, { @@ -582,7 +582,7 @@ func Test_setRemediatingCondition(t *testing.T) { Type: controlplanev1.KubeadmControlPlaneRemediatingV1Beta2Condition, Status: metav1.ConditionFalse, Reason: controlplanev1.KubeadmControlPlaneNotRemediatingV1Beta2Reason, - Message: "Machines m1, m2 are not healthy (not to be remediated by KCP)", + Message: "Machines m1, m2 are not healthy (not to be remediated by KubeadmControlPlane)", }, }, } diff --git a/controlplane/kubeadm/internal/filters.go b/controlplane/kubeadm/internal/filters.go index acb54d51e545..65a788e62dbb 100644 --- a/controlplane/kubeadm/internal/filters.go +++ b/controlplane/kubeadm/internal/filters.go @@ -20,7 +20,6 @@ import ( "encoding/json" "fmt" "reflect" - "strings" "github.com/pkg/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -41,65 +40,74 @@ import ( // - mutated in-place (ex: NodeDrainTimeout) // - are not dictated by KCP (ex: ProviderID) // - are not relevant for the rollout decision (ex: failureDomain). 
-func matchesMachineSpec(infraConfigs map[string]*unstructured.Unstructured, machineConfigs map[string]*bootstrapv1.KubeadmConfig, kcp *controlplanev1.KubeadmControlPlane, machine *clusterv1.Machine) (string, bool, error) { - mismatchReasons := []string{} +func matchesMachineSpec(infraConfigs map[string]*unstructured.Unstructured, machineConfigs map[string]*bootstrapv1.KubeadmConfig, kcp *controlplanev1.KubeadmControlPlane, machine *clusterv1.Machine) (bool, []string, []string, error) { + logMessages := []string{} + conditionMessages := []string{} if !collections.MatchesKubernetesVersion(kcp.Spec.Version)(machine) { machineVersion := "" if machine != nil && machine.Spec.Version != nil { machineVersion = *machine.Spec.Version } - mismatchReasons = append(mismatchReasons, fmt.Sprintf("Machine version %q is not equal to KCP version %q", machineVersion, kcp.Spec.Version)) + logMessages = append(logMessages, fmt.Sprintf("Machine version %q is not equal to KCP version %q", machineVersion, kcp.Spec.Version)) + conditionMessages = append(conditionMessages, fmt.Sprintf("Version %s, %s required", machineVersion, kcp.Spec.Version)) } reason, matches, err := matchesKubeadmBootstrapConfig(machineConfigs, kcp, machine) if err != nil { - return "", false, errors.Wrapf(err, "failed to match Machine spec") + return false, nil, nil, errors.Wrapf(err, "failed to match Machine spec") } if !matches { - mismatchReasons = append(mismatchReasons, reason) + logMessages = append(logMessages, reason) + conditionMessages = append(conditionMessages, "KubeadmConfig is not up-to-date") } if reason, matches := matchesTemplateClonedFrom(infraConfigs, kcp, machine); !matches { - mismatchReasons = append(mismatchReasons, reason) + logMessages = append(logMessages, reason) + conditionMessages = append(conditionMessages, fmt.Sprintf("%s is not up-to-date", machine.Spec.InfrastructureRef.Kind)) } - if len(mismatchReasons) > 0 { - return strings.Join(mismatchReasons, ","), false, nil + if len(logMessages) > 0 || len(conditionMessages) > 0 { + return false, logMessages, conditionMessages, nil } - return "", true, nil + return true, nil, nil, nil } -// NeedsRollout checks if a Machine needs to be rolled out and returns the reason why. -func NeedsRollout(reconciliationTime, rolloutAfter *metav1.Time, rolloutBefore *controlplanev1.RolloutBefore, infraConfigs map[string]*unstructured.Unstructured, machineConfigs map[string]*bootstrapv1.KubeadmConfig, kcp *controlplanev1.KubeadmControlPlane, machine *clusterv1.Machine) (string, bool, error) { - rolloutReasons := []string{} +// UpToDate checks if a Machine is up to date with the control plane's configuration. +// If not, messages explaining why are provided with different level of detail for logs and conditions. +func UpToDate(machine *clusterv1.Machine, kcp *controlplanev1.KubeadmControlPlane, reconciliationTime *metav1.Time, infraConfigs map[string]*unstructured.Unstructured, machineConfigs map[string]*bootstrapv1.KubeadmConfig) (bool, []string, []string, error) { + logMessages := []string{} + conditionMessages := []string{} // Machines whose certificates are about to expire. 
- if collections.ShouldRolloutBefore(reconciliationTime, rolloutBefore)(machine) { - rolloutReasons = append(rolloutReasons, "certificates will expire soon, rolloutBefore expired") + if collections.ShouldRolloutBefore(reconciliationTime, kcp.Spec.RolloutBefore)(machine) { + logMessages = append(logMessages, "certificates will expire soon, rolloutBefore expired") + conditionMessages = append(conditionMessages, "Certificates will expire soon") } // Machines that are scheduled for rollout (KCP.Spec.RolloutAfter set, // the RolloutAfter deadline is expired, and the machine was created before the deadline). - if collections.ShouldRolloutAfter(reconciliationTime, rolloutAfter)(machine) { - rolloutReasons = append(rolloutReasons, "rolloutAfter expired") + if collections.ShouldRolloutAfter(reconciliationTime, kcp.Spec.RolloutAfter)(machine) { + logMessages = append(logMessages, "rolloutAfter expired") + conditionMessages = append(conditionMessages, "KubeadmControlPlane spec.rolloutAfter expired") } // Machines that do not match with KCP config. - mismatchReason, matches, err := matchesMachineSpec(infraConfigs, machineConfigs, kcp, machine) + matches, specLogMessages, specConditionMessages, err := matchesMachineSpec(infraConfigs, machineConfigs, kcp, machine) if err != nil { - return "", false, errors.Wrapf(err, "failed to determine if Machine %s needs rollout", machine.Name) + return false, nil, nil, errors.Wrapf(err, "failed to determine if Machine %s is up-to-date", machine.Name) } if !matches { - rolloutReasons = append(rolloutReasons, mismatchReason) + logMessages = append(logMessages, specLogMessages...) + conditionMessages = append(conditionMessages, specConditionMessages...) } - if len(rolloutReasons) > 0 { - return fmt.Sprintf("Machine %s needs rollout: %s", machine.Name, strings.Join(rolloutReasons, ",")), true, nil + if len(logMessages) > 0 || len(conditionMessages) > 0 { + return false, logMessages, conditionMessages, nil } - return "", false, nil + return true, nil, nil, nil } // matchesTemplateClonedFrom checks if a Machine has a corresponding infrastructure machine that diff --git a/controlplane/kubeadm/internal/filters_test.go b/controlplane/kubeadm/internal/filters_test.go index db78d4b06a4f..1d6ffde3c029 100644 --- a/controlplane/kubeadm/internal/filters_test.go +++ b/controlplane/kubeadm/internal/filters_test.go @@ -19,11 +19,13 @@ package internal import ( "encoding/json" "testing" + "time" . 
"github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/utils/ptr" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" bootstrapv1 "sigs.k8s.io/cluster-api/bootstrap/kubeadm/api/v1beta1" @@ -1420,3 +1422,173 @@ func TestMatchesTemplateClonedFrom_WithClonedFromAnnotations(t *testing.T) { }) } } + +func TestUpToDate(t *testing.T) { + reconciliationTime := metav1.Now() + + defaultKcp := &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + Replicas: nil, + Version: "v1.31.0", + MachineTemplate: controlplanev1.KubeadmControlPlaneMachineTemplate{ + InfrastructureRef: corev1.ObjectReference{APIVersion: "infrastructure.cluster.x-k8s.io/v1beta1", Kind: "AWSMachineTemplate", Name: "template1"}, + }, + KubeadmConfigSpec: bootstrapv1.KubeadmConfigSpec{ + ClusterConfiguration: &bootstrapv1.ClusterConfiguration{ + ClusterName: "foo", + }, + }, + RolloutBefore: &controlplanev1.RolloutBefore{ + CertificatesExpiryDays: ptr.To(int32(60)), // rollout if certificates will expire in less then 60 days. + }, + RolloutAfter: ptr.To(metav1.Time{Time: reconciliationTime.Add(10 * 24 * time.Hour)}), // rollout 10 days from now. + }, + } + defaultMachine := &clusterv1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + CreationTimestamp: metav1.Time{Time: reconciliationTime.Add(-2 * 24 * time.Hour)}, // two days ago. + Annotations: map[string]string{ + controlplanev1.KubeadmClusterConfigurationAnnotation: "{\n \"clusterName\": \"foo\"\n}", + }, + }, + Spec: clusterv1.MachineSpec{ + Version: ptr.To("v1.31.0"), + InfrastructureRef: corev1.ObjectReference{APIVersion: "infrastructure.cluster.x-k8s.io/v1beta1", Kind: "AWSMachine", Name: "infra-machine1"}, + }, + Status: clusterv1.MachineStatus{ + CertificatesExpiryDate: &metav1.Time{Time: reconciliationTime.Add(100 * 24 * time.Hour)}, // certificates will expire in 100 days from now. + }, + } + + defaultInfraConfigs := map[string]*unstructured.Unstructured{ + defaultMachine.Name: { + Object: map[string]interface{}{ + "kind": "AWSMachine", + "apiVersion": "infrastructure.cluster.x-k8s.io/v1beta1", + "metadata": map[string]interface{}{ + "name": "infra-config1", + "namespace": "default", + "annotations": map[string]interface{}{ + "cluster.x-k8s.io/cloned-from-name": "template1", + "cluster.x-k8s.io/cloned-from-groupkind": "AWSMachineTemplate.infrastructure.cluster.x-k8s.io", + }, + }, + }, + }, + } + + defaultMachineConfigs := map[string]*bootstrapv1.KubeadmConfig{ + defaultMachine.Name: { + Spec: bootstrapv1.KubeadmConfigSpec{ + InitConfiguration: &bootstrapv1.InitConfiguration{}, // first control-plane + }, + }, + } + + tests := []struct { + name string + kcp *controlplanev1.KubeadmControlPlane + machine *clusterv1.Machine + infraConfigs map[string]*unstructured.Unstructured + machineConfigs map[string]*bootstrapv1.KubeadmConfig + expectUptoDate bool + expectLogMessages []string + expectConditionMessages []string + }{ + { + name: "machine up-to-date", + kcp: defaultKcp, + machine: defaultMachine, + infraConfigs: defaultInfraConfigs, + machineConfigs: defaultMachineConfigs, + expectUptoDate: true, + expectLogMessages: nil, + expectConditionMessages: nil, + }, + { + name: "certificate are expiring soon", + kcp: func() *controlplanev1.KubeadmControlPlane { + kcp := defaultKcp.DeepCopy() + kcp.Spec.RolloutBefore = &controlplanev1.RolloutBefore{ + CertificatesExpiryDays: ptr.To(int32(150)), // rollout if certificates will expire in less then 150 days. 
+ } + return kcp + }(), + machine: defaultMachine, // certificates will expire in 100 days from now. + infraConfigs: defaultInfraConfigs, + machineConfigs: defaultMachineConfigs, + expectUptoDate: false, + expectLogMessages: []string{"certificates will expire soon, rolloutBefore expired"}, + expectConditionMessages: []string{"Certificates will expire soon"}, + }, + { + name: "rollout after expired", + kcp: func() *controlplanev1.KubeadmControlPlane { + kcp := defaultKcp.DeepCopy() + kcp.Spec.RolloutAfter = ptr.To(metav1.Time{Time: reconciliationTime.Add(-1 * 24 * time.Hour)}) // one day ago + return kcp + }(), + machine: defaultMachine, // created two days ago + infraConfigs: defaultInfraConfigs, + machineConfigs: defaultMachineConfigs, + expectUptoDate: false, + expectLogMessages: []string{"rolloutAfter expired"}, + expectConditionMessages: []string{"KubeadmControlPlane spec.rolloutAfter expired"}, + }, + { + name: "kubernetes version does not match", + kcp: func() *controlplanev1.KubeadmControlPlane { + kcp := defaultKcp.DeepCopy() + kcp.Spec.Version = "v1.31.2" + return kcp + }(), + machine: defaultMachine, // defaultMachine has "v1.31.0" + infraConfigs: defaultInfraConfigs, + machineConfigs: defaultMachineConfigs, + expectUptoDate: false, + expectLogMessages: []string{"Machine version \"v1.31.0\" is not equal to KCP version \"v1.31.2\""}, + expectConditionMessages: []string{"Version v1.31.0, v1.31.2 required"}, + }, + { + name: "KubeadmConfig is not up-to-date", + kcp: func() *controlplanev1.KubeadmControlPlane { + kcp := defaultKcp.DeepCopy() + kcp.Spec.KubeadmConfigSpec.ClusterConfiguration.ClusterName = "bar" + return kcp + }(), + machine: defaultMachine, // was created with cluster name "foo" + infraConfigs: defaultInfraConfigs, + machineConfigs: defaultMachineConfigs, + expectUptoDate: false, + expectLogMessages: []string{"Machine KubeadmConfig ClusterConfiguration is outdated: diff: &v1beta1.ClusterConfiguration{\n ... // 10 identical fields\n ImageRepository: \"\",\n FeatureGates: nil,\n- ClusterName: \"foo\",\n+ ClusterName: \"bar\",\n }"}, + expectConditionMessages: []string{"KubeadmConfig is not up-to-date"}, + }, + { + name: "AWSMachine is not up-to-date", + kcp: func() *controlplanev1.KubeadmControlPlane { + kcp := defaultKcp.DeepCopy() + kcp.Spec.MachineTemplate.InfrastructureRef = corev1.ObjectReference{APIVersion: "infrastructure.cluster.x-k8s.io/v1beta1", Kind: "AWSMachineTemplate", Name: "template2"} // kcp moving to template 2 + return kcp + }(), + machine: defaultMachine, + infraConfigs: defaultInfraConfigs, // infra config cloned from template1 + machineConfigs: defaultMachineConfigs, + expectUptoDate: false, + expectLogMessages: []string{"Infrastructure template on KCP rotated from AWSMachineTemplate.infrastructure.cluster.x-k8s.io template1 to AWSMachineTemplate.infrastructure.cluster.x-k8s.io template2"}, + expectConditionMessages: []string{"AWSMachine is not up-to-date"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + g := NewWithT(t) + + upToDate, logMessages, conditionMessages, err := UpToDate(tt.machine, tt.kcp, &reconciliationTime, tt.infraConfigs, tt.machineConfigs) + g.Expect(err).ToNot(HaveOccurred()) + + g.Expect(upToDate).To(Equal(tt.expectUptoDate)) + g.Expect(logMessages).To(Equal(tt.expectLogMessages)) + g.Expect(conditionMessages).To(Equal(tt.expectConditionMessages)) + }) + } +}
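
Addendum (not part of the diff): for readers who want to see the contract this patch establishes in isolation, below is a minimal, self-contained Go sketch of how the new UpToDate condition ends up looking on a Machine. It is hedged and illustrative: the local constants mirror the ones added to api/v1beta1/machine_types.go, the helper name setUpToDateCondition and the sample message are assumptions modeled on reconcileMachineUpToDateCondition and the test expectations above, and the sketch uses the standard apimachinery condition helpers instead of cluster-api's v1beta2conditions utilities (which the real controller uses) so it compiles without importing cluster-api.

package main

import (
	"fmt"
	"strings"

	"k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// Local mirrors of the constants this patch adds to api/v1beta1/machine_types.go,
// so the sketch compiles without importing cluster-api.
const (
	machineUpToDateCondition = "UpToDate"
	machineUpToDateReason    = "UpToDate"
	machineNotUpToDateReason = "NotUpToDate"
)

// setUpToDateCondition mirrors the shape of reconcileMachineUpToDateCondition in
// controller.go: if the owner found mismatches, UpToDate is False and the per-machine
// condition messages are joined with "; "; otherwise it is True.
func setUpToDateCondition(conditions *[]metav1.Condition, conditionMessages []string) {
	if len(conditionMessages) > 0 {
		meta.SetStatusCondition(conditions, metav1.Condition{
			Type:    machineUpToDateCondition,
			Status:  metav1.ConditionFalse,
			Reason:  machineNotUpToDateReason,
			Message: strings.Join(conditionMessages, "; "),
		})
		return
	}
	meta.SetStatusCondition(conditions, metav1.Condition{
		Type:   machineUpToDateCondition,
		Status: metav1.ConditionTrue,
		Reason: machineUpToDateReason,
	})
}

func main() {
	var conditions []metav1.Condition

	// A machine flagged by UpToDate() with a single condition message,
	// matching the "Machines not up to date" test case above.
	setUpToDateCondition(&conditions, []string{"Version v1.30.0, v1.31.0 required"})
	c := meta.FindStatusCondition(conditions, machineUpToDateCondition)
	fmt.Printf("%s=%s reason=%s message=%q\n", c.Type, c.Status, c.Reason, c.Message)

	// The same machine after it has been rolled out to the KCP version.
	setUpToDateCondition(&conditions, nil)
	c = meta.FindStatusCondition(conditions, machineUpToDateCondition)
	fmt.Printf("%s=%s reason=%s\n", c.Type, c.Status, c.Reason)
}

Note the design choice visible in filters.go: UpToDate returns two message lists, where logMessages carry the detailed diffs (joined with "," into the rollout log line in controller.go) and conditionMessages carry the terse, user-facing strings; only the latter are written into the Machine's UpToDate condition, as sketched here.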