diff --git a/controlplane/kubeadm/internal/controllers/fakes_test.go b/controlplane/kubeadm/internal/controllers/fakes_test.go
index cf9fcbafe66e..12a3691f3464 100644
--- a/controlplane/kubeadm/internal/controllers/fakes_test.go
+++ b/controlplane/kubeadm/internal/controllers/fakes_test.go
@@ -66,6 +66,14 @@ func (f *fakeManagementCluster) GetMachinePoolsForCluster(c context.Context, clu
 	return f.MachinePools, nil
 }
 
+type fakeManagementClusterWithGetWorkloadClusterError struct {
+	fakeManagementCluster
+}
+
+func (f *fakeManagementClusterWithGetWorkloadClusterError) GetWorkloadCluster(_ context.Context, _ client.ObjectKey) (internal.WorkloadCluster, error) {
+	return nil, errors.New("failed to get workload cluster")
+}
+
 type fakeWorkloadCluster struct {
 	*internal.Workload
 	Status internal.ClusterStatus
diff --git a/controlplane/kubeadm/internal/controllers/status.go b/controlplane/kubeadm/internal/controllers/status.go
index f135baf0aae2..48eb20eec24f 100644
--- a/controlplane/kubeadm/internal/controllers/status.go
+++ b/controlplane/kubeadm/internal/controllers/status.go
@@ -41,10 +41,11 @@ func (r *KubeadmControlPlaneReconciler) updateStatus(ctx context.Context, contro
 	replicas := int32(len(controlPlane.Machines))
 	desiredReplicas := *controlPlane.KCP.Spec.Replicas
 
-	// set basic data that does not require interacting with the workload cluster
+	// Set basic data that does not require interacting with the workload cluster.
 	controlPlane.KCP.Status.Replicas = replicas
-	controlPlane.KCP.Status.ReadyReplicas = 0
-	controlPlane.KCP.Status.UnavailableReplicas = replicas
+	// Status.ReadyReplicas is only ever 0 on the first reconcile of a KCP, in which case Status.UnavailableReplicas is set to `desiredReplicas`.
+	// On later reconciles it is kept unchanged as long as `desiredReplicas` does not change, so that it is not reset when we are unable to get the workload cluster.
+	controlPlane.KCP.Status.UnavailableReplicas = desiredReplicas - controlPlane.KCP.Status.ReadyReplicas
 
 	// Return early if the deletion timestamp is set, because we don't want to try to connect to the workload cluster
 	// and we don't want to report resize condition (because it is set to deleting into reconcile delete).
@@ -90,7 +91,7 @@ func (r *KubeadmControlPlaneReconciler) updateStatus(ctx context.Context, contro
 		return err
 	}
 	controlPlane.KCP.Status.ReadyReplicas = status.ReadyNodes
-	controlPlane.KCP.Status.UnavailableReplicas = replicas - status.ReadyNodes
+	controlPlane.KCP.Status.UnavailableReplicas = desiredReplicas - status.ReadyNodes
 
 	// This only gets initialized once and does not change if the kubeadm config map goes away.
 	if status.HasKubeadmConfig {
diff --git a/controlplane/kubeadm/internal/controllers/status_test.go b/controlplane/kubeadm/internal/controllers/status_test.go
index 833d0f717e42..0a10036fc9e9 100644
--- a/controlplane/kubeadm/internal/controllers/status_test.go
+++ b/controlplane/kubeadm/internal/controllers/status_test.go
@@ -54,7 +54,8 @@ func TestKubeadmControlPlaneReconciler_updateStatusNoMachines(t *testing.T) {
 			Name:      "foo",
 		},
 		Spec: controlplanev1.KubeadmControlPlaneSpec{
-			Version: "v1.16.6",
+			Version:  "v1.16.6",
+			Replicas: ptr.To[int32](1),
 			MachineTemplate: controlplanev1.KubeadmControlPlaneMachineTemplate{
 				InfrastructureRef: corev1.ObjectReference{
 					APIVersion: "test/v1alpha1",
@@ -89,7 +90,7 @@ func TestKubeadmControlPlaneReconciler_updateStatusNoMachines(t *testing.T) {
 	g.Expect(r.updateStatus(ctx, controlPlane)).To(Succeed())
 	g.Expect(kcp.Status.Replicas).To(BeEquivalentTo(0))
 	g.Expect(kcp.Status.ReadyReplicas).To(BeEquivalentTo(0))
-	g.Expect(kcp.Status.UnavailableReplicas).To(BeEquivalentTo(0))
+	g.Expect(kcp.Status.UnavailableReplicas).To(BeEquivalentTo(1))
 	g.Expect(kcp.Status.Initialized).To(BeFalse())
 	g.Expect(kcp.Status.Ready).To(BeFalse())
 	g.Expect(kcp.Status.Selector).NotTo(BeEmpty())
@@ -117,7 +118,8 @@ func TestKubeadmControlPlaneReconciler_updateStatusAllMachinesNotReady(t *testin
 			Name:      "foo",
 		},
 		Spec: controlplanev1.KubeadmControlPlaneSpec{
-			Version: "v1.16.6",
+			Version:  "v1.16.6",
+			Replicas: ptr.To[int32](3),
 			MachineTemplate: controlplanev1.KubeadmControlPlaneMachineTemplate{
 				InfrastructureRef: corev1.ObjectReference{
 					APIVersion: "test/v1alpha1",
@@ -190,7 +192,8 @@ func TestKubeadmControlPlaneReconciler_updateStatusAllMachinesReady(t *testing.T
 			Name:      "foo",
 		},
 		Spec: controlplanev1.KubeadmControlPlaneSpec{
-			Version: "v1.16.6",
+			Version:  "v1.16.6",
+			Replicas: ptr.To[int32](3),
 			MachineTemplate: controlplanev1.KubeadmControlPlaneMachineTemplate{
 				InfrastructureRef: corev1.ObjectReference{
 					APIVersion: "test/v1alpha1",
@@ -271,7 +274,8 @@ func TestKubeadmControlPlaneReconciler_updateStatusMachinesReadyMixed(t *testing
 			Name:      "foo",
 		},
 		Spec: controlplanev1.KubeadmControlPlaneSpec{
-			Version: "v1.16.6",
+			Version:  "v1.16.6",
+			Replicas: ptr.To[int32](5),
 			MachineTemplate: controlplanev1.KubeadmControlPlaneMachineTemplate{
 				InfrastructureRef: corev1.ObjectReference{
 					APIVersion: "test/v1alpha1",
@@ -331,6 +335,81 @@ func TestKubeadmControlPlaneReconciler_updateStatusMachinesReadyMixed(t *testing
 	g.Expect(kcp.Status.Ready).To(BeTrue())
 }
 
+func TestKubeadmControlPlaneReconciler_updateStatusCannotGetWorkloadClusterStatus(t *testing.T) {
+	g := NewWithT(t)
+
+	cluster := &clusterv1.Cluster{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      "foo",
+			Namespace: metav1.NamespaceDefault,
+		},
+	}
+
+	kcp := &controlplanev1.KubeadmControlPlane{
+		TypeMeta: metav1.TypeMeta{
+			Kind:       "KubeadmControlPlane",
+			APIVersion: controlplanev1.GroupVersion.String(),
+		},
+		ObjectMeta: metav1.ObjectMeta{
+			Namespace: cluster.Namespace,
+			Name:      "foo",
+		},
+		Spec: controlplanev1.KubeadmControlPlaneSpec{
+			Version:  "v1.16.6",
+			Replicas: ptr.To[int32](3),
+			MachineTemplate: controlplanev1.KubeadmControlPlaneMachineTemplate{
+				InfrastructureRef: corev1.ObjectReference{
+					APIVersion: "test/v1alpha1",
+					Kind:       "UnknownInfraMachine",
+					Name:       "foo",
+				},
+			},
+		},
+		Status: controlplanev1.KubeadmControlPlaneStatus{
+			Ready:               true,
+			Replicas:            3,
+			ReadyReplicas:       3,
+			UpdatedReplicas:     3,
+			UnavailableReplicas: 0,
+		},
+	}
+	webhook := &controlplanev1webhooks.KubeadmControlPlane{}
+	g.Expect(webhook.Default(ctx, kcp)).To(Succeed())
+	_, err := webhook.ValidateCreate(ctx, kcp)
+	g.Expect(err).ToNot(HaveOccurred())
+
+	machines := map[string]*clusterv1.Machine{}
+	objs := []client.Object{cluster.DeepCopy(), kcp.DeepCopy()}
+	for i := 0; i < 3; i++ {
+		name := fmt.Sprintf("test-%d", i)
+		m, n := createMachineNodePair(name, cluster, kcp, true)
+		objs = append(objs, n, m)
+		machines[m.Name] = m
+	}
+
+	fakeClient := newFakeClient(objs...)
+
+	r := &KubeadmControlPlaneReconciler{
+		Client:            fakeClient,
+		managementCluster: &fakeManagementClusterWithGetWorkloadClusterError{},
+		recorder:          record.NewFakeRecorder(32),
+	}
+
+	controlPlane := &internal.ControlPlane{
+		KCP:      kcp,
+		Cluster:  cluster,
+		Machines: machines,
+	}
+	controlPlane.InjectTestManagementCluster(r.managementCluster)
+
+	// When updateStatus() returns a non-nil error (e.g. when it is unable to get the workload cluster), the original kcp.Status should not be updated.
+	g.Expect(r.updateStatus(ctx, controlPlane)).To(HaveOccurred())
+	g.Expect(kcp.Status.Replicas).To(BeEquivalentTo(3))
+	g.Expect(kcp.Status.ReadyReplicas).To(BeEquivalentTo(3))
+	g.Expect(kcp.Status.UnavailableReplicas).To(BeEquivalentTo(0))
+	g.Expect(kcp.Status.Ready).To(BeTrue())
+}
+
 func TestKubeadmControlPlaneReconciler_machinesCreatedIsIsTrueEvenWhenTheNodesAreNotReady(t *testing.T) {
 	g := NewWithT(t)
 
diff --git a/internal/controllers/machineset/machineset_controller.go b/internal/controllers/machineset/machineset_controller.go
index 58682dba2c7c..a7daaa0c0342 100644
--- a/internal/controllers/machineset/machineset_controller.go
+++ b/internal/controllers/machineset/machineset_controller.go
@@ -183,6 +183,10 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Re
 	// Requeue if the reconcile failed because the ClusterCacheTracker was locked for
 	// the current cluster because of concurrent access.
 	if errors.Is(err, remote.ErrClusterLocked) {
+		if aggr, ok := err.(kerrors.Aggregate); ok && len(aggr.Errors()) > 1 {
+			// Log the full aggregate if it contains more than just ErrClusterLocked.
+			log.Info(aggr.Error())
+		}
 		log.V(5).Info("Requeuing because another worker has the lock on the ClusterCacheTracker")
 		return ctrl.Result{RequeueAfter: time.Minute}, nil
 	}
@@ -844,7 +848,8 @@ func (r *Reconciler) shouldAdopt(ms *clusterv1.MachineSet) bool {
 }
 
 // updateStatus updates the Status field for the MachineSet
-// It checks for the current state of the replicas and updates the Status of the MachineSet.
+// It checks for the current state of the replicas and updates the Status field of the MachineSet.
+// When it is unable to retrieve the Node status, it returns an error and does not update the Status field of the MachineSet.
 func (r *Reconciler) updateStatus(ctx context.Context, cluster *clusterv1.Cluster, ms *clusterv1.MachineSet, filteredMachines []*clusterv1.Machine) error {
 	log := ctrl.LoggerFrom(ctx)
 	newStatus := ms.Status.DeepCopy()
@@ -882,8 +887,7 @@ func (r *Reconciler) updateStatus(ctx context.Context, cluster *clusterv1.Cluste
 
 		node, err := r.getMachineNode(ctx, cluster, machine)
 		if err != nil && machine.GetDeletionTimestamp().IsZero() {
-			log.Error(err, "Unable to retrieve Node status", "node", klog.KObj(node))
-			continue
+			return errors.Wrapf(err, "unable to retrieve the status of Node %s", klog.KObj(node))
 		}
 
 		if noderefutil.IsNodeReady(node) {
@@ -956,6 +960,9 @@ func (r *Reconciler) getMachineNode(ctx context.Context, cluster *clusterv1.Clus
 	}
 	node := &corev1.Node{}
 	if err := remoteClient.Get(ctx, client.ObjectKey{Name: machine.Status.NodeRef.Name}, node); err != nil {
+		if apierrors.IsNotFound(err) {
+			return nil, nil
+		}
 		return nil, errors.Wrapf(err, "error retrieving node %s for machine %s/%s", machine.Status.NodeRef.Name, machine.Namespace, machine.Name)
 	}
 	return node, nil
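The KCP hunk above follows one invariant: derive UnavailableReplicas from the last-known ReadyReplicas before contacting the workload cluster, and return early on failure so the previously reported counters survive. A minimal runnable sketch of that pattern; the Status struct and readyNodes callback are illustrative stand-ins, not cluster-api types:

```go
package main

import (
	"errors"
	"fmt"
)

// Status mirrors the replica counters involved in the change (stand-in type).
type Status struct {
	ReadyReplicas       int32
	UnavailableReplicas int32
}

func updateStatus(s *Status, desiredReplicas int32, readyNodes func() (int32, error)) error {
	// On the first reconcile ReadyReplicas is 0, so this sets
	// UnavailableReplicas = desiredReplicas; on later reconciles it keeps the
	// previously observed value as long as desiredReplicas is unchanged.
	s.UnavailableReplicas = desiredReplicas - s.ReadyReplicas

	ready, err := readyNodes()
	if err != nil {
		// Return before touching ReadyReplicas: the caller keeps the stale but
		// truthful last-known status instead of a reset one.
		return fmt.Errorf("getting ready nodes: %w", err)
	}
	s.ReadyReplicas = ready
	s.UnavailableReplicas = desiredReplicas - ready
	return nil
}

func main() {
	s := &Status{ReadyReplicas: 3, UnavailableReplicas: 0}
	unreachable := func() (int32, error) { return 0, errors.New("failed to get workload cluster") }
	if err := updateStatus(s, 3, unreachable); err != nil {
		fmt.Printf("error: %v, status preserved: %+v\n", err, *s)
	}
}
```

This is why the new test seeds kcp.Status with ReadyReplicas: 3 and asserts the counters are untouched after updateStatus fails.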
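On the Reconcile hunk: kerrors.Aggregate implements Is, so errors.Is(err, remote.ErrClusterLocked) also matches when the aggregate carries additional, unrelated errors; without the new branch those would be hidden behind the V(5) requeue message. A sketch of the behavior, using a hypothetical sentinel in place of remote.ErrClusterLocked:

```go
package main

import (
	"errors"
	"fmt"

	kerrors "k8s.io/apimachinery/pkg/util/errors"
)

// errClusterLocked stands in for remote.ErrClusterLocked.
var errClusterLocked = errors.New("cluster is locked already")

func main() {
	// An aggregate containing the lock error plus an unrelated failure.
	err := kerrors.NewAggregate([]error{
		errClusterLocked,
		errors.New("failed to update status"),
	})

	// errors.Is matches if any error inside the aggregate matches.
	if errors.Is(err, errClusterLocked) {
		if aggr, ok := err.(kerrors.Aggregate); ok && len(aggr.Errors()) > 1 {
			fmt.Println(aggr.Error()) // surface the other errors as well
		}
	}
}
```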