🐛 Do not update KCP and MS status when unable to get workload cluster #10229

Closed · wants to merge 3 commits
8 changes: 8 additions & 0 deletions controlplane/kubeadm/internal/controllers/fakes_test.go
@@ -66,6 +66,14 @@ func (f *fakeManagementCluster) GetMachinePoolsForCluster(c context.Context, clu
return f.MachinePools, nil
}

+type fakeManagementClusterWithGetWorkloadClusterError struct {
+	fakeManagementCluster
+}
+
+func (f *fakeManagementClusterWithGetWorkloadClusterError) GetWorkloadCluster(_ context.Context, _ client.ObjectKey) (internal.WorkloadCluster, error) {
+	return nil, errors.New("failed to get workload cluster")
+}

type fakeWorkloadCluster struct {
*internal.Workload
Status internal.ClusterStatus
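
The fake above relies on Go struct embedding: only GetWorkloadCluster is shadowed, while every other method is promoted from the embedded fakeManagementCluster. A minimal self-contained sketch of the same pattern (names are made up, not code from this PR):

package main

import (
	"errors"
	"fmt"
)

type baseFake struct{}

func (baseFake) GetMachines() string { return "machines from the embedded fake" }

func (baseFake) GetWorkload() (string, error) { return "workload", nil }

// failingFake embeds baseFake: GetMachines is promoted unchanged,
// while GetWorkload is shadowed by the method on the outer type.
type failingFake struct{ baseFake }

func (failingFake) GetWorkload() (string, error) {
	return "", errors.New("failed to get workload cluster")
}

func main() {
	f := failingFake{}
	fmt.Println(f.GetMachines()) // "machines from the embedded fake"
	_, err := f.GetWorkload()    // always errors, like the fake above
	fmt.Println(err)             // "failed to get workload cluster"
}
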
9 changes: 5 additions & 4 deletions controlplane/kubeadm/internal/controllers/status.go
@@ -41,10 +41,11 @@ func (r *KubeadmControlPlaneReconciler) updateStatus(ctx context.Context, contro
replicas := int32(len(controlPlane.Machines))
desiredReplicas := *controlPlane.KCP.Spec.Replicas

-	// set basic data that does not require interacting with the workload cluster
+	// Set basic data that does not require interacting with the workload cluster.
 	controlPlane.KCP.Status.Replicas = replicas
-	controlPlane.KCP.Status.ReadyReplicas = 0
-	controlPlane.KCP.Status.UnavailableReplicas = replicas
+	// Status.Replicas is only ever 0 on the first reconcile for KCP; in that case Status.UnavailableReplicas is set to `desiredReplicas`.
+	// Otherwise it stays unchanged as long as `desiredReplicas` does not change, so it is not reset when the workload cluster cannot be reached.
+	controlPlane.KCP.Status.UnavailableReplicas = desiredReplicas - controlPlane.KCP.Status.ReadyReplicas

// Return early if the deletion timestamp is set, because we don't want to try to connect to the workload cluster
// and we don't want to report resize condition (because it is set to deleting into reconcile delete).
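
To make the new arithmetic concrete, a small worked example with hypothetical values (illustrative only, not part of the diff):

package main

import "fmt"

func main() {
	desired := int32(3) // *controlPlane.KCP.Spec.Replicas

	// First reconcile: Status.ReadyReplicas still has its zero value,
	// so UnavailableReplicas starts out equal to desiredReplicas.
	ready := int32(0)
	fmt.Println(desired - ready) // 3

	// Later reconcile where GetWorkloadCluster fails: ReadyReplicas keeps
	// the last value that was successfully written (say 3), so the result
	// is unchanged instead of being reset to the full replica count.
	ready = 3
	fmt.Println(desired - ready) // 0
}
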
@@ -90,7 +91,7 @@ func (r *KubeadmControlPlaneReconciler) updateStatus(ctx context.Context, contro
return err
}
controlPlane.KCP.Status.ReadyReplicas = status.ReadyNodes
-	controlPlane.KCP.Status.UnavailableReplicas = replicas - status.ReadyNodes
+	controlPlane.KCP.Status.UnavailableReplicas = desiredReplicas - status.ReadyNodes

// This only gets initialized once and does not change if the kubeadm config map goes away.
if status.HasKubeadmConfig {
89 changes: 84 additions & 5 deletions controlplane/kubeadm/internal/controllers/status_test.go
@@ -54,7 +54,8 @@ func TestKubeadmControlPlaneReconciler_updateStatusNoMachines(t *testing.T) {
Name: "foo",
},
Spec: controlplanev1.KubeadmControlPlaneSpec{
Version: "v1.16.6",
Version: "v1.16.6",
Replicas: ptr.To[int32](1),
MachineTemplate: controlplanev1.KubeadmControlPlaneMachineTemplate{
InfrastructureRef: corev1.ObjectReference{
APIVersion: "test/v1alpha1",
@@ -89,7 +90,7 @@ func TestKubeadmControlPlaneReconciler_updateStatusNoMachines(t *testing.T) {
g.Expect(r.updateStatus(ctx, controlPlane)).To(Succeed())
g.Expect(kcp.Status.Replicas).To(BeEquivalentTo(0))
g.Expect(kcp.Status.ReadyReplicas).To(BeEquivalentTo(0))
-	g.Expect(kcp.Status.UnavailableReplicas).To(BeEquivalentTo(0))
+	g.Expect(kcp.Status.UnavailableReplicas).To(BeEquivalentTo(1))
g.Expect(kcp.Status.Initialized).To(BeFalse())
g.Expect(kcp.Status.Ready).To(BeFalse())
g.Expect(kcp.Status.Selector).NotTo(BeEmpty())
@@ -117,7 +118,8 @@ func TestKubeadmControlPlaneReconciler_updateStatusAllMachinesNotReady(t *testin
Name: "foo",
},
Spec: controlplanev1.KubeadmControlPlaneSpec{
-			Version: "v1.16.6",
+			Version:  "v1.16.6",
+			Replicas: ptr.To[int32](3),
MachineTemplate: controlplanev1.KubeadmControlPlaneMachineTemplate{
InfrastructureRef: corev1.ObjectReference{
APIVersion: "test/v1alpha1",
@@ -190,7 +192,8 @@ func TestKubeadmControlPlaneReconciler_updateStatusAllMachinesReady(t *testing.T
Name: "foo",
},
Spec: controlplanev1.KubeadmControlPlaneSpec{
-			Version: "v1.16.6",
+			Version:  "v1.16.6",
+			Replicas: ptr.To[int32](3),
MachineTemplate: controlplanev1.KubeadmControlPlaneMachineTemplate{
InfrastructureRef: corev1.ObjectReference{
APIVersion: "test/v1alpha1",
@@ -271,7 +274,8 @@ func TestKubeadmControlPlaneReconciler_updateStatusMachinesReadyMixed(t *testing
Name: "foo",
},
Spec: controlplanev1.KubeadmControlPlaneSpec{
-			Version: "v1.16.6",
+			Version:  "v1.16.6",
+			Replicas: ptr.To[int32](5),
MachineTemplate: controlplanev1.KubeadmControlPlaneMachineTemplate{
InfrastructureRef: corev1.ObjectReference{
APIVersion: "test/v1alpha1",
@@ -331,6 +335,81 @@ func TestKubeadmControlPlaneReconciler_updateStatusMachinesReadyMixed(t *testing
g.Expect(kcp.Status.Ready).To(BeTrue())
}

func TestKubeadmControlPlaneReconciler_updateStatusCannotGetWorkloadClusterStatus(t *testing.T) {
g := NewWithT(t)

cluster := &clusterv1.Cluster{
ObjectMeta: metav1.ObjectMeta{
Name: "foo",
Namespace: metav1.NamespaceDefault,
},
}

kcp := &controlplanev1.KubeadmControlPlane{
TypeMeta: metav1.TypeMeta{
Kind: "KubeadmControlPlane",
APIVersion: controlplanev1.GroupVersion.String(),
},
ObjectMeta: metav1.ObjectMeta{
Namespace: cluster.Namespace,
Name: "foo",
},
Spec: controlplanev1.KubeadmControlPlaneSpec{
Version: "v1.16.6",
Replicas: ptr.To[int32](3),
MachineTemplate: controlplanev1.KubeadmControlPlaneMachineTemplate{
InfrastructureRef: corev1.ObjectReference{
APIVersion: "test/v1alpha1",
Kind: "UnknownInfraMachine",
Name: "foo",
},
},
},
Status: controlplanev1.KubeadmControlPlaneStatus{
Ready: true,
Replicas: 3,
ReadyReplicas: 3,
UpdatedReplicas: 3,
UnavailableReplicas: 0,
},
}
webhook := &controlplanev1webhooks.KubeadmControlPlane{}
g.Expect(webhook.Default(ctx, kcp)).To(Succeed())
_, err := webhook.ValidateCreate(ctx, kcp)
g.Expect(err).ToNot(HaveOccurred())

machines := map[string]*clusterv1.Machine{}
objs := []client.Object{cluster.DeepCopy(), kcp.DeepCopy()}
for i := 0; i < 3; i++ {
name := fmt.Sprintf("test-%d", i)
m, n := createMachineNodePair(name, cluster, kcp, true)
objs = append(objs, n, m)
machines[m.Name] = m
}

fakeClient := newFakeClient(objs...)

r := &KubeadmControlPlaneReconciler{
Client: fakeClient,
managementCluster: &fakeManagementClusterWithGetWorkloadClusterError{},
recorder: record.NewFakeRecorder(32),
}

controlPlane := &internal.ControlPlane{
KCP: kcp,
Cluster: cluster,
Machines: machines,
}
controlPlane.InjectTestManagementCluster(r.managementCluster)

// When updateStatus() returns a non-nil error (e.g. when unable to get the workload cluster), the original kcp.Status should not be updated.
g.Expect(r.updateStatus(ctx, controlPlane)).To(HaveOccurred())
g.Expect(kcp.Status.Replicas).To(BeEquivalentTo(3))
g.Expect(kcp.Status.ReadyReplicas).To(BeEquivalentTo(3))
g.Expect(kcp.Status.UnavailableReplicas).To(BeEquivalentTo(0))
g.Expect(kcp.Status.Ready).To(BeTrue())
}
sbueringer (Member), Apr 5, 2024:
Let's add an additional call to updateStatus here:

  • drop one controlPlane.machines (or add another one)
  • call updateStatus
  • check that status.Replicas changed and the other ones stayed the same

jessehu (Contributor, Author):
Sure. Will add the case after the discussion is resolved: #10229 (comment)
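
For illustration, the follow-up sbueringer is asking for could be appended to the test above roughly like this (a hedged sketch: the exact expectations depend on which status fields updateStatus touches before the GetWorkloadCluster call fails):

	// Drop one machine and reconcile again: Status.Replicas should follow
	// the machine count, while the workload-cluster-derived fields stay frozen.
	delete(controlPlane.Machines, "test-0")
	g.Expect(r.updateStatus(ctx, controlPlane)).To(HaveOccurred())
	g.Expect(kcp.Status.Replicas).To(BeEquivalentTo(2))
	g.Expect(kcp.Status.ReadyReplicas).To(BeEquivalentTo(3))
	g.Expect(kcp.Status.UnavailableReplicas).To(BeEquivalentTo(0))
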


func TestKubeadmControlPlaneReconciler_machinesCreatedIsIsTrueEvenWhenTheNodesAreNotReady(t *testing.T) {
g := NewWithT(t)

Expand Down
13 changes: 10 additions & 3 deletions internal/controllers/machineset/machineset_controller.go
@@ -183,6 +183,10 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Re
// Requeue if the reconcile failed because the ClusterCacheTracker was locked for
// the current cluster because of concurrent access.
if errors.Is(err, remote.ErrClusterLocked) {
+		if aggr, ok := err.(kerrors.Aggregate); ok && len(aggr.Errors()) > 1 {
+			// Print the errors if it's not only ErrClusterLocked.
+			log.Info(aggr.Error())
+		}
log.V(5).Info("Requeuing because another worker has the lock on the ClusterCacheTracker")
return ctrl.Result{RequeueAfter: time.Minute}, nil
}
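
Context for the guard above: errors.Is on a kerrors.Aggregate reports a match if any wrapped error matches, so this branch is taken even when other errors are bundled alongside ErrClusterLocked, and without the extra logging those would be hidden behind the V(5) requeue message. A self-contained illustration (the sentinel and messages are stand-ins, not the real remote.ErrClusterLocked):

package main

import (
	"errors"
	"fmt"

	kerrors "k8s.io/apimachinery/pkg/util/errors"
)

var errClusterLocked = errors.New("cluster is locked already")

func main() {
	var err error = kerrors.NewAggregate([]error{
		errClusterLocked,
		errors.New("failed to patch machine"),
	})

	// Matches because at least one wrapped error is the sentinel.
	fmt.Println(errors.Is(err, errClusterLocked)) // true

	// The same shape as the patch: surface the aggregate when it
	// carries more than just the lock error.
	if aggr, ok := err.(kerrors.Aggregate); ok && len(aggr.Errors()) > 1 {
		fmt.Println(aggr.Error())
	}
}
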
@@ -844,7 +848,8 @@ func (r *Reconciler) shouldAdopt(ms *clusterv1.MachineSet) bool {
}

// updateStatus updates the Status field for the MachineSet
-// It checks for the current state of the replicas and updates the Status of the MachineSet.
+// It checks for the current state of the replicas and updates the Status field of the MachineSet.
+// When unable to retrieve the Node status, it returns an error and does not update the Status field of the MachineSet.
func (r *Reconciler) updateStatus(ctx context.Context, cluster *clusterv1.Cluster, ms *clusterv1.MachineSet, filteredMachines []*clusterv1.Machine) error {
sbueringer (Member), Apr 9, 2024:
Here is a proposed solution to the ErrClusterLocked issue (the one Fabrizio was referring to):

	// Retry getting a remote client.
	// Note: This is to ensure that we don't run into errors here just 
	// because multiple reconcilers try to create the client at the same time
	// (for the case where the workload cluster is reachable).
	var canCommunicateWithWorkloadCluster bool
	var remoteClient client.Client
	err = retry.OnError(wait.Backoff{
		Steps:    5,
		Duration: 200 * time.Millisecond,
		Factor:   1.0,
	}, func(err error) bool {
		// Retry as long as we get remote.ErrClusterLocked errors. 
		return errors.Is(err, remote.ErrClusterLocked)
	}, func() error {
		remoteClient, err = r.Tracker.GetClient(ctx, util.ObjectKey(cluster))
		if err != nil {
			return err
		}
		canCommunicateWithWorkloadCluster = true
		return nil
	})
	if err != nil {
		log.Error(err, "Unable to retrieve status of Nodes: failed to create remote client")
	}

	for _, machine := range filteredMachines {
		log := log.WithValues("Machine", klog.KObj(machine))

		if templateLabel.Matches(labels.Set(machine.Labels)) {
			fullyLabeledReplicasCount++
		}

		if machine.Status.NodeRef == nil {
			log.V(4).Info("Waiting for the machine controller to set status.nodeRef on the Machine")
			continue
		}

		if !canCommunicateWithWorkloadCluster {
			// Skip the rest of the for loop if we can't communicate with the workload cluster.
			continue
		}

		node, err := r.getMachineNode(ctx, remoteClient, machine)
		if err != nil && machine.GetDeletionTimestamp().IsZero() {
			log.Error(err, "Unable to retrieve Node status", "Node", klog.KObj(node))
			continue
		}

		if noderefutil.IsNodeReady(node) {
			readyReplicasCount++
			if noderefutil.IsNodeAvailable(node, ms.Spec.MinReadySeconds, metav1.Now()) {
				availableReplicasCount++
			}
		} else if machine.GetDeletionTimestamp().IsZero() {
			log.V(4).Info("Waiting for the Kubernetes node on the machine to report ready state")
		}
	}

sbueringer (Member):
The idea is that when the workload cluster is reachable, we only get ErrClusterLocked for a very short amount of time (the time it takes to create a client). For this case it is good enough to simply retry creating the client.

We will fallback to the previous behavior only if the workload cluster is actually not reachable.

jessehu (Contributor, Author), Apr 10, 2024:
Cool! I'm not quite sure it's worth adding this bulk of code to resolve a temporary inconsistency in some status replica and condition fields caused by the ErrClusterLocked error. The simple change in the current PR solves the problem; the remaining inconsistency is acceptable and won't cause issues per my thinking (maybe I missed some cases).
cc @fabriziopandini for awareness.

fabriziopandini (Member):
@sbueringer and I are in total agreement, and we are trying to help refocus the PR on the original issue. We also took some additional steps to help find a way forward by proposing an alternative solution.

With regard to the current PR, I have already tried to explain my concern, and I will be happy to add more if you have any doubts (I already answered one).

But given the concern above, I'm personally -1 to merge the PR in its current state.

Instead, we both think the change proposed by @sbueringer solves the original issues, but ultimately it is up to you to accept it or not.

jessehu (Contributor, Author), Apr 11, 2024:
Thanks @fabriziopandini for the details. Here are my two cents:

  1. Should this new if block return err instead of continue, since canCommunicateWithWorkloadCluster won't change inside the for loop?

		if !canCommunicateWithWorkloadCluster {
			// Skip the rest of the for loop if we can't communicate with the workload cluster.
			continue
		}

  2. There are many places that return ErrClusterLocked (not only in the MS controller and KCP controller). If we want this retry logic, shall we add it in the other places as well, or just here to resolve the original status-updating issue?
  3. The current PR code returns err in the for loop in updateStatus() in the MS controller when hitting ErrClusterLocked or any other error. In that case it won't update MS.Status, because the status update code comes after the for loop, at line 903: newStatus.Replicas = int32(len(filteredMachines)).
     However, the KCP controller is different: it already updates some status fields before hitting ErrClusterLocked.

machineset_controller.go:

 		if err != nil && machine.GetDeletionTimestamp().IsZero() {
-			log.Error(err, "Unable to retrieve Node status", "node", klog.KObj(node))
-			continue
+			return errors.Wrapf(err, "unable to retrieve the status of Node %s", klog.KObj(node))
 		}

jessehu (Contributor, Author):
One more thing: I'm not sure that canCommunicateWithWorkloadCluster == true can 100% guarantee getMachineNode() won't hit ErrClusterLocked, since they are two sequential function calls, not one atomic call.

sbueringer (Member):
> Should this new if block return err instead of continue, since canCommunicateWithWorkloadCluster won't change inside the for loop?

Our main concern is basically that if we return an error here as soon as we hit ErrClusterLocked we don't update the status at all anymore. This should be okay in the happy path, which is that we can actually communicate with the workload cluster and it's just a matter of waiting until another goroutine successfully created the client so we can use it. The problem we see is if the workload cluster is actually not reachable. Because in that case we will just continuously return the error forever. In this case we "freeze" the values of status: FullyLabeledReplicas, ReadyReplicas, AvailableReplicas, ObservedGeneration and a few conditions.

The concern Fabrizio raised (and I didn't have on my radar before) is that if we freeze the status indefinitely in these cases (or rather until the workload cluster is reachable) this can be pretty confusing for users. So for this case we should actually have a more extensive solution which also covers signalling to users that we can't communicate with the workload cluster anymore.

What we were trying to suggest was a mitigation for the happy path, where we actually can talk to the workload cluster but replicas are flipping only because of the way we initially create the client for a cluster.

> There are many places that return ErrClusterLocked (not only in the MS controller and KCP controller). If we want this retry logic, shall we add it in the other places as well, or just here to resolve the original status-updating issue?

I'm not really sure about this one. Basically, we introduced the current "TryLock" over a "Lock" to make sure we don't block all reconciles for a cluster when its workload cluster is not reachable. The "Lock" we had before led to really problematic performance issues when some workload clusters (with a lot of Machines) were unreachable.
So I think we should be careful with introducing these retries, as they can lead to serious performance degradation (just by introducing the retry here, every reconcile of a MachineSet of an unreachable cluster would take 1 second).
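
For readers following along, the TryLock-vs-Lock trade-off sbueringer describes can be sketched with Go's sync.Mutex (purely illustrative; the real ClusterCacheTracker logic is more involved):

package main

import (
	"errors"
	"fmt"
	"sync"
)

var errClusterLocked = errors.New("cluster is locked already") // stand-in sentinel

var mu sync.Mutex // stands in for the tracker's per-cluster lock

// getClient fails fast instead of blocking: if another worker is already
// creating the client (possibly hanging on an unreachable cluster), the
// caller gets a sentinel error and can requeue rather than pile up.
func getClient() error {
	if !mu.TryLock() {
		return errClusterLocked
	}
	defer mu.Unlock()
	// ... create and cache the remote client here ...
	return nil
}

func main() {
	mu.Lock() // simulate another worker holding the lock
	defer mu.Unlock()
	fmt.Println(getClient()) // cluster is locked already
}
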

log := ctrl.LoggerFrom(ctx)
newStatus := ms.Status.DeepCopy()
@@ -882,8 +887,7 @@ func (r *Reconciler) updateStatus(ctx context.Context, cluster *clusterv1.Cluste

node, err := r.getMachineNode(ctx, cluster, machine)
if err != nil && machine.GetDeletionTimestamp().IsZero() {
-			log.Error(err, "Unable to retrieve Node status", "node", klog.KObj(node))
-			continue
+			return errors.Wrapf(err, "unable to retrieve the status of Node %s", klog.KObj(node))
fabriziopandini (Member), Apr 9, 2024:
According to the issue:

> We probably also want to have a timeout on this behaviour. If we haven't seen the Node in x time then we assume its status is unknown

But if I'm not wrong, with the current implementation we are going to freeze the replica count indefinitely, which could be confusing or eventually also wrong, given that the number of replicas/machines might change in the meantime.

Frankly speaking, in order to properly fix this issue I think we first need to figure out how to get/store the "last seen" information for each node.

Given that info, we can decide whether it is ok to flip the replica status or to wait (but I don't have a concrete idea of how to do so; I need some time to research it).
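
One possible shape of that "last seen" bookkeeping, purely illustrative (the type, field names, and grace period are made up; this is not a design the thread settled on):

package main

import (
	"fmt"
	"time"
)

// nodeObservations remembers when each Node was last successfully read,
// so a controller could keep reporting cached readiness for a bounded
// time instead of freezing the replica counts indefinitely.
type nodeObservations struct {
	lastSeen map[string]time.Time
	grace    time.Duration
}

func (o *nodeObservations) markSeen(name string, now time.Time) {
	o.lastSeen[name] = now
}

// trustCached reports whether the cached status for a Node is still
// usable when the workload cluster cannot be reached right now.
func (o *nodeObservations) trustCached(name string, now time.Time) bool {
	seen, ok := o.lastSeen[name]
	return ok && now.Sub(seen) < o.grace
}

func main() {
	o := &nodeObservations{lastSeen: map[string]time.Time{}, grace: 5 * time.Minute}
	now := time.Now()
	o.markSeen("node-1", now)
	fmt.Println(o.trustCached("node-1", now.Add(2*time.Minute)))  // true
	fmt.Println(o.trustCached("node-1", now.Add(10*time.Minute))) // false: status unknown
}
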

sbueringer (Member), Apr 9, 2024:
Important point that I discussed with Fabrizio. We're not only freezing the replica fields, we would also freeze other status fields like ObservedGeneration and conditions.

}

if noderefutil.IsNodeReady(node) {
@@ -956,6 +960,9 @@ func (r *Reconciler) getMachineNode(ctx context.Context, cluster *clusterv1.Clus
}
node := &corev1.Node{}
if err := remoteClient.Get(ctx, client.ObjectKey{Name: machine.Status.NodeRef.Name}, node); err != nil {
+		if apierrors.IsNotFound(err) {
fabriziopandini (Member), Apr 9, 2024:
I'm not sure why we are ignoring not found.
Is that for the case when someone manually deletes a node? If yes, please add a comment (but it also seems unrelated to the issue we are trying to fix).

sbueringer (Member), Apr 9, 2024:
Just for clarification.

The idea behind this one was to signal to the calling function that the node doesn't exist (vs. it just being an error), because not finding the Node is also valuable information (which basically comes down to the node not being ready).

This was added to preserve the previous behavior one level above (where we only logged the error before but still considered a not-found node a node that is not ready).

But yeah, it becomes pretty hard to understand.

+			return nil, nil
+		}
return nil, errors.Wrapf(err, "error retrieving node %s for machine %s/%s", machine.Status.NodeRef.Name, machine.Namespace, machine.Name)
}
return node, nil
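
To make the caller-side effect concrete: returning (nil, nil) for a not-found Node means the loop in updateStatus treats the machine as simply not ready rather than aborting the whole status update. A simplified sketch (isNodeReady is a stand-in for noderefutil.IsNodeReady, which treats a nil Node as not ready):

package main

import "fmt"

type node struct{ ready bool }

// isNodeReady mirrors the behavior relevant here: a nil Node is never ready.
func isNodeReady(n *node) bool { return n != nil && n.ready }

func main() {
	var deleted *node // getMachineNode returned (nil, nil): the Node is gone
	fmt.Println(isNodeReady(deleted)) // false -> counted as not ready, no error

	existing := &node{ready: true}
	fmt.Println(isNodeReady(existing)) // true -> counted in readyReplicas
}
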