🐛 Do not update KCP and MS status when unable to get workload cluster #10229

Closed · wants to merge 3 commits
8 changes: 8 additions & 0 deletions controlplane/kubeadm/internal/controllers/fakes_test.go
@@ -66,6 +66,14 @@ func (f *fakeManagementCluster) GetMachinePoolsForCluster(c context.Context, clu
return f.MachinePools, nil
}

+type fakeManagementClusterWithGetWorkloadClusterError struct {
+	fakeManagementCluster
+}
+
+func (f *fakeManagementClusterWithGetWorkloadClusterError) GetWorkloadCluster(_ context.Context, _ client.ObjectKey) (internal.WorkloadCluster, error) {
+	return nil, errors.New("failed to get workload cluster")
+}

type fakeWorkloadCluster struct {
*internal.Workload
Status internal.ClusterStatus
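
The fake above relies on Go struct embedding: only GetWorkloadCluster is shadowed, while every other method is promoted from the embedded fakeManagementCluster. A minimal self-contained sketch of the same pattern (names are made up, not code from this PR):

package main

import (
	"errors"
	"fmt"
)

type baseFake struct{}

func (baseFake) GetMachines() string { return "machines from the embedded fake" }

func (baseFake) GetWorkload() (string, error) { return "workload", nil }

// failingFake embeds baseFake: GetMachines is promoted unchanged,
// while GetWorkload is shadowed by the method on the outer type.
type failingFake struct{ baseFake }

func (failingFake) GetWorkload() (string, error) {
	return "", errors.New("failed to get workload cluster")
}

func main() {
	f := failingFake{}
	fmt.Println(f.GetMachines()) // "machines from the embedded fake"
	_, err := f.GetWorkload()    // always errors, like the fake above
	fmt.Println(err)             // "failed to get workload cluster"
}
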
9 changes: 5 additions & 4 deletions controlplane/kubeadm/internal/controllers/status.go
@@ -41,10 +41,11 @@ func (r *KubeadmControlPlaneReconciler) updateStatus(ctx context.Context, contro
replicas := int32(len(controlPlane.Machines))
desiredReplicas := *controlPlane.KCP.Spec.Replicas

-	// set basic data that does not require interacting with the workload cluster
+	// Set basic data that does not require interacting with the workload cluster.
 	controlPlane.KCP.Status.Replicas = replicas
-	controlPlane.KCP.Status.ReadyReplicas = 0
-	controlPlane.KCP.Status.UnavailableReplicas = replicas
+	// Status.Replicas is only ever 0 on the first reconcile for KCP; in that case Status.UnavailableReplicas is set to `desiredReplicas`.
+	// Otherwise it stays unchanged as long as `desiredReplicas` does not change, so it is not reset when the workload cluster cannot be reached.
+	controlPlane.KCP.Status.UnavailableReplicas = desiredReplicas - controlPlane.KCP.Status.ReadyReplicas

// Return early if the deletion timestamp is set, because we don't want to try to connect to the workload cluster
// and we don't want to report resize condition (because it is set to deleting into reconcile delete).
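
To make the new arithmetic concrete, a small worked example with hypothetical values (illustrative only, not part of the diff):

package main

import "fmt"

func main() {
	desired := int32(3) // *controlPlane.KCP.Spec.Replicas

	// First reconcile: Status.ReadyReplicas still has its zero value,
	// so UnavailableReplicas starts out equal to desiredReplicas.
	ready := int32(0)
	fmt.Println(desired - ready) // 3

	// Later reconcile where GetWorkloadCluster fails: ReadyReplicas keeps
	// the last value that was successfully written (say 3), so the result
	// is unchanged instead of being reset to the full replica count.
	ready = 3
	fmt.Println(desired - ready) // 0
}
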
@@ -90,7 +91,7 @@ func (r *KubeadmControlPlaneReconciler) updateStatus(ctx context.Context, contro
return err
}
controlPlane.KCP.Status.ReadyReplicas = status.ReadyNodes
-	controlPlane.KCP.Status.UnavailableReplicas = replicas - status.ReadyNodes
+	controlPlane.KCP.Status.UnavailableReplicas = desiredReplicas - status.ReadyNodes

// This only gets initialized once and does not change if the kubeadm config map goes away.
if status.HasKubeadmConfig {
89 changes: 84 additions & 5 deletions controlplane/kubeadm/internal/controllers/status_test.go
@@ -54,7 +54,8 @@ func TestKubeadmControlPlaneReconciler_updateStatusNoMachines(t *testing.T) {
Name: "foo",
},
Spec: controlplanev1.KubeadmControlPlaneSpec{
Version: "v1.16.6",
Version: "v1.16.6",
Replicas: ptr.To[int32](1),
MachineTemplate: controlplanev1.KubeadmControlPlaneMachineTemplate{
InfrastructureRef: corev1.ObjectReference{
APIVersion: "test/v1alpha1",
@@ -89,7 +90,7 @@ func TestKubeadmControlPlaneReconciler_updateStatusNoMachines(t *testing.T) {
g.Expect(r.updateStatus(ctx, controlPlane)).To(Succeed())
g.Expect(kcp.Status.Replicas).To(BeEquivalentTo(0))
g.Expect(kcp.Status.ReadyReplicas).To(BeEquivalentTo(0))
-	g.Expect(kcp.Status.UnavailableReplicas).To(BeEquivalentTo(0))
+	g.Expect(kcp.Status.UnavailableReplicas).To(BeEquivalentTo(1))
g.Expect(kcp.Status.Initialized).To(BeFalse())
g.Expect(kcp.Status.Ready).To(BeFalse())
g.Expect(kcp.Status.Selector).NotTo(BeEmpty())
@@ -117,7 +118,8 @@ func TestKubeadmControlPlaneReconciler_updateStatusAllMachinesNotReady(t *testin
Name: "foo",
},
Spec: controlplanev1.KubeadmControlPlaneSpec{
-			Version: "v1.16.6",
+			Version:  "v1.16.6",
+			Replicas: ptr.To[int32](3),
MachineTemplate: controlplanev1.KubeadmControlPlaneMachineTemplate{
InfrastructureRef: corev1.ObjectReference{
APIVersion: "test/v1alpha1",
@@ -190,7 +192,8 @@ func TestKubeadmControlPlaneReconciler_updateStatusAllMachinesReady(t *testing.T
Name: "foo",
},
Spec: controlplanev1.KubeadmControlPlaneSpec{
-			Version: "v1.16.6",
+			Version:  "v1.16.6",
+			Replicas: ptr.To[int32](3),
MachineTemplate: controlplanev1.KubeadmControlPlaneMachineTemplate{
InfrastructureRef: corev1.ObjectReference{
APIVersion: "test/v1alpha1",
@@ -271,7 +274,8 @@ func TestKubeadmControlPlaneReconciler_updateStatusMachinesReadyMixed(t *testing
Name: "foo",
},
Spec: controlplanev1.KubeadmControlPlaneSpec{
-			Version: "v1.16.6",
+			Version:  "v1.16.6",
+			Replicas: ptr.To[int32](5),
MachineTemplate: controlplanev1.KubeadmControlPlaneMachineTemplate{
InfrastructureRef: corev1.ObjectReference{
APIVersion: "test/v1alpha1",
@@ -331,6 +335,81 @@ func TestKubeadmControlPlaneReconciler_updateStatusMachinesReadyMixed(t *testing
g.Expect(kcp.Status.Ready).To(BeTrue())
}

func TestKubeadmControlPlaneReconciler_updateStatusCannotGetWorkloadClusterStatus(t *testing.T) {
g := NewWithT(t)

cluster := &clusterv1.Cluster{
ObjectMeta: metav1.ObjectMeta{
Name: "foo",
Namespace: metav1.NamespaceDefault,
},
}

kcp := &controlplanev1.KubeadmControlPlane{
TypeMeta: metav1.TypeMeta{
Kind: "KubeadmControlPlane",
APIVersion: controlplanev1.GroupVersion.String(),
},
ObjectMeta: metav1.ObjectMeta{
Namespace: cluster.Namespace,
Name: "foo",
},
Spec: controlplanev1.KubeadmControlPlaneSpec{
Version: "v1.16.6",
Replicas: ptr.To[int32](3),
MachineTemplate: controlplanev1.KubeadmControlPlaneMachineTemplate{
InfrastructureRef: corev1.ObjectReference{
APIVersion: "test/v1alpha1",
Kind: "UnknownInfraMachine",
Name: "foo",
},
},
},
Status: controlplanev1.KubeadmControlPlaneStatus{
Ready: true,
Replicas: 3,
ReadyReplicas: 3,
UpdatedReplicas: 3,
UnavailableReplicas: 0,
},
}
webhook := &controlplanev1webhooks.KubeadmControlPlane{}
g.Expect(webhook.Default(ctx, kcp)).To(Succeed())
_, err := webhook.ValidateCreate(ctx, kcp)
g.Expect(err).ToNot(HaveOccurred())

machines := map[string]*clusterv1.Machine{}
objs := []client.Object{cluster.DeepCopy(), kcp.DeepCopy()}
for i := 0; i < 3; i++ {
name := fmt.Sprintf("test-%d", i)
m, n := createMachineNodePair(name, cluster, kcp, true)
objs = append(objs, n, m)
machines[m.Name] = m
}

fakeClient := newFakeClient(objs...)

r := &KubeadmControlPlaneReconciler{
Client: fakeClient,
managementCluster: &fakeManagementClusterWithGetWorkloadClusterError{},
recorder: record.NewFakeRecorder(32),
}

controlPlane := &internal.ControlPlane{
KCP: kcp,
Cluster: cluster,
Machines: machines,
}
controlPlane.InjectTestManagementCluster(r.managementCluster)

// When updateStatus() returns a non-nil error (e.g. when unable to get the workload cluster), the original kcp.Status should not be updated.
g.Expect(r.updateStatus(ctx, controlPlane)).To(HaveOccurred())
g.Expect(kcp.Status.Replicas).To(BeEquivalentTo(3))
g.Expect(kcp.Status.ReadyReplicas).To(BeEquivalentTo(3))
g.Expect(kcp.Status.UnavailableReplicas).To(BeEquivalentTo(0))
g.Expect(kcp.Status.Ready).To(BeTrue())
}
sbueringer (Member), Apr 5, 2024:
Let's add an additional call to updateStatus here:

  • drop one controlPlane.machines (or add another one)
  • call updateStatus
  • check that status.Replicas changed and the other ones stayed the same

jessehu (Contributor, Author):
Sure. Will add the case after the discussion is resolved: #10229 (comment)
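
For illustration, the follow-up sbueringer is asking for could be appended to the test above roughly like this (a hedged sketch: the exact expectations depend on which status fields updateStatus touches before the GetWorkloadCluster call fails):

	// Drop one machine and reconcile again: Status.Replicas should follow
	// the machine count, while the workload-cluster-derived fields stay frozen.
	delete(controlPlane.Machines, "test-0")
	g.Expect(r.updateStatus(ctx, controlPlane)).To(HaveOccurred())
	g.Expect(kcp.Status.Replicas).To(BeEquivalentTo(2))
	g.Expect(kcp.Status.ReadyReplicas).To(BeEquivalentTo(3))
	g.Expect(kcp.Status.UnavailableReplicas).To(BeEquivalentTo(0))
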


func TestKubeadmControlPlaneReconciler_machinesCreatedIsIsTrueEvenWhenTheNodesAreNotReady(t *testing.T) {
g := NewWithT(t)

Expand Down
13 changes: 10 additions & 3 deletions internal/controllers/machineset/machineset_controller.go
@@ -183,6 +183,10 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Re
// Requeue if the reconcile failed because the ClusterCacheTracker was locked for
// the current cluster because of concurrent access.
if errors.Is(err, remote.ErrClusterLocked) {
+		if aggr, ok := err.(kerrors.Aggregate); ok && len(aggr.Errors()) > 1 {
+			// Print the errors if it's not only ErrClusterLocked.
+			log.Info(aggr.Error())
+		}
log.V(5).Info("Requeuing because another worker has the lock on the ClusterCacheTracker")
return ctrl.Result{RequeueAfter: time.Minute}, nil
}
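
Context for the guard above: errors.Is on a kerrors.Aggregate reports a match if any wrapped error matches, so this branch is taken even when other errors are bundled alongside ErrClusterLocked, and without the extra logging those would be hidden behind the V(5) requeue message. A self-contained illustration (the sentinel and messages are stand-ins, not the real remote.ErrClusterLocked):

package main

import (
	"errors"
	"fmt"

	kerrors "k8s.io/apimachinery/pkg/util/errors"
)

var errClusterLocked = errors.New("cluster is locked already")

func main() {
	var err error = kerrors.NewAggregate([]error{
		errClusterLocked,
		errors.New("failed to patch machine"),
	})

	// Matches because at least one wrapped error is the sentinel.
	fmt.Println(errors.Is(err, errClusterLocked)) // true

	// The same shape as the patch: surface the aggregate when it
	// carries more than just the lock error.
	if aggr, ok := err.(kerrors.Aggregate); ok && len(aggr.Errors()) > 1 {
		fmt.Println(aggr.Error())
	}
}
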
@@ -844,7 +848,8 @@ func (r *Reconciler) shouldAdopt(ms *clusterv1.MachineSet) bool {
}

// updateStatus updates the Status field for the MachineSet
-// It checks for the current state of the replicas and updates the Status of the MachineSet.
+// It checks for the current state of the replicas and updates the Status field of the MachineSet.
+// When unable to retrieve the Node status, it returns an error and does not update the Status field of the MachineSet.
func (r *Reconciler) updateStatus(ctx context.Context, cluster *clusterv1.Cluster, ms *clusterv1.MachineSet, filteredMachines []*clusterv1.Machine) error {
sbueringer (Member), Apr 9, 2024:
Here is a proposed solution to the ErrClusterLocked issue (the one Fabrizio was referring to):

	// Retry getting a remote client.
	// Note: This is to ensure that we don't run into errors here just 
	// because multiple reconcilers try to create the client at the same time
	// (for the case where the workload cluster is reachable).
	var canCommunicateWithWorkloadCluster bool
	var remoteClient client.Client
	err = retry.OnError(wait.Backoff{
		Steps:    5,
		Duration: 200 * time.Millisecond,
		Factor:   1.0,
	}, func(err error) bool {
		// Retry as long as we get remote.ErrClusterLocked errors. 
		return errors.Is(err, remote.ErrClusterLocked)
	}, func() error {
		remoteClient, err = r.Tracker.GetClient(ctx, util.ObjectKey(cluster))
		if err != nil {
			return err
		}
		canCommunicateWithWorkloadCluster = true
		return nil
	})
	if err != nil {
		log.Error(err, "Unable to retrieve status of Nodes: failed to create remote client")
	}

	for _, machine := range filteredMachines {
		log := log.WithValues("Machine", klog.KObj(machine))

		if templateLabel.Matches(labels.Set(machine.Labels)) {
			fullyLabeledReplicasCount++
		}

		if machine.Status.NodeRef == nil {
			log.V(4).Info("Waiting for the machine controller to set status.nodeRef on the Machine")
			continue
		}

		if !canCommunicateWithWorkloadCluster {
			// Skip the rest of the for loop if we can't communicate with the workload cluster.
			continue
		}

		node, err := r.getMachineNode(ctx, remoteClient, machine)
		if err != nil && machine.GetDeletionTimestamp().IsZero() {
			log.Error(err, "Unable to retrieve Node status", "Node", klog.KObj(node))
			continue
		}

		if noderefutil.IsNodeReady(node) {
			readyReplicasCount++
			if noderefutil.IsNodeAvailable(node, ms.Spec.MinReadySeconds, metav1.Now()) {
				availableReplicasCount++
			}
		} else if machine.GetDeletionTimestamp().IsZero() {
			log.V(4).Info("Waiting for the Kubernetes node on the machine to report ready state")
		}
	}

sbueringer (Member):
The idea is that when the workload cluster is reachable, we only get ErrClusterLocked for a very short amount of time (the time it takes to create a client). For this case it is good enough to simply retry creating the client.

We will fallback to the previous behavior only if the workload cluster is actually not reachable.

jessehu (Contributor, Author), Apr 10, 2024:
Cool! I'm not quite sure it's worth adding this bulk of code to resolve a temporary inconsistency in some status replica and condition fields caused by the ErrClusterLocked error. The simple change in the current PR solves the problem; the remaining inconsistency is acceptable and won't cause issues per my thinking (maybe I missed some cases).
cc @fabriziopandini for awareness.

fabriziopandini (Member):
@sbueringer and I are in total agreement, and we are trying to help refocus the PR on the original issue. We also took some additional steps to help find a way forward by proposing an alternative solution.

With regard to the current PR, I have already tried to explain my concern, and I will be happy to add more if you have any doubts (I already answered one).

But given the concern above, I'm personally -1 to merge the PR in its current state.

Instead, we both think the change proposed by @sbueringer solves the original issues, but ultimately it is up to you to accept it or not.

jessehu (Contributor, Author), Apr 11, 2024:
Thanks @fabriziopandini for the details. Here are my two cents:

  1. Should this new if block return err instead of continue, since canCommunicateWithWorkloadCluster won't change inside the for loop?

		if !canCommunicateWithWorkloadCluster {
			// Skip the rest of the for loop if we can't communicate with the workload cluster.
			continue
		}

  2. There are many places that return ErrClusterLocked (not only in the MS controller and KCP controller). If we want this retry logic, shall we add it in the other places as well, or just here to resolve the original status-updating issue?
  3. The current PR code returns err in the for loop in updateStatus() in the MS controller when hitting ErrClusterLocked or any other error. In that case it won't update MS.Status, because the status update code comes after the for loop, at line 903: newStatus.Replicas = int32(len(filteredMachines)).
     However, the KCP controller is different: it already updates some status fields before hitting ErrClusterLocked.

machineset_controller.go:

 		if err != nil && machine.GetDeletionTimestamp().IsZero() {
-			log.Error(err, "Unable to retrieve Node status", "node", klog.KObj(node))
-			continue
+			return errors.Wrapf(err, "unable to retrieve the status of Node %s", klog.KObj(node))
 		}

jessehu (Contributor, Author):
One more thing: I'm not sure that canCommunicateWithWorkloadCluster == true can 100% guarantee getMachineNode() won't hit ErrClusterLocked, since they are two sequential function calls, not one atomic call.

sbueringer (Member):
> Should this new if block return err instead of continue, since canCommunicateWithWorkloadCluster won't change inside the for loop?

Our main concern is basically that if we return an error here as soon as we hit ErrClusterLocked we don't update the status at all anymore. This should be okay in the happy path, which is that we can actually communicate with the workload cluster and it's just a matter of waiting until another goroutine successfully created the client so we can use it. The problem we see is if the workload cluster is actually not reachable. Because in that case we will just continuously return the error forever. In this case we "freeze" the values of status: FullyLabeledReplicas, ReadyReplicas, AvailableReplicas, ObservedGeneration and a few conditions.

The concern Fabrizio raised (and I didn't have on my radar before) is that if we freeze the status indefinitely in these cases (or rather until the workload cluster is reachable) this can be pretty confusing for users. So for this case we should actually have a more extensive solution which also covers signalling to users that we can't communicate with the workload cluster anymore.

What we were trying to suggest was a mitigation for the happy path, where we actually can talk to the workload cluster but replicas are flipping only because of the way we initially create the client for a cluster.

> There are many places that return ErrClusterLocked (not only in the MS controller and KCP controller). If we want this retry logic, shall we add it in the other places as well, or just here to resolve the original status-updating issue?

I'm not really sure about this one. Basically, we introduced the current "TryLock" over a "Lock" to make sure we don't block all reconciles for a cluster when its workload cluster is not reachable. The "Lock" we had before led to really problematic performance issues when some workload clusters (with a lot of Machines) were unreachable.
So I think we should be careful with introducing these retries, as they can lead to serious performance degradation (just by introducing the retry here, every reconcile of a MachineSet of an unreachable cluster would take 1 second).
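
For readers following along, the TryLock-vs-Lock trade-off sbueringer describes can be sketched with Go's sync.Mutex (purely illustrative; the real ClusterCacheTracker logic is more involved):

package main

import (
	"errors"
	"fmt"
	"sync"
)

var errClusterLocked = errors.New("cluster is locked already") // stand-in sentinel

var mu sync.Mutex // stands in for the tracker's per-cluster lock

// getClient fails fast instead of blocking: if another worker is already
// creating the client (possibly hanging on an unreachable cluster), the
// caller gets a sentinel error and can requeue rather than pile up.
func getClient() error {
	if !mu.TryLock() {
		return errClusterLocked
	}
	defer mu.Unlock()
	// ... create and cache the remote client here ...
	return nil
}

func main() {
	mu.Lock() // simulate another worker holding the lock
	defer mu.Unlock()
	fmt.Println(getClient()) // cluster is locked already
}
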

log := ctrl.LoggerFrom(ctx)
newStatus := ms.Status.DeepCopy()
@@ -882,8 +887,7 @@ func (r *Reconciler) updateStatus(ctx context.Context, cluster *clusterv1.Cluste

node, err := r.getMachineNode(ctx, cluster, machine)
if err != nil && machine.GetDeletionTimestamp().IsZero() {
-			log.Error(err, "Unable to retrieve Node status", "node", klog.KObj(node))
-			continue
+			return errors.Wrapf(err, "unable to retrieve the status of Node %s", klog.KObj(node))
fabriziopandini (Member), Apr 9, 2024:
According to the issue:

> We probably also want to have a timeout on this behaviour. If we haven't seen the Node in x time then we assume its status is unknown

But if I'm not wrong, with the current implementation we are going to freeze the replica count indefinitely, which could be confusing or eventually also wrong, given that the number of replicas/machines might change in the meantime.

Frankly speaking, in order to properly fix this issue I think we first need to figure out how to get/store the "last seen" information for each node.

Given that info, we can decide whether it is ok to flip the replica status or to wait (but I don't have a concrete idea of how to do so; I need some time to research it).
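
One possible shape of that "last seen" bookkeeping, purely illustrative (the type, field names, and grace period are made up; this is not a design the thread settled on):

package main

import (
	"fmt"
	"time"
)

// nodeObservations remembers when each Node was last successfully read,
// so a controller could keep reporting cached readiness for a bounded
// time instead of freezing the replica counts indefinitely.
type nodeObservations struct {
	lastSeen map[string]time.Time
	grace    time.Duration
}

func (o *nodeObservations) markSeen(name string, now time.Time) {
	o.lastSeen[name] = now
}

// trustCached reports whether the cached status for a Node is still
// usable when the workload cluster cannot be reached right now.
func (o *nodeObservations) trustCached(name string, now time.Time) bool {
	seen, ok := o.lastSeen[name]
	return ok && now.Sub(seen) < o.grace
}

func main() {
	o := &nodeObservations{lastSeen: map[string]time.Time{}, grace: 5 * time.Minute}
	now := time.Now()
	o.markSeen("node-1", now)
	fmt.Println(o.trustCached("node-1", now.Add(2*time.Minute)))  // true
	fmt.Println(o.trustCached("node-1", now.Add(10*time.Minute))) // false: status unknown
}
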

sbueringer (Member), Apr 9, 2024:
Important point that I discussed with Fabrizio. We're not only freezing the replica fields, we would also freeze other status fields like ObservedGeneration and conditions.

}

if noderefutil.IsNodeReady(node) {
@@ -956,6 +960,9 @@ func (r *Reconciler) getMachineNode(ctx context.Context, cluster *clusterv1.Clus
}
node := &corev1.Node{}
if err := remoteClient.Get(ctx, client.ObjectKey{Name: machine.Status.NodeRef.Name}, node); err != nil {
+		if apierrors.IsNotFound(err) {
fabriziopandini (Member), Apr 9, 2024:
I'm not sure why we are ignoring not found.
Is that for the case when someone manually deletes a node? If yes, please add a comment (but it also seems unrelated to the issue we are trying to fix).

sbueringer (Member), Apr 9, 2024:
Just for clarification.

The idea behind this one was to signal to the calling function that the node doesn't exist (vs. it just being an error), because not finding the Node is also valuable information (which basically comes down to the node not being ready).

This was added to preserve the previous behavior one level above (where we only logged the error before but still considered a not-found node a node that is not ready).

But yeah, it becomes pretty hard to understand.

+			return nil, nil
+		}
return nil, errors.Wrapf(err, "error retrieving node %s for machine %s/%s", machine.Status.NodeRef.Name, machine.Namespace, machine.Name)
}
return node, nil
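
To make the caller-side effect concrete: returning (nil, nil) for a not-found Node means the loop in updateStatus treats the machine as simply not ready rather than aborting the whole status update. A simplified sketch (isNodeReady is a stand-in for noderefutil.IsNodeReady, which treats a nil Node as not ready):

package main

import "fmt"

type node struct{ ready bool }

// isNodeReady mirrors the behavior relevant here: a nil Node is never ready.
func isNodeReady(n *node) bool { return n != nil && n.ready }

func main() {
	var deleted *node // getMachineNode returned (nil, nil): the Node is gone
	fmt.Println(isNodeReady(deleted)) // false -> counted as not ready, no error

	existing := &node{ready: true}
	fmt.Println(isNodeReady(existing)) // true -> counted in readyReplicas
}
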