Skip to content

Commit

Permalink
Add v1beta2 available condition to KCP
Browse files Browse the repository at this point in the history
# Conflicts:
#	controlplane/kubeadm/api/v1beta1/v1beta2_condition_consts.go
#	controlplane/kubeadm/internal/controllers/status.go
#	controlplane/kubeadm/internal/controllers/status_test.go
  • Loading branch information
fabriziopandini committed Nov 7, 2024
1 parent bf61b1e commit 5c3b51f
Show file tree
Hide file tree
Showing 6 changed files with 892 additions and 32 deletions.
24 changes: 21 additions & 3 deletions controlplane/kubeadm/api/v1beta1/v1beta2_condition_consts.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,32 @@ import clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"

// KubeadmControlPlane's Available condition and corresponding reasons that will be used in v1Beta2 API version.
const (
// KubeadmControlPlaneAvailableV1Beta2Condition is True if the control plane can be reached, EtcdClusterHealthy is true,
// and CertificatesAvailable is true.
// KubeadmControlPlaneAvailableV1Beta2Condition is true if KubeadmControlPlane is not deleted, `CertificatesAvailable` is true,
// at least one Machine with healthy control plane components, and etcd has enough operational members to meet quorum requirements.
// More specifically, considering how kubeadm layouts components:
// - Kubernetes API server, scheduler and controller manager health is inferred by the status of
// the corresponding Pods hosted on each machine.
// - In case of managed etcd, also a healthy etcd Pod and a healthy etcd member must exist on the same
// machine with the healthy Kubernetes API server, scheduler and controller manager, otherwise the k8s control
// plane cannot be considered operational (if etcd is not operational on a machine, most likely also API server,
// scheduler and controller manager on the same machine will be impacted).
// - In case of external etcd, KCP cannot make any assumption on etcd status, so all the etcd checks are skipped.
KubeadmControlPlaneAvailableV1Beta2Condition = clusterv1.AvailableV1Beta2Condition

// KubeadmControlPlaneAvailableInspectionFailedV1Beta2Reason documents a failure when inspecting the status of the
// etcd cluster hosted on KubeadmControlPlane controlled machines.
KubeadmControlPlaneAvailableInspectionFailedV1Beta2Reason = clusterv1.InspectionFailedV1Beta2Reason

// KubeadmControlPlaneAvailableV1Beta2Reason surfaces when the KubeadmControlPlane is available.
KubeadmControlPlaneAvailableV1Beta2Reason = clusterv1.AvailableV1Beta2Reason

// KubeadmControlPlaneNotAvailableV1Beta2Reason surfaces when the KubeadmControlPlane is not available.
KubeadmControlPlaneNotAvailableV1Beta2Reason = clusterv1.NotAvailableV1Beta2Reason
)

// KubeadmControlPlane's Initialized condition and corresponding reasons that will be used in v1Beta2 API version.
const (
// KubeadmControlPlaneInitializedV1Beta2Condition is True when the control plane is functional enough to accept
// KubeadmControlPlaneInitializedV1Beta2Condition is true when the control plane is functional enough to accept
// requests. This information is usually used as a signal for starting all the provisioning operations that
// depend on a functional API server, but do not require a full HA control plane to exist.
KubeadmControlPlaneInitializedV1Beta2Condition = "Initialized"
Expand Down
11 changes: 11 additions & 0 deletions controlplane/kubeadm/internal/control_plane.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ import (
bootstrapv1 "sigs.k8s.io/cluster-api/bootstrap/kubeadm/api/v1beta1"
"sigs.k8s.io/cluster-api/controllers/external"
controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1"
"sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/etcd"
"sigs.k8s.io/cluster-api/util/collections"
"sigs.k8s.io/cluster-api/util/failuredomains"
"sigs.k8s.io/cluster-api/util/patch"
Expand Down Expand Up @@ -58,6 +59,16 @@ type ControlPlane struct {
KubeadmConfigs map[string]*bootstrapv1.KubeadmConfig
InfraResources map[string]*unstructured.Unstructured

// EtcdMembers is the list of members read while computing reconcileControlPlaneConditions; also additional info below
// comes from the same func.
// NOTE: Those info are computed based on the info KCP was able to collect during inspection (e.g. if on a 3 CP
// control plane one etcd member is down, those info are based on the answer collected from two members only).
// NOTE: Those info are specifically designed for computing KCP's Available condition.
EtcdMembers []*etcd.Member
EtcdMembersAgreeOnMemberList bool
EtcdMembersAgreeOnClusterID bool
EtcdMembersAndMachinesAreMatching bool

managementCluster ManagementCluster
workloadCluster WorkloadCluster

Expand Down
138 changes: 137 additions & 1 deletion controlplane/kubeadm/internal/controllers/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ import (
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1"
"sigs.k8s.io/cluster-api/controlplane/kubeadm/internal"
"sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/etcd"
"sigs.k8s.io/cluster-api/util/collections"
"sigs.k8s.io/cluster-api/util/conditions"
v1beta2conditions "sigs.k8s.io/cluster-api/util/conditions/v1beta2"
Expand Down Expand Up @@ -168,7 +169,7 @@ func (r *KubeadmControlPlaneReconciler) updateV1Beta2Status(ctx context.Context,
setMachinesUpToDateCondition(ctx, controlPlane.KCP, controlPlane.Machines)
setRemediatingCondition(ctx, controlPlane.KCP, controlPlane.MachinesToBeRemediatedByKCP(), controlPlane.UnhealthyMachines())
setDeletingCondition(ctx, controlPlane.KCP, controlPlane.DeletingReason, controlPlane.DeletingMessage)
// TODO: Available
setAvailableCondition(ctx, controlPlane.KCP, controlPlane.IsEtcdManaged(), controlPlane.EtcdMembers, controlPlane.EtcdMembersAgreeOnMemberList, controlPlane.EtcdMembersAgreeOnClusterID, controlPlane.EtcdMembersAndMachinesAreMatching, controlPlane.Machines)
}

func setReplicas(_ context.Context, kcp *controlplanev1.KubeadmControlPlane, machines collections.Machines) {
Expand Down Expand Up @@ -441,6 +442,141 @@ func setDeletingCondition(_ context.Context, kcp *controlplanev1.KubeadmControlP
})
}

func setAvailableCondition(_ context.Context, kcp *controlplanev1.KubeadmControlPlane, etcdIsManaged bool, etcdMembers []*etcd.Member, etcdMembersAgreeOnMemberList, etcdMembersAgreeOnClusterID, etcdMembersAndMachinesAreMatching bool, machines collections.Machines) {
if !kcp.Status.Initialized {
v1beta2conditions.Set(kcp, metav1.Condition{
Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason,
Message: "Control plane not yet initialized",
})
return
}

if etcdIsManaged {
if etcdMembers == nil {
v1beta2conditions.Set(kcp, metav1.Condition{
Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition,
Status: metav1.ConditionUnknown,
Reason: controlplanev1.KubeadmControlPlaneAvailableInspectionFailedV1Beta2Reason,
Message: "Failed to get etcd members",
})
return
}

if !etcdMembersAgreeOnMemberList {
v1beta2conditions.Set(kcp, metav1.Condition{
Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason,
Message: "At least one etcd member reports a list of etcd members different than the list reported by other members",
})
return
}

if !etcdMembersAgreeOnClusterID {
v1beta2conditions.Set(kcp, metav1.Condition{
Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason,
Message: "At least one etcd member reports a cluster ID different than the cluster ID reported by other members",
})
return
}

if !etcdMembersAndMachinesAreMatching {
v1beta2conditions.Set(kcp, metav1.Condition{
Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason,
Message: "The list of etcd members does not match the list of Machines and Nodes",
})
return
}
}

// Determine control plane availability looking at machines conditions, which at this stage are
// already surfacing status from etcd member and all control plane pods hosted on every machine.
// Note: we intentionally use the number of etcd members to determine the etcd quorum because
// etcd members might not match with machines, e.g. while provisioning a new machine.
etcdQuorum := (len(etcdMembers) / 2.0) + 1
k8sControlPlaneHealthy := 0
etcdMembersHealthy := 0
for _, machine := range machines {
// if external etcd, only look at the status of the K8s control plane components on this machine.
if !etcdIsManaged {
if v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineAPIServerPodHealthyV1Beta2Condition) &&
v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineControllerManagerPodHealthyV1Beta2Condition) &&
v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineSchedulerPodHealthyV1Beta2Condition) {
k8sControlPlaneHealthy++
}
continue
}

// Otherwise, etcd is managed.
// In this case, when looking at the k8s control plane we should consider how kubeadm layouts control plane components,
// and more specifically:
// - API server on one machine only connect to the local etcd member
// - ControllerManager and scheduler on a machine connect to the local API server (not to the control plane endpoint)
// As a consequence, we consider the K8s control plane on this machine healthy only if everything is healthy.

if v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition) {
etcdMembersHealthy++
}

if v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineAPIServerPodHealthyV1Beta2Condition) &&
v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineControllerManagerPodHealthyV1Beta2Condition) &&
v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineSchedulerPodHealthyV1Beta2Condition) &&
v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition) &&
v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyV1Beta2Condition) {
k8sControlPlaneHealthy++
}
}

if kcp.DeletionTimestamp.IsZero() &&
(!etcdIsManaged || etcdMembersHealthy >= etcdQuorum) &&
k8sControlPlaneHealthy >= 1 &&
v1beta2conditions.IsTrue(kcp, controlplanev1.KubeadmControlPlaneCertificatesAvailableV1Beta2Condition) {
v1beta2conditions.Set(kcp, metav1.Condition{
Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition,
Status: metav1.ConditionTrue,
Reason: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Reason,
})
return
}

messages := []string{}
if !kcp.DeletionTimestamp.IsZero() {
messages = append(messages, "Control plane metadata.deletionTimestamp is set")
}

if !v1beta2conditions.IsTrue(kcp, controlplanev1.KubeadmControlPlaneCertificatesAvailableV1Beta2Condition) {
messages = append(messages, "Control plane certificates are not available")
}

if etcdIsManaged && etcdMembersHealthy < etcdQuorum {
switch etcdMembersHealthy {
case 0:
messages = append(messages, fmt.Sprintf("There are no healthy etcd member, at least %d required for etcd quorum", etcdQuorum))
case 1:
messages = append(messages, fmt.Sprintf("There is 1 healthy etcd member, at least %d required for etcd quorum", etcdQuorum))
default:
messages = append(messages, fmt.Sprintf("There are %d healthy etcd members, at least %d required for etcd quorum", etcdMembersHealthy, etcdQuorum))
}
}

if k8sControlPlaneHealthy < 1 {
messages = append(messages, "There are no Machines with healthy control plane components, at least 1 required")
}

v1beta2conditions.Set(kcp, metav1.Condition{
Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason,
Message: strings.Join(messages, ";"),
})
}

func aggregateStaleMachines(machines collections.Machines) string {
if len(machines) == 0 {
return ""
Expand Down
Loading

0 comments on commit 5c3b51f

Please sign in to comment.