Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

✨ Add machine UpToDate condition to KCP #11389

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions api/v1beta1/machine_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,15 @@ const (
const (
	// MachineUpToDateV1Beta2Condition is true if the Machine spec matches the spec of the Machine's owner resource, e.g. KubeadmControlPlane or MachineDeployment.
	// The Machine's owner (e.g. MachineDeployment) is authoritative to set their owned Machine's UpToDate conditions based on its current spec.
	// NOTE: The Machine's owner might use this condition to surface also other use cases when Machine is considered not up to date, e.g. when MachineDeployment spec.rolloutAfter
	// is expired and the Machine needs to be rolled out.
fabriziopandini marked this conversation as resolved.
Show resolved Hide resolved
	MachineUpToDateV1Beta2Condition = "UpToDate"

	// MachineUpToDateV1Beta2Reason surfaces when a Machine spec matches the spec of the Machine's owner resource, e.g. KubeadmControlPlane or MachineDeployment.
	MachineUpToDateV1Beta2Reason = "UpToDate"

	// MachineNotUpToDateV1Beta2Reason surfaces when a Machine spec does not match the spec of the Machine's owner resource, e.g. KubeadmControlPlane or MachineDeployment.
	MachineNotUpToDateV1Beta2Reason = "NotUpToDate"
)

// Machine's BootstrapConfigReady condition and corresponding reasons that will be used in v1Beta2 API version.
Expand Down
90 changes: 46 additions & 44 deletions controlplane/kubeadm/internal/control_plane.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,10 @@ type ControlPlane struct {
Machines collections.Machines
machinesPatchHelpers map[string]*patch.Helper

machinesNotUptoDate collections.Machines
machinesNotUptoDateLogMessages map[string][]string
machinesNotUptoDateConditionMessages map[string][]string

// reconciliationTime is the time of the current reconciliation, and should be used for all "now" calculations
reconciliationTime metav1.Time

Expand Down Expand Up @@ -97,15 +101,35 @@ func NewControlPlane(ctx context.Context, managementCluster ManagementCluster, c
patchHelpers[machine.Name] = patchHelper
}

// Select machines that should be rolled out because of an outdated configuration or because rolloutAfter/Before expired.
reconciliationTime := metav1.Now()
sbueringer marked this conversation as resolved.
Show resolved Hide resolved
machinesNotUptoDate := make(collections.Machines, len(ownedMachines))
machinesNotUptoDateLogMessages := map[string][]string{}
machinesNotUptoDateConditionMessages := map[string][]string{}
for _, m := range ownedMachines {
upToDate, logMessages, conditionMessages, err := UpToDate(m, kcp, &reconciliationTime, infraObjects, kubeadmConfigs)
if err != nil {
return nil, err
}
if !upToDate {
machinesNotUptoDate.Insert(m)
machinesNotUptoDateLogMessages[m.Name] = logMessages
machinesNotUptoDateConditionMessages[m.Name] = conditionMessages
}
}

return &ControlPlane{
KCP: kcp,
Cluster: cluster,
Machines: ownedMachines,
machinesPatchHelpers: patchHelpers,
KubeadmConfigs: kubeadmConfigs,
InfraResources: infraObjects,
reconciliationTime: metav1.Now(),
managementCluster: managementCluster,
KCP: kcp,
Cluster: cluster,
Machines: ownedMachines,
machinesPatchHelpers: patchHelpers,
machinesNotUptoDate: machinesNotUptoDate,
machinesNotUptoDateLogMessages: machinesNotUptoDateLogMessages,
machinesNotUptoDateConditionMessages: machinesNotUptoDateConditionMessages,
KubeadmConfigs: kubeadmConfigs,
InfraResources: infraObjects,
reconciliationTime: reconciliationTime,
managementCluster: managementCluster,
}, nil
}

Expand Down Expand Up @@ -152,16 +176,12 @@ func (c *ControlPlane) FailureDomainWithMostMachines(ctx context.Context, machin
return failuredomains.PickMost(ctx, c.Cluster.Status.FailureDomains.FilterControlPlane(), c.Machines, machines)
}

// NextFailureDomainForScaleUp returns the failure domain with the fewest number of up-to-date machines.
// NextFailureDomainForScaleUp returns the failure domain with the fewest number of up-to-date, not deleted machines.
func (c *ControlPlane) NextFailureDomainForScaleUp(ctx context.Context) (*string, error) {
if len(c.Cluster.Status.FailureDomains.FilterControlPlane()) == 0 {
return nil, nil
}
upToDateMachines, err := c.UpToDateMachines()
if err != nil {
return nil, errors.Wrapf(err, "failed to determine next failure domain for scale up")
}
return failuredomains.PickFewest(ctx, c.FailureDomains().FilterControlPlane(), upToDateMachines), nil
return failuredomains.PickFewest(ctx, c.FailureDomains().FilterControlPlane(), c.UpToDateMachines().Filter(collections.Not(collections.HasDeletionTimestamp))), nil
}

// InitialControlPlaneConfig returns a new KubeadmConfigSpec that is to be used for an initializing control plane.
Expand Down Expand Up @@ -198,40 +218,21 @@ func (c *ControlPlane) GetKubeadmConfig(machineName string) (*bootstrapv1.Kubead
}

// MachinesNeedingRollout return a list of machines that need to be rolled out.
func (c *ControlPlane) MachinesNeedingRollout() (collections.Machines, map[string]string, error) {
// Ignore machines to be deleted.
machines := c.Machines.Filter(collections.Not(collections.HasDeletionTimestamp))
// MachinesNeedingRollout returns the machines that must be rolled out, together with a map
// from machine name to the log messages explaining why each machine is not up to date.
func (c *ControlPlane) MachinesNeedingRollout() (collections.Machines, map[string][]string) {
	// Note: Machines already deleted are dropped because they will be replaced by new machines after deletion completes.
	notDeleted := c.machinesNotUptoDate.Filter(collections.Not(collections.HasDeletionTimestamp))
	return notDeleted, c.machinesNotUptoDateLogMessages
}

// Return machines if they are scheduled for rollout or if with an outdated configuration.
machinesNeedingRollout := make(collections.Machines, len(machines))
rolloutReasons := map[string]string{}
for _, m := range machines {
reason, needsRollout, err := NeedsRollout(&c.reconciliationTime, c.KCP.Spec.RolloutAfter, c.KCP.Spec.RolloutBefore, c.InfraResources, c.KubeadmConfigs, c.KCP, m)
if err != nil {
return nil, nil, err
}
if needsRollout {
machinesNeedingRollout.Insert(m)
rolloutReasons[m.Name] = reason
}
}
return machinesNeedingRollout, rolloutReasons, nil
// NotUpToDateMachines returns a list of machines that are not up to date with the control
// plane's configuration.
// The second return value maps each not-up-to-date machine's name to the condition messages
// describing why that machine is considered outdated.
func (c *ControlPlane) NotUpToDateMachines() (collections.Machines, map[string][]string) {
	return c.machinesNotUptoDate, c.machinesNotUptoDateConditionMessages
}

// UpToDateMachines returns the machines that are up to date with the control
// plane's configuration and therefore do not require rollout.
func (c *ControlPlane) UpToDateMachines() (collections.Machines, error) {
upToDateMachines := make(collections.Machines, len(c.Machines))
for _, m := range c.Machines {
_, needsRollout, err := NeedsRollout(&c.reconciliationTime, c.KCP.Spec.RolloutAfter, c.KCP.Spec.RolloutBefore, c.InfraResources, c.KubeadmConfigs, c.KCP, m)
if err != nil {
return nil, err
}
if !needsRollout {
upToDateMachines.Insert(m)
}
}
return upToDateMachines, nil
// plane's configuration.
func (c *ControlPlane) UpToDateMachines() collections.Machines {
	// Up to date = all owned machines minus the ones flagged as not up to date when the
	// ControlPlane was built. Note: machines being deleted are NOT filtered out here;
	// callers that need only live machines must filter on deletion timestamp themselves.
	return c.Machines.Difference(c.machinesNotUptoDate)
}

// getInfraResources fetches the external infrastructure resource for each machine in the collection and returns a map of machine.Name -> infraResource.
Expand Down Expand Up @@ -316,6 +317,7 @@ func (c *ControlPlane) PatchMachines(ctx context.Context) error {
controlplanev1.MachineEtcdPodHealthyCondition,
controlplanev1.MachineEtcdMemberHealthyCondition,
}}, patch.WithOwnedV1Beta2Conditions{Conditions: []string{
clusterv1.MachineUpToDateV1Beta2Condition,
controlplanev1.KubeadmControlPlaneMachineAPIServerPodHealthyV1Beta2Condition,
controlplanev1.KubeadmControlPlaneMachineControllerManagerPodHealthyV1Beta2Condition,
controlplanev1.KubeadmControlPlaneMachineSchedulerPodHealthyV1Beta2Condition,
Expand Down
77 changes: 75 additions & 2 deletions controlplane/kubeadm/internal/control_plane_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
. "github.com/onsi/gomega"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/utils/ptr"

clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1"
Expand All @@ -30,8 +31,6 @@ import (
)

func TestControlPlane(t *testing.T) {
g := NewWithT(t)

t.Run("Failure domains", func(t *testing.T) {
controlPlane := &ControlPlane{
KCP: &controlplanev1.KubeadmControlPlane{},
Expand All @@ -53,14 +52,88 @@ func TestControlPlane(t *testing.T) {
}

t.Run("With all machines in known failure domain, should return the FD with most number of machines", func(*testing.T) {
g := NewWithT(t)
g.Expect(*controlPlane.FailureDomainWithMostMachines(ctx, controlPlane.Machines)).To(Equal("two"))
})

t.Run("With some machines in non defined failure domains", func(*testing.T) {
g := NewWithT(t)
controlPlane.Machines.Insert(machine("machine-5", withFailureDomain("unknown")))
g.Expect(*controlPlane.FailureDomainWithMostMachines(ctx, controlPlane.Machines)).To(Equal("unknown"))
})
})

t.Run("MachinesUpToDate", func(t *testing.T) {
g := NewWithT(t)
cluster := &clusterv1.Cluster{
Status: clusterv1.ClusterStatus{
FailureDomains: clusterv1.FailureDomains{
"one": failureDomain(true),
"two": failureDomain(true),
"three": failureDomain(true),
},
},
}
kcp := &controlplanev1.KubeadmControlPlane{
Spec: controlplanev1.KubeadmControlPlaneSpec{
Version: "v1.31.0",
},
}
machines := collections.Machines{
"machine-1": &clusterv1.Machine{
ObjectMeta: metav1.ObjectMeta{Name: "m1"},
Spec: clusterv1.MachineSpec{
Version: ptr.To("v1.31.0"),
FailureDomain: ptr.To("one"),
InfrastructureRef: corev1.ObjectReference{Kind: "GenericInfrastructureMachine", APIVersion: "infrastructure.cluster.x-k8s.io/v1beta1", Name: "m1"},
}},
"machine-2": &clusterv1.Machine{
ObjectMeta: metav1.ObjectMeta{Name: "m2"},
Spec: clusterv1.MachineSpec{
Version: ptr.To("v1.29.0"), // not up-to-date
FailureDomain: ptr.To("two"),
InfrastructureRef: corev1.ObjectReference{Kind: "GenericInfrastructureMachine", APIVersion: "infrastructure.cluster.x-k8s.io/v1beta1", Name: "m2"},
}},
"machine-3": &clusterv1.Machine{
ObjectMeta: metav1.ObjectMeta{Name: "m3", DeletionTimestamp: ptr.To(metav1.Now())}, // deleted
Spec: clusterv1.MachineSpec{
Version: ptr.To("v1.29.3"), // not up-to-date
FailureDomain: ptr.To("three"),
InfrastructureRef: corev1.ObjectReference{Kind: "GenericInfrastructureMachine", APIVersion: "infrastructure.cluster.x-k8s.io/v1beta1", Name: "m3"},
}},
"machine-4": &clusterv1.Machine{
ObjectMeta: metav1.ObjectMeta{Name: "m4", DeletionTimestamp: ptr.To(metav1.Now())}, // deleted
Spec: clusterv1.MachineSpec{
Version: ptr.To("v1.31.0"),
FailureDomain: ptr.To("two"),
InfrastructureRef: corev1.ObjectReference{Kind: "GenericInfrastructureMachine", APIVersion: "infrastructure.cluster.x-k8s.io/v1beta1", Name: "m1"},
}},
}
controlPlane, err := NewControlPlane(ctx, nil, env.GetClient(), cluster, kcp, machines)
g.Expect(err).NotTo(HaveOccurred())

g.Expect(controlPlane.Machines).To(HaveLen(4))

machinesNotUptoDate, machinesNotUptoDateConditionMessages := controlPlane.NotUpToDateMachines()
g.Expect(machinesNotUptoDate.Names()).To(ConsistOf("m2", "m3"))
g.Expect(machinesNotUptoDateConditionMessages).To(HaveLen(2))
g.Expect(machinesNotUptoDateConditionMessages).To(HaveKeyWithValue("m2", []string{"Version v1.29.0, v1.31.0 required"}))
g.Expect(machinesNotUptoDateConditionMessages).To(HaveKeyWithValue("m3", []string{"Version v1.29.3, v1.31.0 required"}))

machinesNeedingRollout, machinesNotUptoDateLogMessages := controlPlane.MachinesNeedingRollout()
g.Expect(machinesNeedingRollout.Names()).To(ConsistOf("m2"))
g.Expect(machinesNotUptoDateLogMessages).To(HaveLen(2))
g.Expect(machinesNotUptoDateLogMessages).To(HaveKeyWithValue("m2", []string{"Machine version \"v1.29.0\" is not equal to KCP version \"v1.31.0\""}))
g.Expect(machinesNotUptoDateLogMessages).To(HaveKeyWithValue("m3", []string{"Machine version \"v1.29.3\" is not equal to KCP version \"v1.31.0\""}))

upToDateMachines := controlPlane.UpToDateMachines()
g.Expect(upToDateMachines).To(HaveLen(2))
g.Expect(upToDateMachines.Names()).To(ConsistOf("m1", "m4"))

fd, err := controlPlane.NextFailureDomainForScaleUp(ctx)
g.Expect(err).NotTo(HaveOccurred())
g.Expect(fd).To(Equal(ptr.To("two"))) // deleted up-to-date machines should not be counted when picking the next failure domain for scale up
})
}

func TestHasMachinesToBeRemediated(t *testing.T) {
Expand Down
Loading