From 4578c2377c6b6076683c33a0aa3d997a3f45857c Mon Sep 17 00:00:00 2001
From: Danil-Grigorev
Date: Wed, 11 Sep 2024 18:24:33 +0200
Subject: [PATCH] Fix: E2E failures in CI

Removed:
- Metrics and pod log collection; crust-gather already collects logs
  for all resources.

Fixed:
- MachineDeployment checks now wait for running machines directly.
  MachineSets were picked at random, as they are indistinguishable by
  labels and belong to the same MachineDeployment. This caused flakes:
  the old MachineSet was expected to scale, while the new one actually
  did.
- Increased ClusterClass apply timeouts, as CAPD webhooks may take
  longer to stand up.

Signed-off-by: Danil-Grigorev
---
 test/e2e/common.go                | 113 +++++++++++++++++++++++++++---
 test/e2e/e2e_clusterclass_test.go |   4 +-
 test/e2e/e2e_suite_test.go        |   6 +-
 test/e2e/e2e_test.go              |   2 +-
 test/e2e/e2e_upgrade_test.go      |   6 +-
 test/e2e/helpers.go               |  76 ++++++++++----------
 6 files changed, 155 insertions(+), 52 deletions(-)

diff --git a/test/e2e/common.go b/test/e2e/common.go
index 8de682b9..7890ec16 100644
--- a/test/e2e/common.go
+++ b/test/e2e/common.go
@@ -32,9 +32,11 @@ import (
 	apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
+	"sigs.k8s.io/cluster-api/cmd/clusterctl/client/config"
 	"sigs.k8s.io/cluster-api/test/framework"
 	"sigs.k8s.io/cluster-api/test/framework/clusterctl"
 	"sigs.k8s.io/cluster-api/util"
+	"sigs.k8s.io/controller-runtime/pkg/log"
 	"sigs.k8s.io/yaml"
 )
 
@@ -55,16 +57,11 @@ func Byf(format string, a ...interface{}) {
 	By(fmt.Sprintf(format, a...))
 }
 
-func setupSpecNamespace(ctx context.Context, specName string, clusterProxy framework.ClusterProxy, artifactFolder string) (*corev1.Namespace, context.CancelFunc) {
+func setupSpecNamespace(ctx context.Context, specName string, clusterProxy framework.ClusterProxy, _ string) (*corev1.Namespace, context.CancelFunc) {
 	Byf("Creating a namespace for hosting the %q test spec", specName)
 
-	namespace, cancelWatches := framework.CreateNamespaceAndWatchEvents(ctx, framework.CreateNamespaceAndWatchEventsInput{
-		Creator:   clusterProxy.GetClient(),
-		ClientSet: clusterProxy.GetClientSet(),
-		Name:      fmt.Sprintf("%s-%s", specName, util.RandomString(6)),
-		LogFolder: filepath.Join(artifactFolder, "clusters", clusterProxy.GetName()),
-	})
-
-	return namespace, cancelWatches
+	_, cancelWatches := context.WithCancel(ctx)
+	return framework.CreateNamespace(ctx, framework.CreateNamespaceInput{Creator: clusterProxy.GetClient(), Name: fmt.Sprintf("%s-%s", specName, util.RandomString(6))}, "40s", "10s"), cancelWatches
 }
 
 func cleanupInstallation(ctx context.Context, clusterctlLogFolder, clusterctlConfigPath string, proxy framework.ClusterProxy) func() {
@@ -191,3 +188,103 @@ func localLoadE2EConfig(configPath string) *clusterctl.E2EConfig {
 
 	return config
 }
+
+// UpgradeManagementCluster upgrades the providers in a management cluster using clusterctl, and waits for the cluster to be ready.
+func UpgradeManagementCluster(ctx context.Context, input clusterctl.UpgradeManagementClusterAndWaitInput) {
+	Expect(ctx).NotTo(BeNil(), "ctx is required for UpgradeManagementCluster")
+	Expect(input.ClusterProxy).ToNot(BeNil(), "Invalid argument. input.ClusterProxy can't be nil when calling UpgradeManagementCluster")
+	Expect(input.ClusterctlConfigPath).To(BeAnExistingFile(), "Invalid argument. input.ClusterctlConfigPath must be an existing file when calling UpgradeManagementCluster")
+
+	// Check if the user wants a custom upgrade
+	isCustomUpgrade := input.CoreProvider != "" ||
+		len(input.BootstrapProviders) > 0 ||
+		len(input.ControlPlaneProviders) > 0 ||
+		len(input.InfrastructureProviders) > 0 ||
+		len(input.IPAMProviders) > 0 ||
+		len(input.RuntimeExtensionProviders) > 0 ||
+		len(input.AddonProviders) > 0
+
+	Expect((input.Contract != "" && !isCustomUpgrade) || (input.Contract == "" && isCustomUpgrade)).To(BeTrue(), `Invalid argument. Either the input.Contract parameter or at least one of the following providers has to be set:
+		input.CoreProvider, input.BootstrapProviders, input.ControlPlaneProviders, input.InfrastructureProviders, input.IPAMProviders, input.RuntimeExtensionProviders, input.AddonProviders`)
+
+	Expect(os.MkdirAll(input.LogFolder, 0750)).To(Succeed(), "Invalid argument. input.LogFolder can't be created for UpgradeManagementCluster")
+
+	upgradeInput := clusterctl.UpgradeInput{
+		ClusterctlConfigPath:      input.ClusterctlConfigPath,
+		ClusterctlVariables:       input.ClusterctlVariables,
+		ClusterName:               input.ClusterProxy.GetName(),
+		KubeconfigPath:            input.ClusterProxy.GetKubeconfigPath(),
+		Contract:                  input.Contract,
+		CoreProvider:              input.CoreProvider,
+		BootstrapProviders:        input.BootstrapProviders,
+		ControlPlaneProviders:     input.ControlPlaneProviders,
+		InfrastructureProviders:   input.InfrastructureProviders,
+		IPAMProviders:             input.IPAMProviders,
+		RuntimeExtensionProviders: input.RuntimeExtensionProviders,
+		AddonProviders:            input.AddonProviders,
+		LogFolder:                 input.LogFolder,
+	}
+
+	clusterctl.Upgrade(ctx, upgradeInput)
+
+	// We have to skip collecting metrics, as it causes failures in CI
+}
+
+// InitManagementCluster initializes a management cluster using clusterctl.
+func InitManagementCluster(ctx context.Context, input clusterctl.InitManagementClusterAndWatchControllerLogsInput, intervals ...interface{}) {
+	Expect(ctx).NotTo(BeNil(), "ctx is required for InitManagementCluster")
+	Expect(input.ClusterProxy).ToNot(BeNil(), "Invalid argument. input.ClusterProxy can't be nil when calling InitManagementCluster")
+	Expect(input.ClusterctlConfigPath).To(BeAnExistingFile(), "Invalid argument. input.ClusterctlConfigPath must be an existing file when calling InitManagementCluster")
+	Expect(input.InfrastructureProviders).ToNot(BeEmpty(), "Invalid argument. input.InfrastructureProviders can't be empty when calling InitManagementCluster")
+	Expect(os.MkdirAll(input.LogFolder, 0750)).To(Succeed(), "Invalid argument. 
input.LogFolder can't be created for InitManagementCluster") + + logger := log.FromContext(ctx) + + if input.CoreProvider == "" { + input.CoreProvider = config.ClusterAPIProviderName + } + if len(input.BootstrapProviders) == 0 { + input.BootstrapProviders = []string{config.KubeadmBootstrapProviderName} + } + if len(input.ControlPlaneProviders) == 0 { + input.ControlPlaneProviders = []string{config.KubeadmControlPlaneProviderName} + } + + client := input.ClusterProxy.GetClient() + controllersDeployments := framework.GetControllerDeployments(ctx, framework.GetControllerDeploymentsInput{ + Lister: client, + }) + if len(controllersDeployments) == 0 { + initInput := clusterctl.InitInput{ + // pass reference to the management cluster hosting this test + KubeconfigPath: input.ClusterProxy.GetKubeconfigPath(), + // pass the clusterctl config file that points to the local provider repository created for this test + ClusterctlConfigPath: input.ClusterctlConfigPath, + // setup the desired list of providers for a single-tenant management cluster + CoreProvider: input.CoreProvider, + BootstrapProviders: input.BootstrapProviders, + ControlPlaneProviders: input.ControlPlaneProviders, + InfrastructureProviders: input.InfrastructureProviders, + IPAMProviders: input.IPAMProviders, + RuntimeExtensionProviders: input.RuntimeExtensionProviders, + AddonProviders: input.AddonProviders, + // setup clusterctl logs folder + LogFolder: input.LogFolder, + } + + clusterctl.Init(ctx, initInput) + } + + logger.Info("Waiting for provider controllers to be running") + + controllersDeployments = framework.GetControllerDeployments(ctx, framework.GetControllerDeploymentsInput{ + Lister: client, + }) + Expect(controllersDeployments).ToNot(BeEmpty(), "The list of controller deployments should not be empty") + for _, deployment := range controllersDeployments { + framework.WaitForDeploymentsAvailable(ctx, framework.WaitForDeploymentsAvailableInput{ + Getter: client, + Deployment: deployment, + }, intervals...) + } +} diff --git a/test/e2e/e2e_clusterclass_test.go b/test/e2e/e2e_clusterclass_test.go index 7c48d9d3..1bb8a99b 100644 --- a/test/e2e/e2e_clusterclass_test.go +++ b/test/e2e/e2e_clusterclass_test.go @@ -102,7 +102,9 @@ var _ = Describe("Workload cluster creation", func() { } }) Expect(err).ToNot(HaveOccurred()) - Expect(bootstrapClusterProxy.Apply(ctx, []byte(clusterClassConfig))).To(Succeed(), "Failed to apply ClusterClass definition") + Eventually(func() error { + return bootstrapClusterProxy.Apply(ctx, []byte(clusterClassConfig)) + }, e2eConfig.GetIntervals(specName, "wait-cluster")...).Should(Succeed(), "Failed to apply ClusterClass definition") By("Create a Docker Cluster from topology") diff --git a/test/e2e/e2e_suite_test.go b/test/e2e/e2e_suite_test.go index bc92160e..d5d86d57 100644 --- a/test/e2e/e2e_suite_test.go +++ b/test/e2e/e2e_suite_test.go @@ -236,7 +236,7 @@ func setupBootstrapCluster(config *clusterctl.E2EConfig, scheme *runtime.Scheme, // initBootstrapCluster initializes a bootstrap cluster with the latest minor version. 
func initBootstrapCluster(bootstrapClusterProxy framework.ClusterProxy, config *clusterctl.E2EConfig, clusterctlConfig, artifactFolder string) { - clusterctl.InitManagementClusterAndWatchControllerLogs(context.TODO(), clusterctl.InitManagementClusterAndWatchControllerLogsInput{ + InitManagementCluster(context.TODO(), clusterctl.InitManagementClusterAndWatchControllerLogsInput{ ClusterProxy: bootstrapClusterProxy, ClusterctlConfigPath: clusterctlConfig, InfrastructureProviders: config.InfrastructureProviders(), @@ -245,13 +245,14 @@ func initBootstrapCluster(bootstrapClusterProxy framework.ClusterProxy, config * BootstrapProviders: []string{"rke2-bootstrap"}, ControlPlaneProviders: []string{"rke2-control-plane"}, LogFolder: filepath.Join(artifactFolder, "clusters", bootstrapClusterProxy.GetName()), + DisableMetricsCollection: true, }, config.GetIntervals(bootstrapClusterProxy.GetName(), "wait-controllers")...) } // initUpgradableBootstrapCluster initializes a bootstrap cluster with the latest minor version N-1 and used to perform an upgrade to the latest version. // Make sure to update the version in the providers list to the latest minor version N-1. func initUpgradableBootstrapCluster(bootstrapClusterProxy framework.ClusterProxy, config *clusterctl.E2EConfig, clusterctlConfig, artifactFolder string) { - clusterctl.InitManagementClusterAndWatchControllerLogs(context.TODO(), clusterctl.InitManagementClusterAndWatchControllerLogsInput{ + InitManagementCluster(context.TODO(), clusterctl.InitManagementClusterAndWatchControllerLogsInput{ ClusterProxy: bootstrapClusterProxy, ClusterctlConfigPath: clusterctlConfig, InfrastructureProviders: config.InfrastructureProviders(), @@ -260,6 +261,7 @@ func initUpgradableBootstrapCluster(bootstrapClusterProxy framework.ClusterProxy BootstrapProviders: []string{"rke2-bootstrap:v0.6.0"}, ControlPlaneProviders: []string{"rke2-control-plane:v0.6.0"}, LogFolder: filepath.Join(artifactFolder, "clusters", bootstrapClusterProxy.GetName()), + DisableMetricsCollection: true, }, config.GetIntervals(bootstrapClusterProxy.GetName(), "wait-controllers")...) } diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go index 1a4c04fc..51ab7588 100644 --- a/test/e2e/e2e_test.go +++ b/test/e2e/e2e_test.go @@ -151,7 +151,7 @@ var _ = Describe("Workload cluster creation", func() { }, result) WaitForClusterToUpgrade(ctx, WaitForClusterToUpgradeInput{ - Lister: bootstrapClusterProxy.GetClient(), + Reader: bootstrapClusterProxy.GetClient(), ControlPlane: result.ControlPlane, MachineDeployments: result.MachineDeployments, VersionAfterUpgrade: e2eConfig.GetVariable(KubernetesVersionUpgradeTo), diff --git a/test/e2e/e2e_upgrade_test.go b/test/e2e/e2e_upgrade_test.go index 77594396..f6f15791 100644 --- a/test/e2e/e2e_upgrade_test.go +++ b/test/e2e/e2e_upgrade_test.go @@ -115,13 +115,13 @@ var _ = Describe("Workload cluster creation", func() { }, e2eConfig.GetIntervals(specName, "wait-control-plane")...) By("Upgrading to latest boostrap/controlplane provider version") - clusterctl.UpgradeManagementClusterAndWait(ctx, clusterctl.UpgradeManagementClusterAndWaitInput{ + UpgradeManagementCluster(ctx, clusterctl.UpgradeManagementClusterAndWaitInput{ ClusterProxy: bootstrapClusterProxy, ClusterctlConfigPath: clusterctlConfigPath, BootstrapProviders: []string{"rke2-bootstrap:v0.7.99"}, ControlPlaneProviders: []string{"rke2-control-plane:v0.7.99"}, LogFolder: clusterctlLogFolder, - }, e2eConfig.GetIntervals(specName, "wait-controllers")...) 
+ }) WaitForControlPlaneToBeReady(ctx, WaitForControlPlaneToBeReadyInput{ Getter: bootstrapClusterProxy.GetClient(), @@ -174,7 +174,7 @@ var _ = Describe("Workload cluster creation", func() { }, result) WaitForClusterToUpgrade(ctx, WaitForClusterToUpgradeInput{ - Lister: bootstrapClusterProxy.GetClient(), + Reader: bootstrapClusterProxy.GetClient(), ControlPlane: result.ControlPlane, MachineDeployments: result.MachineDeployments, VersionAfterUpgrade: e2eConfig.GetVariable(KubernetesVersionUpgradeTo), diff --git a/test/e2e/helpers.go b/test/e2e/helpers.go index 480c7cea..9e8a247c 100644 --- a/test/e2e/helpers.go +++ b/test/e2e/helpers.go @@ -29,7 +29,6 @@ import ( . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" "github.com/pkg/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/klog/v2" @@ -138,14 +137,6 @@ func ApplyClusterTemplateAndWait(ctx context.Context, input ApplyClusterTemplate }) Expect(workloadClusterTemplate).ToNot(BeNil(), "Failed to get the cluster template") - // Ensure we have a Cluster for dump and cleanup steps in AfterEach even if ApplyClusterTemplateAndWait fails. - result.Cluster = &clusterv1.Cluster{ - ObjectMeta: metav1.ObjectMeta{ - Name: input.ConfigCluster.ClusterName, - Namespace: input.ConfigCluster.Namespace, - }, - } - ApplyCustomClusterTemplateAndWait(ctx, ApplyCustomClusterTemplateAndWaitInput{ ClusterProxy: input.ClusterProxy, CustomTemplateYAML: workloadClusterTemplate, @@ -174,19 +165,10 @@ func ApplyCustomClusterTemplateAndWait(ctx context.Context, input ApplyCustomClu Byf("Creating the workload cluster with name %q from the provided yaml", input.ClusterName) - // Ensure we have a Cluster for dump and cleanup steps in AfterEach even if ApplyClusterTemplateAndWait fails. - result.Cluster = &clusterv1.Cluster{ - ObjectMeta: metav1.ObjectMeta{ - Name: input.ClusterName, - Namespace: input.Namespace, - }, - } - Byf("Applying the cluster template yaml of cluster %s", klog.KRef(input.Namespace, input.ClusterName)) Eventually(func() error { return input.ClusterProxy.Apply(ctx, input.CustomTemplateYAML, input.Args...) - // return input.ClusterProxy.CreateOrUpdate(ctx, input.CustomTemplateYAML, input.CreateOrUpdateOpts...) - }, 1*time.Minute).Should(Succeed(), "Failed to apply the cluster template") + }, input.WaitForClusterIntervals...).Should(Succeed(), "Failed to apply the cluster template") // Once we applied the cluster template we can run PreWaitForCluster. // Note: This can e.g. be used to verify the BeforeClusterCreate lifecycle hook is executed @@ -218,7 +200,7 @@ func ApplyCustomClusterTemplateAndWait(ctx context.Context, input ApplyCustomClu input.WaitForControlPlaneMachinesReady(ctx, input, result) Byf("Waiting for the machine deployments of cluster %s to be provisioned", klog.KRef(input.Namespace, input.ClusterName)) - result.MachineDeployments = framework.DiscoveryAndWaitForMachineDeployments(ctx, framework.DiscoveryAndWaitForMachineDeploymentsInput{ + result.MachineDeployments = DiscoveryAndWaitForMachineDeployments(ctx, framework.DiscoveryAndWaitForMachineDeploymentsInput{ Lister: input.ClusterProxy.GetClient(), Cluster: result.Cluster, }, input.WaitForMachineDeployments...) 
@@ -285,7 +267,7 @@ func DiscoveryAndWaitForRKE2ControlPlaneInitialized(ctx context.Context, input D
 			Namespace: input.Cluster.Namespace,
 		})
 		g.Expect(controlPlane).ToNot(BeNil())
-	}, "10s", "1s").Should(Succeed(), "Couldn't get the control plane for the cluster %s", klog.KObj(input.Cluster))
+	}, "2m", "1s").Should(Succeed(), "Couldn't get the control plane for the cluster %s", klog.KObj(input.Cluster))
 
 	return controlPlane
 }
@@ -445,7 +427,7 @@ func WaitForMachineConditions(ctx context.Context, input WaitForMachineCondition
 
 // WaitForClusterToUpgradeInput is the input for WaitForClusterToUpgrade.
 type WaitForClusterToUpgradeInput struct {
-	Lister              framework.Lister
+	Reader              framework.GetLister
 	ControlPlane        *controlplanev1.RKE2ControlPlane
 	MachineDeployments  []*clusterv1.MachineDeployment
 	VersionAfterUpgrade string
@@ -455,32 +437,52 @@ func WaitForClusterToUpgrade(ctx context.Context, input WaitForClusterToUpgradeInput, intervals ...interface{}) {
 	By("Waiting for machines to update")
 
-	var totalMachineCount int32
-	totalMachineCount = *input.ControlPlane.Spec.Replicas
+	Eventually(func() error {
+		cp := input.ControlPlane.DeepCopy()
+		if err := input.Reader.Get(ctx, client.ObjectKeyFromObject(input.ControlPlane), cp); err != nil {
+			return fmt.Errorf("failed to get control plane: %w", err)
+		}
 
-	for _, md := range input.MachineDeployments {
-		totalMachineCount += *md.Spec.Replicas
-	}
+		updatedDeployments := []*clusterv1.MachineDeployment{}
+		for _, md := range input.MachineDeployments {
+			updated := &clusterv1.MachineDeployment{}
+			if err := input.Reader.Get(ctx, client.ObjectKeyFromObject(md), updated); client.IgnoreNotFound(err) != nil {
+				return fmt.Errorf("failed to get updated machine deployment: %w", err)
+			}
 
-	Eventually(func() (bool, error) {
-		machineList := &clusterv1.MachineList{}
-		if err := input.Lister.List(ctx, machineList); err != nil {
-			return false, fmt.Errorf("failed to list machines: %w", err)
+			updatedDeployments = append(updatedDeployments, updated)
 		}
 
-		if len(machineList.Items) != int(totalMachineCount) { // not all replicas are created
-			return false, nil
+		machineList := &clusterv1.MachineList{}
+		if err := input.Reader.List(ctx, machineList); err != nil {
+			return fmt.Errorf("failed to list machines: %w", err)
 		}
 
 		for _, machine := range machineList.Items {
 			expectedVersion := input.VersionAfterUpgrade + "+rke2r1"
-			if machine.Spec.Version != nil && *machine.Spec.Version != expectedVersion {
-				return false, nil
+			if machine.Spec.Version == nil || *machine.Spec.Version != expectedVersion {
+				return fmt.Errorf("expected machine version %s, got %v", expectedVersion, machine.Spec.Version)
 			}
 		}
 
-		return true, nil
-	}, intervals...).Should(BeTrue(), framework.PrettyPrint(input.ControlPlane))
+		ready := cp.Status.ReadyReplicas == cp.Status.Replicas
+		if !ready {
+			return fmt.Errorf("control plane is not ready: %d ready replicas out of %d", cp.Status.ReadyReplicas, cp.Status.Replicas)
+		}
+
+		expected := cp.Spec.Replicas != nil && *cp.Spec.Replicas == cp.Status.Replicas
+		if !expected {
+			return fmt.Errorf("control plane is not scaled: spec.replicas %v, status.replicas %d", cp.Spec.Replicas, cp.Status.Replicas)
+		}
+
+		for _, md := range updatedDeployments {
+			if md.Spec.Replicas == nil || *md.Spec.Replicas != md.Status.ReadyReplicas {
+				return fmt.Errorf("machine deployment %s is not updated: spec.replicas %v, status.readyReplicas %d", md.Name, md.Spec.Replicas, md.Status.ReadyReplicas)
+			}
+		}
+
+		return nil
+	}, intervals...).Should(Succeed())
 }
 
 // setDefaults sets the default values for ApplyCustomClusterTemplateAndWaitInput if not set.
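
Note on the MachineDeployment fix: ApplyCustomClusterTemplateAndWait now calls a local DiscoveryAndWaitForMachineDeployments wrapper instead of the framework helper, so progress is judged by the Machines owned by each MachineDeployment rather than by a randomly selected MachineSet. The wrapper's body is outside this excerpt; the sketch below only illustrates the idea using the standard cluster-api labels. The function name, the phase check, and the intervals are illustrative assumptions, not the repository's actual helper.

// Sketch only: wait for a MachineDeployment by counting its own running
// Machines (matched by the cluster-name and deployment-name labels) instead
// of inferring progress from a MachineSet. Uses only imports helpers.go
// already has; the name and checks are illustrative.
func waitForMachineDeploymentMachines(ctx context.Context, c client.Client, md *clusterv1.MachineDeployment, intervals ...interface{}) {
	Eventually(func() error {
		machines := &clusterv1.MachineList{}
		if err := c.List(ctx, machines,
			client.InNamespace(md.Namespace),
			client.MatchingLabels{
				clusterv1.ClusterNameLabel:           md.Spec.ClusterName,
				clusterv1.MachineDeploymentNameLabel: md.Name,
			}); err != nil {
			return fmt.Errorf("failed to list machines: %w", err)
		}

		// Count only machines that reached the Running phase.
		running := 0
		for _, m := range machines.Items {
			if m.Status.Phase == string(clusterv1.MachinePhaseRunning) {
				running++
			}
		}

		if md.Spec.Replicas == nil || int32(running) != *md.Spec.Replicas {
			return fmt.Errorf("expected %v running machines, found %d", md.Spec.Replicas, running)
		}

		return nil
	}, intervals...).Should(Succeed())
}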
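
Note on the increased ClusterClass apply timeout: the Eventually-wrapped apply succeeds once CAPD's webhooks start answering, so the same retry pattern can be reused for any manifest that races webhook startup. A minimal sketch, assuming the e2e package's existing Gomega and framework imports; the helper name is illustrative:

// applyWithRetry keeps re-applying a manifest until the target cluster's
// admission webhooks accept it, bounded by the supplied intervals
// (e.g. e2eConfig.GetIntervals(specName, "wait-cluster")).
func applyWithRetry(ctx context.Context, proxy framework.ClusterProxy, manifest []byte, intervals ...interface{}) {
	Eventually(func() error {
		return proxy.Apply(ctx, manifest)
	}, intervals...).Should(Succeed(), "Failed to apply manifest")
}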