From b89cccb1db436eef97bf8410b99b9eeb5c0162e0 Mon Sep 17 00:00:00 2001 From: oraz Date: Wed, 6 Nov 2024 15:57:24 +0200 Subject: [PATCH 1/6] Try to remediate only a Ready node and get node role Improve getAvailableWorkerNodes to getReadyWorkerNodes by getting a list of all the ready worker nodes. Otherwise we will try to remediate node and fail on fetching its boot time at BeforeEach. Furthermore getNodeRoleFromMachine function will add the node role at the list of nodes prior to every e2e test --- test/e2e/far_e2e_test.go | 44 ++++++++++++++++++++++++++------------- test/e2e/utils/cluster.go | 20 +++++++++++++++++- 2 files changed, 48 insertions(+), 16 deletions(-) diff --git a/test/e2e/far_e2e_test.go b/test/e2e/far_e2e_test.go index 67c5e93e..086a0bff 100644 --- a/test/e2e/far_e2e_test.go +++ b/test/e2e/far_e2e_test.go @@ -55,10 +55,6 @@ var _ = Describe("FAR E2e", func() { fenceAgent, nodeIdentifierPrefix string testShareParam map[v1alpha1.ParameterName]string testNodeParam map[v1alpha1.ParameterName]map[v1alpha1.NodeName]string - selectedNode *corev1.Node - nodeName string - pod *corev1.Pod - startTime, nodeBootTimeBefore time.Time err error ) BeforeEach(func() { @@ -89,14 +85,20 @@ var _ = Describe("FAR E2e", func() { // runFARTests is a utility function to run FAR tests. // It accepts a remediation strategy and a condition to determine if the tests should be skipped. 
runFARTests := func(remediationStrategy v1alpha1.RemediationStrategyType, skipCondition func() bool) { - var availableWorkerNodes *corev1.NodeList + var ( + availableWorkerNodes *corev1.NodeList + selectedNode *corev1.Node + nodeName string + pod *corev1.Pod + startTime, nodeBootTimeBefore time.Time + ) BeforeEach(func() { if skipCondition() { Skip("Skip this block due to unsupported condition") } if availableWorkerNodes == nil { - availableWorkerNodes = getAvailableWorkerNodes() + availableWorkerNodes = getReadyWorkerNodes() } selectedNode = pickRemediatedNode(availableWorkerNodes) nodeName = selectedNode.Name @@ -240,24 +242,36 @@ func buildNodeParameters(clusterPlatformType configv1.PlatformType) (map[v1alpha return testNodeParam, nil } -// getAvailableNodes a list of available worker nodes in the cluster -func getAvailableWorkerNodes() *corev1.NodeList { - availableNodes := &corev1.NodeList{} +// getReadyWorkerNodes returns a list of ready worker nodes in the cluster if any +func getReadyWorkerNodes() *corev1.NodeList { + availableWorkerNodes := &corev1.NodeList{} selector := labels.NewSelector() - requirement, _ := labels.NewRequirement(medik8sLabels.WorkerRole, selection.Exists, []string{}) + requirement, err := labels.NewRequirement(medik8sLabels.WorkerRole, selection.Exists, []string{}) + Expect(err).To(BeNil()) selector = selector.Add(*requirement) - Expect(k8sClient.List(context.Background(), availableNodes, &client.ListOptions{LabelSelector: selector})).ToNot(HaveOccurred()) - if len(availableNodes.Items) < 1 { - Fail("No worker nodes found in the cluster") + Expect(k8sClient.List(context.Background(), availableWorkerNodes, &client.ListOptions{LabelSelector: selector})).ToNot(HaveOccurred()) + + // Filter nodes to only include those in "Ready" state + readyWorkerNodes := &corev1.NodeList{} + for _, node := range availableWorkerNodes.Items { + for _, condition := range node.Status.Conditions { + if condition.Type == corev1.NodeReady && condition.Status == 
corev1.ConditionTrue { + readyWorkerNodes.Items = append(readyWorkerNodes.Items, node) + break // "Ready" was found + } + } + } + if len(readyWorkerNodes.Items) < 1 { + Skip("There isn't an available (and ready) worker node in the cluster") } - return availableNodes + return readyWorkerNodes } // pickRemediatedNode randomly returns a next remediated node from the current available nodes, // and then the node is removed from the list of available nodes func pickRemediatedNode(availableNodes *corev1.NodeList) *corev1.Node { if len(availableNodes.Items) < 1 { - Fail("No available node found for remediation") + Skip("There isn't an available (and ready) worker node in the cluster") } // Generate a random seed based on the current time r := rand.New(rand.NewSource(time.Now().UnixNano())) diff --git a/test/e2e/utils/cluster.go b/test/e2e/utils/cluster.go index 07098066..14ad74de 100644 --- a/test/e2e/utils/cluster.go +++ b/test/e2e/utils/cluster.go @@ -48,6 +48,23 @@ func GetSecretData(clientSet *kubernetes.Clientset, secretName, secretNamespace, return string(secret.Data[secretData1]), string(secret.Data[secretData2]), nil } +// getNodeRoleFromMachine return node role "master/control-plane" or "worker" from machine label if present, otherwise "unknown" +func getNodeRoleFromMachine(nodeLabels map[string]string) string { + machineLabelPrefixRole := "machine.openshift.io/cluster-api-machine-" + // look for machine.openshift.io/cluster-api-machine-role or machine.openshift.io/cluster-api-machine-type label + for _, labelKey := range []string{machineLabelPrefixRole + "role", machineLabelPrefixRole + "type"} { + if labelVal, isFound := nodeLabels[labelKey]; isFound { + if labelVal == "worker" { + return "worker" + } + if labelVal == "master" { + return "master/control-plane" + } + } + } + return "unknown" +} + // GetAWSNodeInfoList returns a list of the node names and their identification, e.g., AWS instance ID func GetAWSNodeInfoList(machineClient *machineclient.Clientset) 
(map[v1alpha1.NodeName]string, error) { // oc get machine -n openshift-machine-api MACHINE_NAME -o jsonpath='{.spec.providerID}' @@ -73,13 +90,14 @@ func GetAWSNodeInfoList(machineClient *machineclient.Clientset) (map[v1alpha1.No continue } nodeName := v1alpha1.NodeName(machine.Status.NodeRef.Name) + nodeRole := getNodeRoleFromMachine(machine.Labels) providerID := *machine.Spec.ProviderID // Get the instance ID from the provider ID aws:///us-east-1b/i-082ac37ab919a82c2 -> i-082ac37ab919a82c2 splitedProviderID := strings.Split(providerID, "/i-") instanceID := "i-" + splitedProviderID[1] nodeList[nodeName] = instanceID - fmt.Printf("node: %s Instance ID: %s \n", nodeName, instanceID) + fmt.Printf("node: %s, Role: %s, Instance ID: %s \n", nodeName, nodeRole, instanceID) } return nodeList, missNodeMachineErr } From c2baa2f3b1dd694486e30de99d2c6b1c6b4fc336 Mon Sep 17 00:00:00 2001 From: oraz Date: Wed, 6 Nov 2024 15:59:28 +0200 Subject: [PATCH 2/6] Fix listing machines with no associated node Bad return error and continue inside if was removed in favor of else to the if statement --- test/e2e/utils/cluster.go | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/test/e2e/utils/cluster.go b/test/e2e/utils/cluster.go index 14ad74de..92e3a8d4 100644 --- a/test/e2e/utils/cluster.go +++ b/test/e2e/utils/cluster.go @@ -79,25 +79,28 @@ func GetAWSNodeInfoList(machineClient *machineclient.Clientset) (map[v1alpha1.No } var missNodeMachineErr error + missNodeMachineNames := "" // creates map for nodeName and AWS instance ID for _, machine := range machineList.Items { if machine.Status.NodeRef == nil || machine.Spec.ProviderID == nil { if missNodeMachineErr != nil { - missNodeMachineErr = fmt.Errorf("machine %s is not associated with any node or it't provider ID is missing\n%w", machine.Spec.Name, missNodeMachineErr) + missNodeMachineNames += ", " + machine.ObjectMeta.GetName() + missNodeMachineErr = fmt.Errorf("machines %s are not 
associated with any node or there provider ID is missing", missNodeMachineNames) } else { - missNodeMachineErr = fmt.Errorf("machine %s is not associated with any node or it't provider ID is missing", machine.Spec.Name) + missNodeMachineNames = machine.ObjectMeta.GetName() + missNodeMachineErr = fmt.Errorf("machine %s is not associated with any node or it's provider ID is missing", machine.ObjectMeta.GetName()) } - continue + } else { + nodeName := v1alpha1.NodeName(machine.Status.NodeRef.Name) + nodeRole := getNodeRoleFromMachine(machine.Labels) + providerID := *machine.Spec.ProviderID + + // Get the instance ID from the provider ID aws:///us-east-1b/i-082ac37ab919a82c2 -> i-082ac37ab919a82c2 + splitedProviderID := strings.Split(providerID, "/i-") + instanceID := "i-" + splitedProviderID[1] + nodeList[nodeName] = instanceID + fmt.Printf("node: %s, Role: %s, Instance ID: %s \n", nodeName, nodeRole, instanceID) } - nodeName := v1alpha1.NodeName(machine.Status.NodeRef.Name) - nodeRole := getNodeRoleFromMachine(machine.Labels) - providerID := *machine.Spec.ProviderID - - // Get the instance ID from the provider ID aws:///us-east-1b/i-082ac37ab919a82c2 -> i-082ac37ab919a82c2 - splitedProviderID := strings.Split(providerID, "/i-") - instanceID := "i-" + splitedProviderID[1] - nodeList[nodeName] = instanceID - fmt.Printf("node: %s, Role: %s, Instance ID: %s \n", nodeName, nodeRole, instanceID) } return nodeList, missNodeMachineErr } From b9cc66b67e61fcfc57d23a9402de1e9656f454b2 Mon Sep 17 00:00:00 2001 From: oraz Date: Tue, 12 Nov 2024 12:37:56 +0200 Subject: [PATCH 3/6] Extend RetryCount for AWS agent RetryCount of 5 is too low for running the AWS agent. 
Leading to a context timed out, and stop trying to reboot the unhealthy node --- test/e2e/far_e2e_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/e2e/far_e2e_test.go b/test/e2e/far_e2e_test.go index 086a0bff..bbe92ddb 100644 --- a/test/e2e/far_e2e_test.go +++ b/test/e2e/far_e2e_test.go @@ -317,7 +317,7 @@ func createFAR(nodeName string, agent string, sharedParameters map[v1alpha1.Para SharedParameters: sharedParameters, NodeParameters: nodeParameters, RemediationStrategy: strategy, - RetryCount: 5, + RetryCount: 10, RetryInterval: metav1.Duration{Duration: 20 * time.Second}, Timeout: metav1.Duration{Duration: 60 * time.Second}, }, From 8589575a67f85b8b82334a1ea5edd51f334659c6 Mon Sep 17 00:00:00 2001 From: oraz Date: Tue, 12 Nov 2024 14:41:42 +0200 Subject: [PATCH 4/6] Fail testing on non-supported platform and change toleration operator for test pods Move cluster information from BeforeEach to an independent test, and skip subsequent tests on failure. Tested pods toleration should use the Exist operator over Equal as the toleration value is irrelevant --- test/e2e/far_e2e_test.go | 62 ++++++++++++++++++++++----------------- test/e2e/utils/command.go | 2 +- 2 files changed, 36 insertions(+), 28 deletions(-) diff --git a/test/e2e/far_e2e_test.go b/test/e2e/far_e2e_test.go index bbe92ddb..85fbea0e 100644 --- a/test/e2e/far_e2e_test.go +++ b/test/e2e/far_e2e_test.go @@ -38,7 +38,6 @@ const ( //TODO: try to minimize timeout // eventually parameters - timeoutLogs = "3m0s" timeoutTaint = "2s" // Timeout for checking the FAR taint timeoutReboot = "6m0s" // fencing with fence_aws should be completed within 6 minutes timeoutAfterReboot = "5s" // Timeout for verifying steps after the node has been rebooted @@ -48,7 +47,10 @@ const ( skipOOSREnvVarName = "SKIP_OOST_REMEDIATION_VERIFICATION" ) -var remediationTimes []time.Duration +var ( + stopTesting bool + remediationTimes []time.Duration +) var _ = Describe("FAR E2e", func() { var ( @@ -57,29 
+59,32 @@ var _ = Describe("FAR E2e", func() { testNodeParam map[v1alpha1.ParameterName]map[v1alpha1.NodeName]string err error ) - BeforeEach(func() { - // create FAR CR spec based on OCP platformn - clusterPlatform, err := e2eUtils.GetClusterInfo(configClient) - Expect(err).ToNot(HaveOccurred(), "can't identify the cluster platform") - log.Info("Begin e2e test", "Cluster name", string(clusterPlatform.Name), "PlatformType", string(clusterPlatform.Status.PlatformStatus.Type)) - - switch clusterPlatform.Status.PlatformStatus.Type { - case configv1.AWSPlatformType: - fenceAgent = fenceAgentAWS - nodeIdentifierPrefix = nodeIdentifierPrefixAWS - By("running fence_aws") - case configv1.BareMetalPlatformType: - fenceAgent = fenceAgentIPMI - nodeIdentifierPrefix = nodeIdentifierPrefixIPMI - By("running fence_ipmilan") - default: - Skip("FAR haven't been tested on this kind of cluster (non AWS or BareMetal)") - } + When("trying to identify cluster platform", func() { + It("should be AWS/BareMetal for finding the needed cluster and node parameters", func() { + // create FAR CR spec based on OCP platformn + clusterPlatform, err := e2eUtils.GetClusterInfo(configClient) + Expect(err).ToNot(HaveOccurred(), "can't identify the cluster platform") + log.Info("Getting Cluster Infromation", "Cluster name", string(clusterPlatform.Name), "PlatformType", string(clusterPlatform.Status.PlatformStatus.Type)) + + switch clusterPlatform.Status.PlatformStatus.Type { + case configv1.AWSPlatformType: + fenceAgent = fenceAgentAWS + nodeIdentifierPrefix = nodeIdentifierPrefixAWS + By("running fence_aws") + case configv1.BareMetalPlatformType: + fenceAgent = fenceAgentIPMI + nodeIdentifierPrefix = nodeIdentifierPrefixIPMI + By("running fence_ipmilan") + default: + stopTesting = true // Mark to stop subsequent tests + Fail("FAR haven't been tested on this kind of cluster (non AWS or BareMetal)") + } - testShareParam, err = buildSharedParameters(clusterPlatform, fenceAgentAction) - 
Expect(err).ToNot(HaveOccurred(), "can't get shared information") - testNodeParam, err = buildNodeParameters(clusterPlatform.Status.PlatformStatus.Type) - Expect(err).ToNot(HaveOccurred(), "can't get node information") + testShareParam, err = buildSharedParameters(clusterPlatform, fenceAgentAction) + Expect(err).ToNot(HaveOccurred(), "can't get shared information") + testNodeParam, err = buildNodeParameters(clusterPlatform.Status.PlatformStatus.Type) + Expect(err).ToNot(HaveOccurred(), "can't get node information") + }) }) // runFARTests is a utility function to run FAR tests. @@ -93,6 +98,9 @@ var _ = Describe("FAR E2e", func() { startTime, nodeBootTimeBefore time.Time ) BeforeEach(func() { + if stopTesting { + Skip("Skip testing due to unsupported platform") + } if skipCondition() { Skip("Skip this block due to unsupported condition") } @@ -109,7 +117,7 @@ var _ = Describe("FAR E2e", func() { Expect(err).ToNot(HaveOccurred(), "failed to get boot time of the node") // create tested pod which will be deleted by the far CR - pod = createTestedPod(nodeName, testContainerName) + pod = createTestedPod(nodeName) DeferCleanup(cleanupTestedResources, pod) // set the node as "unhealthy" by disabling kubelet @@ -284,14 +292,14 @@ func pickRemediatedNode(availableNodes *corev1.NodeList) *corev1.Node { } // createTestedPod creates tested pod which will be deleted by the far CR -func createTestedPod(nodeName, containerName string) *corev1.Pod { +func createTestedPod(nodeName string) *corev1.Pod { pod := e2eUtils.GetPod(nodeName, testContainerName) pod.Name = testPodName pod.Namespace = testNsName pod.Spec.Tolerations = []corev1.Toleration{ { Key: v1alpha1.FARNoExecuteTaintKey, - Operator: corev1.TolerationOpEqual, + Operator: corev1.TolerationOpExists, Effect: corev1.TaintEffectNoExecute, }, } diff --git a/test/e2e/utils/command.go b/test/e2e/utils/command.go index cef9f22d..61ec205e 100644 --- a/test/e2e/utils/command.go +++ b/test/e2e/utils/command.go @@ -193,7 +193,7 @@ 
func GetPod(nodeName, containerName string) *corev1.Pod { Tolerations: []corev1.Toleration{ { Key: v1alpha1.FARNoExecuteTaintKey, - Operator: corev1.TolerationOpEqual, + Operator: corev1.TolerationOpExists, Effect: corev1.TaintEffectNoExecute, }, { From 156442edea4ebbb626c0091462187feaa8d57909 Mon Sep 17 00:00:00 2001 From: oraz Date: Wed, 13 Nov 2024 10:41:38 +0200 Subject: [PATCH 5/6] Fail on missing Ready Worker nodes Previously it was just skipping but it hides that we don't run tests and still getting a green result --- test/e2e/far_e2e_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/e2e/far_e2e_test.go b/test/e2e/far_e2e_test.go index 85fbea0e..917acd21 100644 --- a/test/e2e/far_e2e_test.go +++ b/test/e2e/far_e2e_test.go @@ -270,7 +270,7 @@ func getReadyWorkerNodes() *corev1.NodeList { } } if len(readyWorkerNodes.Items) < 1 { - Skip("There isn't an available (and ready) worker node in the cluster") + Fail("There isn't an available (and ready) worker node in the cluster") } return readyWorkerNodes } @@ -279,7 +279,7 @@ func getReadyWorkerNodes() *corev1.NodeList { // and then the node is removed from the list of available nodes func pickRemediatedNode(availableNodes *corev1.NodeList) *corev1.Node { if len(availableNodes.Items) < 1 { - Skip("There isn't an available (and ready) worker node in the cluster") + Fail("There isn't an available (and ready) worker node in the cluster") } // Generate a random seed based on the current time r := rand.New(rand.NewSource(time.Now().UnixNano())) From 556f51a21e87ef73c29a14f968be5e33725156e6 Mon Sep 17 00:00:00 2001 From: oraz Date: Thu, 14 Nov 2024 08:27:26 +0200 Subject: [PATCH 6/6] Fail early on missing Ready Worker node Move the check of missing Ready worker node outside of the previous two functions. 
It simplifies the code --- test/e2e/far_e2e_test.go | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/test/e2e/far_e2e_test.go b/test/e2e/far_e2e_test.go index 917acd21..c6015ec7 100644 --- a/test/e2e/far_e2e_test.go +++ b/test/e2e/far_e2e_test.go @@ -108,6 +108,10 @@ var _ = Describe("FAR E2e", func() { if availableWorkerNodes == nil { availableWorkerNodes = getReadyWorkerNodes() } + if len(availableWorkerNodes.Items) < 1 { + Fail("There isn't an available (and Ready) worker node in the cluster") + } + selectedNode = pickRemediatedNode(availableWorkerNodes) nodeName = selectedNode.Name printNodeDetails(selectedNode, nodeIdentifierPrefix, testNodeParam) @@ -269,18 +273,12 @@ func getReadyWorkerNodes() *corev1.NodeList { } } } - if len(readyWorkerNodes.Items) < 1 { - Fail("There isn't an available (and ready) worker node in the cluster") - } return readyWorkerNodes } // pickRemediatedNode randomly returns a next remediated node from the current available nodes, // and then the node is removed from the list of available nodes func pickRemediatedNode(availableNodes *corev1.NodeList) *corev1.Node { - if len(availableNodes.Items) < 1 { - Fail("There isn't an available (and ready) worker node in the cluster") - } // Generate a random seed based on the current time r := rand.New(rand.NewSource(time.Now().UnixNano())) // Randomly select a worker node