Remediate Ready Worker Node in E2E #159

Merged — 6 commits merged on Nov 14, 2024
Changes from 4 commits
108 changes: 65 additions & 43 deletions test/e2e/far_e2e_test.go
@@ -38,7 +38,6 @@ const (

//TODO: try to minimize timeout
// eventually parameters
timeoutLogs = "3m0s"
timeoutTaint = "2s" // Timeout for checking the FAR taint
timeoutReboot = "6m0s" // fencing with fence_aws should be completed within 6 minutes
timeoutAfterReboot = "5s" // Timeout for verifying steps after the node has been rebooted
@@ -48,55 +47,66 @@ const (
skipOOSREnvVarName = "SKIP_OOST_REMEDIATION_VERIFICATION"
)

var remediationTimes []time.Duration
var (
stopTesting bool
remediationTimes []time.Duration
)

var _ = Describe("FAR E2e", func() {
var (
fenceAgent, nodeIdentifierPrefix string
testShareParam map[v1alpha1.ParameterName]string
testNodeParam map[v1alpha1.ParameterName]map[v1alpha1.NodeName]string
selectedNode *corev1.Node
nodeName string
pod *corev1.Pod
startTime, nodeBootTimeBefore time.Time
err error
)
BeforeEach(func() {
// create FAR CR spec based on OCP platformn
clusterPlatform, err := e2eUtils.GetClusterInfo(configClient)
Expect(err).ToNot(HaveOccurred(), "can't identify the cluster platform")
log.Info("Begin e2e test", "Cluster name", string(clusterPlatform.Name), "PlatformType", string(clusterPlatform.Status.PlatformStatus.Type))

switch clusterPlatform.Status.PlatformStatus.Type {
case configv1.AWSPlatformType:
fenceAgent = fenceAgentAWS
nodeIdentifierPrefix = nodeIdentifierPrefixAWS
By("running fence_aws")
case configv1.BareMetalPlatformType:
fenceAgent = fenceAgentIPMI
nodeIdentifierPrefix = nodeIdentifierPrefixIPMI
By("running fence_ipmilan")
default:
Skip("FAR haven't been tested on this kind of cluster (non AWS or BareMetal)")
}
When("trying to identify cluster platform", func() {
It("should be AWS/BareMetal for finding the needed cluster and node parameters", func() {
// create FAR CR spec based on OCP platform
clusterPlatform, err := e2eUtils.GetClusterInfo(configClient)
Expect(err).ToNot(HaveOccurred(), "can't identify the cluster platform")
log.Info("Getting Cluster Infromation", "Cluster name", string(clusterPlatform.Name), "PlatformType", string(clusterPlatform.Status.PlatformStatus.Type))

switch clusterPlatform.Status.PlatformStatus.Type {
case configv1.AWSPlatformType:
fenceAgent = fenceAgentAWS
nodeIdentifierPrefix = nodeIdentifierPrefixAWS
By("running fence_aws")
case configv1.BareMetalPlatformType:
fenceAgent = fenceAgentIPMI
nodeIdentifierPrefix = nodeIdentifierPrefixIPMI
By("running fence_ipmilan")
default:
stopTesting = true // Mark to stop subsequent tests
Fail("FAR haven't been tested on this kind of cluster (non AWS or BareMetal)")
}

testShareParam, err = buildSharedParameters(clusterPlatform, fenceAgentAction)
Expect(err).ToNot(HaveOccurred(), "can't get shared information")
testNodeParam, err = buildNodeParameters(clusterPlatform.Status.PlatformStatus.Type)
Expect(err).ToNot(HaveOccurred(), "can't get node information")
testShareParam, err = buildSharedParameters(clusterPlatform, fenceAgentAction)
Expect(err).ToNot(HaveOccurred(), "can't get shared information")
testNodeParam, err = buildNodeParameters(clusterPlatform.Status.PlatformStatus.Type)
Expect(err).ToNot(HaveOccurred(), "can't get node information")
})
})

// runFARTests is a utility function to run FAR tests.
// It accepts a remediation strategy and a condition to determine if the tests should be skipped.
runFARTests := func(remediationStrategy v1alpha1.RemediationStrategyType, skipCondition func() bool) {
var availableWorkerNodes *corev1.NodeList
var (
availableWorkerNodes *corev1.NodeList
selectedNode *corev1.Node
nodeName string
pod *corev1.Pod
startTime, nodeBootTimeBefore time.Time
)
BeforeEach(func() {
if stopTesting {
Skip("Skip testing due to unsupported platform")
}
if skipCondition() {
Skip("Skip this block due to unsupported condition")
}

if availableWorkerNodes == nil {
availableWorkerNodes = getAvailableWorkerNodes()
availableWorkerNodes = getReadyWorkerNodes()
}
selectedNode = pickRemediatedNode(availableWorkerNodes)
nodeName = selectedNode.Name
@@ -107,7 +117,7 @@ var _ = Describe("FAR E2e", func() {
Expect(err).ToNot(HaveOccurred(), "failed to get boot time of the node")

// create tested pod which will be deleted by the far CR
pod = createTestedPod(nodeName, testContainerName)
pod = createTestedPod(nodeName)
DeferCleanup(cleanupTestedResources, pod)

// set the node as "unhealthy" by disabling kubelet
@@ -240,24 +250,36 @@ func buildNodeParameters(clusterPlatformType configv1.PlatformType) (map[v1alpha
return testNodeParam, nil
}

// getAvailableNodes a list of available worker nodes in the cluster
func getAvailableWorkerNodes() *corev1.NodeList {
availableNodes := &corev1.NodeList{}
// getReadyWorkerNodes returns a list of ready worker nodes in the cluster if any
func getReadyWorkerNodes() *corev1.NodeList {
availableWorkerNodes := &corev1.NodeList{}
selector := labels.NewSelector()
requirement, _ := labels.NewRequirement(medik8sLabels.WorkerRole, selection.Exists, []string{})
requirement, err := labels.NewRequirement(medik8sLabels.WorkerRole, selection.Exists, []string{})
Expect(err).To(BeNil())
selector = selector.Add(*requirement)
Expect(k8sClient.List(context.Background(), availableNodes, &client.ListOptions{LabelSelector: selector})).ToNot(HaveOccurred())
if len(availableNodes.Items) < 1 {
Fail("No worker nodes found in the cluster")
Expect(k8sClient.List(context.Background(), availableWorkerNodes, &client.ListOptions{LabelSelector: selector})).ToNot(HaveOccurred())

// Filter nodes to only include those in "Ready" state
readyWorkerNodes := &corev1.NodeList{}
for _, node := range availableWorkerNodes.Items {
for _, condition := range node.Status.Conditions {
if condition.Type == corev1.NodeReady && condition.Status == corev1.ConditionTrue {
readyWorkerNodes.Items = append(readyWorkerNodes.Items, node)
break // "Ready" was found
}
}
}
if len(readyWorkerNodes.Items) < 1 {
Skip("There isn't an available (and ready) worker node in the cluster")
Review comment (Member): this should result in a failure IMHO

}
return availableNodes
return readyWorkerNodes
}

// pickRemediatedNode randomly picks the next node to remediate from the available nodes,
// and then removes that node from the list of available nodes
func pickRemediatedNode(availableNodes *corev1.NodeList) *corev1.Node {
if len(availableNodes.Items) < 1 {
Fail("No available node found for remediation")
Skip("There isn't an available (and ready) worker node in the cluster")
Review comment (Member): why switch from Fail to Skip? This is hiding that we don't run tests...

Reply (Member, author): Actually not sure if this If is needed, as it was already checked (and will Fail) in https://github.com/medik8s/fence-agents-remediation/pull/159/files/8589575a67f85b8b82334a1ea5edd51f334659c6#diff-b9729ac4458c7637438e3bf558999b35adedcaf4d13af1e3eb752db501fd4997R272.
Do you see value in checking that again after the getReadyWorkerNodes function? It might be nice to make them independent 🤔

Reply (Member): What about moving these checks to the place where both functions are called?
getWorkerNodes returns an empty list. Caller decides if that's a problem. If yes, fail. If not, go on with pickNode...

}
// Generate a random seed based on the current time
r := rand.New(rand.NewSource(time.Now().UnixNano()))
@@ -270,14 +292,14 @@ func pickRemediatedNode(availableNodes *corev1.NodeList) *corev1.Node {
}

// createTestedPod creates tested pod which will be deleted by the far CR
func createTestedPod(nodeName, containerName string) *corev1.Pod {
func createTestedPod(nodeName string) *corev1.Pod {
pod := e2eUtils.GetPod(nodeName, testContainerName)
pod.Name = testPodName
pod.Namespace = testNsName
pod.Spec.Tolerations = []corev1.Toleration{
{
Key: v1alpha1.FARNoExecuteTaintKey,
Operator: corev1.TolerationOpEqual,
Operator: corev1.TolerationOpExists,
Effect: corev1.TaintEffectNoExecute,
},
}
@@ -303,7 +325,7 @@ func createFAR(nodeName string, agent string, sharedParameters map[v1alpha1.Para
SharedParameters: sharedParameters,
NodeParameters: nodeParameters,
RemediationStrategy: strategy,
RetryCount: 5,
RetryCount: 10,
RetryInterval: metav1.Duration{Duration: 20 * time.Second},
Timeout: metav1.Duration{Duration: 60 * time.Second},
},
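
The review thread on pickRemediatedNode above suggests letting the caller decide whether an empty worker-node list is fatal, instead of hiding it behind Skip. Below is a minimal sketch of that pattern; it assumes the package-level k8sClient and the imports already used by far_e2e_test.go, and the helper name getWorkerNodes plus the calling snippet are hypothetical, not the code merged in this PR.

// getWorkerNodes lists the worker nodes and leaves the "empty list" decision to the caller.
// Hypothetical helper sketched from the review suggestion; not part of the merged PR.
func getWorkerNodes() (*corev1.NodeList, error) {
    workerNodes := &corev1.NodeList{}
    requirement, err := labels.NewRequirement(medik8sLabels.WorkerRole, selection.Exists, []string{})
    if err != nil {
        return nil, err
    }
    selector := labels.NewSelector().Add(*requirement)
    if err := k8sClient.List(context.Background(), workerNodes, &client.ListOptions{LabelSelector: selector}); err != nil {
        return nil, err
    }
    return workerNodes, nil
}

// The caller (e.g., the BeforeEach of runFARTests) would then fail explicitly rather than skip:
// workerNodes, err := getWorkerNodes()
// Expect(err).ToNot(HaveOccurred(), "failed to list worker nodes")
// Expect(workerNodes.Items).ToNot(BeEmpty(), "no worker nodes found in the cluster")
// selectedNode = pickRemediatedNode(workerNodes)
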
43 changes: 32 additions & 11 deletions test/e2e/utils/cluster.go
@@ -48,6 +48,23 @@ func GetSecretData(clientSet *kubernetes.Clientset, secretName, secretNamespace,
return string(secret.Data[secretData1]), string(secret.Data[secretData2]), nil
}

// getNodeRoleFromMachine returns the node role "master/control-plane" or "worker" from the machine labels if present, otherwise "unknown"
func getNodeRoleFromMachine(nodeLabels map[string]string) string {
machineLabelPrefixRole := "machine.openshift.io/cluster-api-machine-"
// look for machine.openshift.io/cluster-api-machine-role or machine.openshift.io/cluster-api-machine-type label
for _, labelKey := range []string{machineLabelPrefixRole + "role", machineLabelPrefixRole + "type"} {
if labelVal, isFound := nodeLabels[labelKey]; isFound {
if labelVal == "worker" {
return "worker"
}
if labelVal == "master" {
return "master/control-plane"
}
}
}
return "unknown"
}

// GetAWSNodeInfoList returns a list of the node names and their identification, e.g., AWS instance ID
func GetAWSNodeInfoList(machineClient *machineclient.Clientset) (map[v1alpha1.NodeName]string, error) {
// oc get machine -n openshift-machine-api MACHINE_NAME -o jsonpath='{.spec.providerID}'
@@ -62,24 +79,28 @@ func GetAWSNodeInfoList(machineClient *machineclient.Clientset) (map[v1alpha1.No
}

var missNodeMachineErr error
missNodeMachineNames := ""
// creates map for nodeName and AWS instance ID
for _, machine := range machineList.Items {
if machine.Status.NodeRef == nil || machine.Spec.ProviderID == nil {
if missNodeMachineErr != nil {
missNodeMachineErr = fmt.Errorf("machine %s is not associated with any node or it't provider ID is missing\n%w", machine.Spec.Name, missNodeMachineErr)
missNodeMachineNames += ", " + machine.ObjectMeta.GetName()
missNodeMachineErr = fmt.Errorf("machines %s are not associated with any node or there provider ID is missing", missNodeMachineNames)
} else {
missNodeMachineErr = fmt.Errorf("machine %s is not associated with any node or it't provider ID is missing", machine.Spec.Name)
missNodeMachineNames = machine.ObjectMeta.GetName()
missNodeMachineErr = fmt.Errorf("machine %s is not associated with any node or it's provider ID is missing", machine.ObjectMeta.GetName())
}
continue
} else {
nodeName := v1alpha1.NodeName(machine.Status.NodeRef.Name)
nodeRole := getNodeRoleFromMachine(machine.Labels)
providerID := *machine.Spec.ProviderID

// Get the instance ID from the provider ID aws:///us-east-1b/i-082ac37ab919a82c2 -> i-082ac37ab919a82c2
splitedProviderID := strings.Split(providerID, "/i-")
instanceID := "i-" + splitedProviderID[1]
nodeList[nodeName] = instanceID
fmt.Printf("node: %s, Role: %s, Instance ID: %s \n", nodeName, nodeRole, instanceID)
}
nodeName := v1alpha1.NodeName(machine.Status.NodeRef.Name)
providerID := *machine.Spec.ProviderID

// Get the instance ID from the provider ID aws:///us-east-1b/i-082ac37ab919a82c2 -> i-082ac37ab919a82c2
splitedProviderID := strings.Split(providerID, "/i-")
instanceID := "i-" + splitedProviderID[1]
nodeList[nodeName] = instanceID
fmt.Printf("node: %s Instance ID: %s \n", nodeName, instanceID)
}
return nodeList, missNodeMachineErr
}
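
The instance-ID extraction above splits the provider ID on "/i-" and indexes the second element, which would panic (index out of range) on a provider ID that lacks that segment. A defensive variant could be sketched as below; the helper name parseAWSInstanceID is hypothetical and not part of this PR, and the snippet assumes the strings and fmt imports already present in cluster.go.

// parseAWSInstanceID extracts the instance ID from an AWS provider ID such as
// "aws:///us-east-1b/i-082ac37ab919a82c2" and reports malformed input instead of panicking.
// Hypothetical helper, not part of the merged PR.
func parseAWSInstanceID(providerID string) (string, error) {
    parts := strings.Split(providerID, "/i-")
    if len(parts) != 2 || parts[1] == "" {
        return "", fmt.Errorf("unexpected AWS provider ID format: %q", providerID)
    }
    return "i-" + parts[1], nil
}

GetAWSNodeInfoList could then treat a malformed provider ID the same way it treats a missing one, appending the machine name to missNodeMachineNames instead of relying on the split succeeding.
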
2 changes: 1 addition & 1 deletion test/e2e/utils/command.go
@@ -193,7 +193,7 @@ func GetPod(nodeName, containerName string) *corev1.Pod {
Tolerations: []corev1.Toleration{
{
Key: v1alpha1.FARNoExecuteTaintKey,
Operator: corev1.TolerationOpEqual,
Operator: corev1.TolerationOpExists,
Effect: corev1.TaintEffectNoExecute,
},
{
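
Both files switch the tested pod's toleration operator from Equal to Exists, presumably so the pod tolerates the FAR NoExecute taint regardless of the taint's value (with Equal and an unset Value, only a taint whose value is empty would match). The sketch below illustrates the difference using the ToleratesTaint helper from k8s.io/api/core/v1 and the corev1/v1alpha1 imports already used by the tests; the taint value shown is hypothetical.

// Equal with an unset Value only matches a taint whose value is empty,
// while Exists matches the key with any value.
taint := corev1.Taint{
    Key:    v1alpha1.FARNoExecuteTaintKey,
    Value:  "some-value", // hypothetical value; FAR may or may not set one
    Effect: corev1.TaintEffectNoExecute,
}
tolEqual := corev1.Toleration{Key: v1alpha1.FARNoExecuteTaintKey, Operator: corev1.TolerationOpEqual, Effect: corev1.TaintEffectNoExecute}
tolExists := corev1.Toleration{Key: v1alpha1.FARNoExecuteTaintKey, Operator: corev1.TolerationOpExists, Effect: corev1.TaintEffectNoExecute}
fmt.Println(tolEqual.ToleratesTaint(&taint))  // false: the toleration's empty Value does not equal "some-value"
fmt.Println(tolExists.ToleratesTaint(&taint)) // true: Exists ignores the taint's value
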