Merge pull request #68 from razo7/set-unhealthy-node-e2e
Set Node Unhealthy in E2E by Stopping Kubelet
openshift-merge-robot authored Jul 31, 2023
2 parents 45e1c1f + 48bcf47 commit 830e87c
Showing 2 changed files with 79 additions and 31 deletions.
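At a high level, the change makes a worker node "unhealthy" by stopping kubelet on it and then waiting for the node's Ready condition to turn Unknown. Below is a minimal standalone sketch of that wait using plain client-go; the node name, polling interval, timeout, and kubeconfig loading are illustrative assumptions, and the actual e2e helpers (makeNodeUnready, waitForNodeHealthyCondition, StopKubelet) appear in the diff that follows.

package main

import (
	"context"
	"fmt"
	"time"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

// readyStatus extracts the status of the node's Ready condition.
func readyStatus(node *corev1.Node) corev1.ConditionStatus {
	for _, cond := range node.Status.Conditions {
		if cond.Type == corev1.NodeReady {
			return cond.Status
		}
	}
	return corev1.ConditionStatus("") // Ready condition not reported yet
}

// waitForReadyStatus polls the node until its Ready condition matches want.
func waitForReadyStatus(ctx context.Context, c kubernetes.Interface, nodeName string, want corev1.ConditionStatus) error {
	return wait.PollUntilContextTimeout(ctx, 10*time.Second, 10*time.Minute, true,
		func(ctx context.Context) (bool, error) {
			node, err := c.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
			if err != nil {
				return false, nil // tolerate transient API errors and keep polling
			}
			return readyStatus(node) == want, nil
		})
}

func main() {
	cfg, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	if err != nil {
		panic(err)
	}
	c := kubernetes.NewForConfigOrDie(cfg)

	nodeName := "worker-0" // hypothetical node name
	// Stopping kubelet itself is done by the e2e helper (a pod that runs
	// `nsenter ... systemctl stop kubelet` on the node); once kubelet stops
	// posting status, the node controller marks the Ready condition Unknown.
	if err := waitForReadyStatus(context.Background(), c, nodeName, corev1.ConditionUnknown); err != nil {
		panic(err)
	}
	fmt.Println("node", nodeName, "is unready (Ready=Unknown)")
}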
91 changes: 64 additions & 27 deletions test/e2e/far_e2e_test.go
@@ -46,8 +46,6 @@ const (
pollInterval = 10 * time.Second
)

var previousNodeName string

var _ = Describe("FAR E2e", func() {
var (
fenceAgent, nodeIdentifierPrefix string
@@ -81,54 +79,67 @@ var _ = Describe("FAR E2e", func() {

Context("stress cluster", func() {
var (
err error
testNodeName string
nodes, filteredNodes *corev1.NodeList
nodeName string
va *storagev1.VolumeAttachment
pod *corev1.Pod
creationTimePod, nodeBootTimeBefore time.Time
far *v1alpha1.FenceAgentsRemediation
err error
)
BeforeEach(func() {
nodes := &corev1.NodeList{}
nodes = &corev1.NodeList{}
selector := labels.NewSelector()
requirement, _ := labels.NewRequirement(medik8sLabels.WorkerRole, selection.Exists, []string{})
selector = selector.Add(*requirement)
Expect(k8sClient.List(context.Background(), nodes, &client.ListOptions{LabelSelector: selector})).ToNot(HaveOccurred())
if len(nodes.Items) < 1 {
Fail("No worker nodes found in the cluster")
}

testNodeName = randomizeWorkerNode(nodes)
previousNodeName = testNodeName
nodeNameParam := v1alpha1.NodeName(testNodeName)
if filteredNodes != nil {
nodes = filteredNodes
}
selectedNode := randomizeWorkerNode(nodes)
nodeName = selectedNode.Name
nodeNameParam := v1alpha1.NodeName(nodeName)
parameterName := v1alpha1.ParameterName(nodeIdentifierPrefix)
testNodeID := testNodeParam[parameterName][nodeNameParam]
log.Info("Testing Node", "Node name", testNodeName, "Node ID", testNodeID)
log.Info("Testing Node", "Node name", nodeName, "Node ID", testNodeID)

// filter out the remediated node from the list of nodes available for the next test
filteredNodes = &corev1.NodeList{}
for _, node := range nodes.Items {
if node.Name != nodeName {
filteredNodes.Items = append(filteredNodes.Items, node)
}
}

// save the node's boot time prior to the fence agent call
nodeBootTimeBefore, err = e2eUtils.GetBootTime(clientSet, testNodeName, testNsName, log)
nodeBootTimeBefore, err = e2eUtils.GetBootTime(clientSet, nodeName, testNsName, log)
Expect(err).ToNot(HaveOccurred(), "failed to get boot time of the node")

// create tested pod, and save its creation time
// it will be deleted by FAR CR
pod = e2eUtils.GetPod(testNodeName, testContainerName)
pod = e2eUtils.GetPod(nodeName, testContainerName)
pod.Name = testPodName
pod.Namespace = testNsName
Expect(k8sClient.Create(context.Background(), pod)).To(Succeed())
log.Info("Tested pod has been created", "pod", testPodName)
creationTimePod = metav1.Now().Time
va = createVA(testNodeName)
va = createVA(nodeName)
DeferCleanup(cleanupTestedResources, va, pod)

far = createFAR(testNodeName, fenceAgent, testShareParam, testNodeParam)
// set the node as "unhealthy" by disabling kubelet
makeNodeUnready(selectedNode)

far := createFAR(nodeName, fenceAgent, testShareParam, testNodeParam)
DeferCleanup(deleteFAR, far)
})
When("running FAR to reboot two nodes", func() {
It("should successfully remediate the first node", func() {
checkRemediation(testNodeName, nodeBootTimeBefore, creationTimePod, va, pod)
checkRemediation(nodeName, nodeBootTimeBefore, creationTimePod, va, pod)
})
It("should successfully remediate the second node", func() {
checkRemediation(testNodeName, nodeBootTimeBefore, creationTimePod, va, pod)
checkRemediation(nodeName, nodeBootTimeBefore, creationTimePod, va, pod)
})
})
})
@@ -221,17 +232,13 @@ func buildNodeParameters(clusterPlatformType configv1.PlatformType) (map[v1alpha
return testNodeParam, nil
}

// randomizeWorkerNode returns a worker node name which is different than the previous one
// randomizeWorkerNode returns a worker node whose name is different from the previous one
// (on the first call it will always return a new node)
func randomizeWorkerNode(nodes *corev1.NodeList) string {
nodeName := previousNodeName
for previousNodeName == nodeName {
// Generate a random seed based on the current time
r := rand.New(rand.NewSource(time.Now().UnixNano()))
// Randomly select a worker node
nodeName = nodes.Items[r.Intn(len(nodes.Items))].Name
}
return nodeName
func randomizeWorkerNode(nodes *corev1.NodeList) *corev1.Node {
// Generate a random seed based on the current time
r := rand.New(rand.NewSource(time.Now().UnixNano()))
// Randomly select a worker node
return &nodes.Items[r.Intn(len(nodes.Items))]
}

// createVA creates dummy volume attachment for testing the resource deletion
@@ -309,6 +316,36 @@ func wasFarTaintAdded(nodeName string) {
log.Info("FAR taint was added", "node name", node.Name, "taint key", farTaint.Key, "taint effect", farTaint.Effect)
}

// waitForNodeHealthyCondition waits until the node's Ready condition matches the given status, failing after the timeout
func waitForNodeHealthyCondition(node *corev1.Node, condStatus corev1.ConditionStatus) {
Eventually(func() corev1.ConditionStatus {
Expect(k8sClient.Get(context.Background(), client.ObjectKeyFromObject(node), node)).To(Succeed())
for _, cond := range node.Status.Conditions {
if cond.Type == corev1.NodeReady {
return cond.Status
}
}
return corev1.ConditionStatus("failure")
}, timeoutReboot, pollInterval).Should(Equal(condStatus))
}

// makeNodeUnready stops kubelet and waits for the node's Ready condition to become Unknown, unless the node is already unready
func makeNodeUnready(node *corev1.Node) {
log.Info("making node unready", "node name", node.GetName())
// check if node is unready already
Expect(k8sClient.Get(context.Background(), client.ObjectKeyFromObject(node), node)).To(Succeed())
for _, cond := range node.Status.Conditions {
if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionUnknown {
log.Info("node is already unready", "node name", node.GetName())
return
}
}

Expect(e2eUtils.StopKubelet(clientSet, node.Name, testNsName, log)).To(Succeed())
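// once kubelet stops posting status, the node lifecycle controller sets the
// Ready condition to Unknown after the node-monitor grace period expires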
waitForNodeHealthyCondition(node, corev1.ConditionUnknown)
log.Info("node is unready", "node name", node.GetName())
}

// checkFarLogs gets the FAR pod and checks whether its logs have logString
func checkFarLogs(logString string) {
EventuallyWithOffset(1, func() string {
19 changes: 15 additions & 4 deletions test/e2e/utils/command.go
@@ -28,10 +28,21 @@ const (
containerTestName = "test-command"
)

// GetBootTime gets the boot time of the given node by running a pod on it executing uptime command
// StopKubelet stops kubelet on the given node and returns an error only on unexpected failures (a "connection refused" error is expected once kubelet goes down and is ignored)
func StopKubelet(c *kubernetes.Clientset, nodeName string, testNsName string, log logr.Logger) error {
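// the command installs nsenter (util-linux) in the helper pod and enters the
// host's mount namespace via /proc/1/ns/mnt so systemctl can stop the node's
// kubelet unit; this assumes the helper pod can see the host's PID 1
// (e.g. it shares the host PID namespace)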
cmd := "microdnf install util-linux -y && /usr/bin/nsenter -m/proc/1/ns/mnt /bin/systemctl stop kubelet"
_, err := runCommandInCluster(c, nodeName, testNsName, cmd, log)
if err != nil && strings.Contains(err.Error(), "connection refused") {
log.Info("ignoring expected error when stopping kubelet", "error", err.Error())
return nil
}
return err
}

// GetBootTime returns the node's boot time, or an error if it could not be determined
func GetBootTime(c *kubernetes.Clientset, nodeName string, ns string, log logr.Logger) (time.Time, error) {
emptyTime := time.Time{}
output, err := RunCommandInCluster(c, nodeName, ns, "microdnf install procps -y >/dev/null 2>&1 && uptime -s", log)
output, err := runCommandInCluster(c, nodeName, ns, "microdnf install procps -y >/dev/null 2>&1 && uptime -s", log)
if err != nil {
return emptyTime, err
}
@@ -44,8 +55,8 @@ func GetBootTime(c *kubernetes.Clientset, nodeName string, ns string, log logr.L
return bootTime, nil
}

// RunCommandInCluster runs a command in a pod in the cluster and returns the output
func RunCommandInCluster(c *kubernetes.Clientset, nodeName string, ns string, command string, log logr.Logger) (string, error) {
// runCommandInCluster runs a command in a pod in the cluster and returns the output
func runCommandInCluster(c *kubernetes.Clientset, nodeName string, ns string, command string, log logr.Logger) (string, error) {

// create a pod and wait that it's running
pod := GetPod(nodeName, containerTestName)
