From a303f9e372fcbade4d3fb78e6ac02acd5e6d681f Mon Sep 17 00:00:00 2001 From: Akarsha-rai Date: Mon, 29 Apr 2024 16:27:59 +0530 Subject: [PATCH 1/5] To verify that the cluster remains accessible and NO DU/DL after failures Signed-off-by: Akarsha-rai --- ...no_data_loss_and_corruption_on_failures.py | 158 ++++++++++++++++++ 1 file changed, 158 insertions(+) create mode 100644 tests/functional/disaster-recovery/metro-dr/test_no_data_loss_and_corruption_on_failures.py diff --git a/tests/functional/disaster-recovery/metro-dr/test_no_data_loss_and_corruption_on_failures.py b/tests/functional/disaster-recovery/metro-dr/test_no_data_loss_and_corruption_on_failures.py new file mode 100644 index 00000000000..d6f5fdb2963 --- /dev/null +++ b/tests/functional/disaster-recovery/metro-dr/test_no_data_loss_and_corruption_on_failures.py @@ -0,0 +1,158 @@ +import logging +import pytest +import time + + +from ocs_ci.framework.pytest_customization.marks import tier4a +from ocs_ci.framework import config +from ocs_ci.ocs.dr.dr_workload import validate_data_integrity +from ocs_ci.ocs import constants +from ocs_ci.ocs.node import wait_for_nodes_status, get_node_objs +from ocs_ci.ocs.resources.pod import restart_pods_having_label +from ocs_ci.helpers.dr_helpers import ( + set_current_primary_cluster_context, + set_current_secondary_cluster_context, + get_current_primary_cluster_name, + get_active_acm_index, +) +from ocs_ci.framework.pytest_customization.marks import turquoise_squad +from ocs_ci.utility import vsphere +from ocs_ci.utility.utils import ceph_health_check + +logger = logging.getLogger(__name__) + + +@tier4a +@turquoise_squad +class TestNoDataLossAndDataCorruptionOnFailures: + """ + The Objective of this test cases is to make sure that the MDR cluster remains accessible + and NO DU/DL/DC is observed when following Failures are induced with supported applications are running + + 1) Noobaa pods failures - repeat at least 5-7 times + 2) Rolling reboot of the nodes in all zones one at a time + 3) RHCS nodes failures + a. 1 RHCS node in one zone + b. All the RHCS nodes in one zone + c. 
All the RHCS nodes in one zone - Repeated to mimic Santa lucia issue + + """ + + @pytest.mark.polarion_id("OCS-XXXX") + def test_no_data_loss_and_data_corruption_on_failures( + self, setup_acm_ui, nodes_multicluster, dr_workload + ): + + # Deploy Subscription based application + sub = dr_workload(num_of_subscription=1)[0] + self.namespace = sub.workload_namespace + self.workload_type = sub.workload_type + + # Deploy AppSet based application + appset = dr_workload(num_of_subscription=0, num_of_appset=1)[0] + + # Workloads list + workloads = [sub, appset] + + # Create application on Primary managed cluster + set_current_primary_cluster_context(self.namespace) + self.primary_cluster_name = get_current_primary_cluster_name( + namespace=self.namespace + ) + + # Validate data integrity + for wl in workloads: + config.switch_to_cluster_by_name(self.primary_cluster_name) + validate_data_integrity(wl.workload_namespace) + + # Noobaa pod restarts atleast 5 times and verify the data integrity + restart_pods_having_label(label=constants.NOOBAA_APP_LABEL) + for wl in workloads: + config.switch_to_cluster_by_name(self.primary_cluster_name) + validate_data_integrity(wl.workload_namespace) + + # Get the nodes from one active zone + config.switch_ctx(get_active_acm_index()) + active_hub_index = config.cur_index + zone = config.ENV_DATA.get("zone") + active_hub_cluster_node_objs = get_node_objs() + set_current_primary_cluster_context(self.namespace) + if config.ENV_DATA.get("zone") == zone: + managed_cluster_index = config.cur_index + managed_cluster_node_objs = get_node_objs() + else: + set_current_secondary_cluster_context(self.namespace) + managed_cluster_index = config.cur_index + managed_cluster_node_objs = get_node_objs() + external_cluster_node_roles = config.EXTERNAL_MODE.get( + "external_cluster_node_roles" + ) + ceph_node_ips = [] + for ceph_node in external_cluster_node_roles: + if ( + external_cluster_node_roles[ceph_node].get("location").get("datacenter") + != "zone-b" + ): + continue + else: + ceph_node_ips.append( + external_cluster_node_roles[ceph_node].get("ip_address") + ) + + # Rolling reboot of the nodes in all zones one at a time + wait_time = 120 + logger.info("Shutting down all the nodes from active hub zone") + nodes_multicluster[managed_cluster_index].restart_nodes( + managed_cluster_node_objs + ) + nodes_multicluster[active_hub_index].restart_nodes(active_hub_cluster_node_objs) + host = config.ENV_DATA["vsphere_server"] + user = config.ENV_DATA["vsphere_user"] + password = config.ENV_DATA["vsphere_password"] + vm_objs = vsphere.VSPHERE(host, user, password) + ceph_vms = [ + vm_objs.get_vm_by_ip(ip=each_ip, dc="None") for each_ip in ceph_node_ips + ] + vm_objs.restart_vms(vms=ceph_vms) + logger.info( + "All nodes from active hub zone are rebooted/restarted." 
+ f"Wait for {wait_time} for the nodes up" + ) + time.sleep(wait_time) + wait_for_nodes_status([node.name for node in managed_cluster_node_objs]) + wait_for_nodes_status([node.name for node in active_hub_cluster_node_objs]) + # Validate ceph health OK + ceph_health_check(tries=40, delay=30) + + # Again verify the data integrity of application + for wl in workloads: + config.switch_to_cluster_by_name(self.primary_cluster_name) + validate_data_integrity(wl.workload_namespace) + + # RHCS nodes failures + # 1 RHCS node in one zone + vm_objs.restart_vms(vms=ceph_vms[0]) + time.sleep(wait_time) + # Validate ceph health OK + ceph_health_check(tries=40, delay=30) + + # All the RHCS nodes in one zone + vm_objs.restart_vms(vms=ceph_vms) + time.sleep(wait_time) + # Validate ceph health OK + ceph_health_check(tries=40, delay=30) + + # All the RHCS nodes in one zone - Repeated to mimic Santa lucia issue + for i in range(10): + vm_objs.restart_vms(vms=ceph_vms) + logger.info( + f"Wait {wait_time} before another restart of ceph nodes from zones" + ) + time.sleep(wait_time) + # Validate ceph health OK + ceph_health_check(tries=120, delay=30) + + # Again verify the data integrity of application + for wl in workloads: + config.switch_to_cluster_by_name(self.primary_cluster_name) + validate_data_integrity(wl.workload_namespace) From f78801774ea18b40502ef441fe00a7174058cd8e Mon Sep 17 00:00:00 2001 From: Akarsha-rai Date: Tue, 16 Jul 2024 12:26:19 +0530 Subject: [PATCH 2/5] To verify that the cluster remains accessible and NO DU/DL after failures Signed-off-by: Akarsha-rai --- ...no_data_loss_and_corruption_on_failures.py | 21 +++++++------------ 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/tests/functional/disaster-recovery/metro-dr/test_no_data_loss_and_corruption_on_failures.py b/tests/functional/disaster-recovery/metro-dr/test_no_data_loss_and_corruption_on_failures.py index d6f5fdb2963..f1c8e3c0587 100644 --- a/tests/functional/disaster-recovery/metro-dr/test_no_data_loss_and_corruption_on_failures.py +++ b/tests/functional/disaster-recovery/metro-dr/test_no_data_loss_and_corruption_on_failures.py @@ -38,21 +38,14 @@ class TestNoDataLossAndDataCorruptionOnFailures: """ - @pytest.mark.polarion_id("OCS-XXXX") + @pytest.mark.polarion_id("OCS-4793") def test_no_data_loss_and_data_corruption_on_failures( - self, setup_acm_ui, nodes_multicluster, dr_workload + self, nodes_multicluster, dr_workload ): # Deploy Subscription based application - sub = dr_workload(num_of_subscription=1)[0] - self.namespace = sub.workload_namespace - self.workload_type = sub.workload_type - - # Deploy AppSet based application - appset = dr_workload(num_of_subscription=0, num_of_appset=1)[0] - - # Workloads list - workloads = [sub, appset] + workloads = dr_workload(num_of_subscription=1, num_of_appset=1) + self.namespace = workloads[0].workload_namespace # Create application on Primary managed cluster set_current_primary_cluster_context(self.namespace) @@ -66,12 +59,13 @@ def test_no_data_loss_and_data_corruption_on_failures( validate_data_integrity(wl.workload_namespace) # Noobaa pod restarts atleast 5 times and verify the data integrity - restart_pods_having_label(label=constants.NOOBAA_APP_LABEL) + for i in range(5): + restart_pods_having_label(label=constants.NOOBAA_APP_LABEL) for wl in workloads: config.switch_to_cluster_by_name(self.primary_cluster_name) validate_data_integrity(wl.workload_namespace) - # Get the nodes from one active zone + # Get the nodes from one active zone and reboot of the nodes in all 
zones config.switch_ctx(get_active_acm_index()) active_hub_index = config.cur_index zone = config.ENV_DATA.get("zone") @@ -98,7 +92,6 @@ def test_no_data_loss_and_data_corruption_on_failures( ceph_node_ips.append( external_cluster_node_roles[ceph_node].get("ip_address") ) - # Rolling reboot of the nodes in all zones one at a time wait_time = 120 logger.info("Shutting down all the nodes from active hub zone") From 8f0c41fdcb2a575acc48886e0a1c98221effb6ad Mon Sep 17 00:00:00 2001 From: Akarsha-rai Date: Tue, 16 Jul 2024 13:10:41 +0530 Subject: [PATCH 3/5] To verify that the cluster remains accessible and NO DU/DL after failures Signed-off-by: Akarsha-rai --- ...t_no_data_loss_and_corruption_on_failures.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/tests/functional/disaster-recovery/metro-dr/test_no_data_loss_and_corruption_on_failures.py b/tests/functional/disaster-recovery/metro-dr/test_no_data_loss_and_corruption_on_failures.py index f1c8e3c0587..d46f8722e61 100644 --- a/tests/functional/disaster-recovery/metro-dr/test_no_data_loss_and_corruption_on_failures.py +++ b/tests/functional/disaster-recovery/metro-dr/test_no_data_loss_and_corruption_on_failures.py @@ -61,6 +61,9 @@ def test_no_data_loss_and_data_corruption_on_failures( # Noobaa pod restarts atleast 5 times and verify the data integrity for i in range(5): restart_pods_having_label(label=constants.NOOBAA_APP_LABEL) + logger.info( + "Verify the data integrity of application after repeated failures of Noobaa pods" + ) for wl in workloads: config.switch_to_cluster_by_name(self.primary_cluster_name) validate_data_integrity(wl.workload_namespace) @@ -93,12 +96,14 @@ def test_no_data_loss_and_data_corruption_on_failures( external_cluster_node_roles[ceph_node].get("ip_address") ) # Rolling reboot of the nodes in all zones one at a time - wait_time = 120 + wait_time = 300 logger.info("Shutting down all the nodes from active hub zone") - nodes_multicluster[managed_cluster_index].restart_nodes( + nodes_multicluster[managed_cluster_index].restart_nodes_by_stop_and_start( managed_cluster_node_objs ) - nodes_multicluster[active_hub_index].restart_nodes(active_hub_cluster_node_objs) + nodes_multicluster[active_hub_index].restart_nodes_by_stop_and_start( + active_hub_cluster_node_objs + ) host = config.ENV_DATA["vsphere_server"] user = config.ENV_DATA["vsphere_user"] password = config.ENV_DATA["vsphere_password"] @@ -118,6 +123,9 @@ def test_no_data_loss_and_data_corruption_on_failures( ceph_health_check(tries=40, delay=30) # Again verify the data integrity of application + logger.info( + "Verify the data integrity of application after all nodes from active hub zone are rebooted" + ) for wl in workloads: config.switch_to_cluster_by_name(self.primary_cluster_name) validate_data_integrity(wl.workload_namespace) @@ -146,6 +154,9 @@ def test_no_data_loss_and_data_corruption_on_failures( ceph_health_check(tries=120, delay=30) # Again verify the data integrity of application + logger.info( + "Verify the data integrity of application after repeated restart of ceph nodes from zones" + ) for wl in workloads: config.switch_to_cluster_by_name(self.primary_cluster_name) validate_data_integrity(wl.workload_namespace) From 20b460aae810593d115661ee74da764351bb09d0 Mon Sep 17 00:00:00 2001 From: Akarsha-rai Date: Tue, 16 Jul 2024 19:23:11 +0530 Subject: [PATCH 4/5] To verify that the cluster remains accessible and NO DU/DL after failures Signed-off-by: Akarsha-rai --- 
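Note on this change (illustrative commentary, not part of the commit): the single-RHCS-node scenario previously passed a bare VM object, vm_objs.restart_vms(vms=ceph_vms[0]), while every other call in the test passes a list; wrapping it as [ceph_vms[0]] keeps the vSphere helper call consistent with the list-based usage the fix implies. As a rough sketch only, a caller could guard against that class of mistake with a small wrapper like the one below — restart_ceph_vms and its wait_time default are hypothetical names introduced here, and vm_objs is assumed to be the ocs_ci.utility.vsphere.VSPHERE instance the test already creates:

    import time

    def restart_ceph_vms(vm_objs, vms, wait_time=300):
        """Restart one or more Ceph VMs, then pause before health checks."""
        if not isinstance(vms, (list, tuple)):
            # Accept a bare VM object as well as a list, mirroring the
            # [ceph_vms[0]] wrapping applied in this patch.
            vms = [vms]
        vm_objs.restart_vms(vms=vms)
        time.sleep(wait_time)
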
.../metro-dr/test_no_data_loss_and_corruption_on_failures.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/functional/disaster-recovery/metro-dr/test_no_data_loss_and_corruption_on_failures.py b/tests/functional/disaster-recovery/metro-dr/test_no_data_loss_and_corruption_on_failures.py index d46f8722e61..78975fad0f4 100644 --- a/tests/functional/disaster-recovery/metro-dr/test_no_data_loss_and_corruption_on_failures.py +++ b/tests/functional/disaster-recovery/metro-dr/test_no_data_loss_and_corruption_on_failures.py @@ -132,7 +132,7 @@ def test_no_data_loss_and_data_corruption_on_failures( # RHCS nodes failures # 1 RHCS node in one zone - vm_objs.restart_vms(vms=ceph_vms[0]) + vm_objs.restart_vms(vms=[ceph_vms[0]]) time.sleep(wait_time) # Validate ceph health OK ceph_health_check(tries=40, delay=30) From 2e5c6aacf6cf803ab71bc97b198b3733b0b980dd Mon Sep 17 00:00:00 2001 From: Akarsha-rai Date: Wed, 17 Jul 2024 12:42:18 +0530 Subject: [PATCH 5/5] To verify that the cluster remains accessible and NO DU/DL after failures Signed-off-by: Akarsha-rai --- .../test_no_data_loss_and_corruption_on_failures.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/functional/disaster-recovery/metro-dr/test_no_data_loss_and_corruption_on_failures.py b/tests/functional/disaster-recovery/metro-dr/test_no_data_loss_and_corruption_on_failures.py index 78975fad0f4..4ea9adeec53 100644 --- a/tests/functional/disaster-recovery/metro-dr/test_no_data_loss_and_corruption_on_failures.py +++ b/tests/functional/disaster-recovery/metro-dr/test_no_data_loss_and_corruption_on_failures.py @@ -3,7 +3,11 @@ import time -from ocs_ci.framework.pytest_customization.marks import tier4a +from ocs_ci.framework.pytest_customization.marks import ( + tier4a, + turquoise_squad, + vsphere_platform_required, +) from ocs_ci.framework import config from ocs_ci.ocs.dr.dr_workload import validate_data_integrity from ocs_ci.ocs import constants @@ -15,7 +19,6 @@ get_current_primary_cluster_name, get_active_acm_index, ) -from ocs_ci.framework.pytest_customization.marks import turquoise_squad from ocs_ci.utility import vsphere from ocs_ci.utility.utils import ceph_health_check @@ -24,6 +27,7 @@ @tier4a @turquoise_squad +@vsphere_platform_required class TestNoDataLossAndDataCorruptionOnFailures: """ The Objective of this test cases is to make sure that the MDR cluster remains accessible
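
Taken together, the series converges on one verification pattern that the test repeats after every induced failure: restart something, wait, confirm Ceph settles back to HEALTH_OK, then re-check data integrity of both DR workloads on the primary managed cluster. The sketch below is a standalone illustration of that pattern, not code from these patches — verify_after_failure and induce_failure are hypothetical names, while the imports, ceph_health_check arguments, and per-workload integrity check mirror what the test already uses:

    import time

    from ocs_ci.framework import config
    from ocs_ci.ocs.dr.dr_workload import validate_data_integrity
    from ocs_ci.utility.utils import ceph_health_check

    def verify_after_failure(induce_failure, workloads, primary_cluster_name, wait_time=300):
        """Induce one failure scenario, then confirm Ceph health and data integrity."""
        induce_failure()  # e.g. a Noobaa pod restart, node reboot, or RHCS VM restart
        time.sleep(wait_time)
        # Ceph should return to HEALTH_OK before any data checks are attempted.
        ceph_health_check(tries=40, delay=30)
        # Data written by each DR workload must still be intact on the primary cluster.
        for wl in workloads:
            config.switch_to_cluster_by_name(primary_cluster_name)
            validate_data_integrity(wl.workload_namespace)

Under that framing, the repeated-restart scenario that mimics the Santa lucia issue is simply ten invocations of this helper with the Ceph VM restart supplied as the injected failure.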