From a303f9e372fcbade4d3fb78e6ac02acd5e6d681f Mon Sep 17 00:00:00 2001 From: Akarsha-rai Date: Mon, 29 Apr 2024 16:27:59 +0530 Subject: [PATCH 1/5] To verify that the cluster remains accessible and NO DU/DL after failures Signed-off-by: Akarsha-rai --- ...no_data_loss_and_corruption_on_failures.py | 158 ++++++++++++++++++ 1 file changed, 158 insertions(+) create mode 100644 tests/functional/disaster-recovery/metro-dr/test_no_data_loss_and_corruption_on_failures.py diff --git a/tests/functional/disaster-recovery/metro-dr/test_no_data_loss_and_corruption_on_failures.py b/tests/functional/disaster-recovery/metro-dr/test_no_data_loss_and_corruption_on_failures.py new file mode 100644 index 00000000000..d6f5fdb2963 --- /dev/null +++ b/tests/functional/disaster-recovery/metro-dr/test_no_data_loss_and_corruption_on_failures.py @@ -0,0 +1,158 @@ +import logging +import pytest +import time + + +from ocs_ci.framework.pytest_customization.marks import tier4a +from ocs_ci.framework import config +from ocs_ci.ocs.dr.dr_workload import validate_data_integrity +from ocs_ci.ocs import constants +from ocs_ci.ocs.node import wait_for_nodes_status, get_node_objs +from ocs_ci.ocs.resources.pod import restart_pods_having_label +from ocs_ci.helpers.dr_helpers import ( + set_current_primary_cluster_context, + set_current_secondary_cluster_context, + get_current_primary_cluster_name, + get_active_acm_index, +) +from ocs_ci.framework.pytest_customization.marks import turquoise_squad +from ocs_ci.utility import vsphere +from ocs_ci.utility.utils import ceph_health_check + +logger = logging.getLogger(__name__) + + +@tier4a +@turquoise_squad +class TestNoDataLossAndDataCorruptionOnFailures: + """ + The Objective of this test cases is to make sure that the MDR cluster remains accessible + and NO DU/DL/DC is observed when following Failures are induced with supported applications are running + + 1) Noobaa pods failures - repeat at least 5-7 times + 2) Rolling reboot of the nodes in all zones one at a time + 3) RHCS nodes failures + a. 1 RHCS node in one zone + b. All the RHCS nodes in one zone + c. 
All the RHCS nodes in one zone - Repeated to mimic Santa lucia issue + + """ + + @pytest.mark.polarion_id("OCS-XXXX") + def test_no_data_loss_and_data_corruption_on_failures( + self, setup_acm_ui, nodes_multicluster, dr_workload + ): + + # Deploy Subscription based application + sub = dr_workload(num_of_subscription=1)[0] + self.namespace = sub.workload_namespace + self.workload_type = sub.workload_type + + # Deploy AppSet based application + appset = dr_workload(num_of_subscription=0, num_of_appset=1)[0] + + # Workloads list + workloads = [sub, appset] + + # Create application on Primary managed cluster + set_current_primary_cluster_context(self.namespace) + self.primary_cluster_name = get_current_primary_cluster_name( + namespace=self.namespace + ) + + # Validate data integrity + for wl in workloads: + config.switch_to_cluster_by_name(self.primary_cluster_name) + validate_data_integrity(wl.workload_namespace) + + # Noobaa pod restarts atleast 5 times and verify the data integrity + restart_pods_having_label(label=constants.NOOBAA_APP_LABEL) + for wl in workloads: + config.switch_to_cluster_by_name(self.primary_cluster_name) + validate_data_integrity(wl.workload_namespace) + + # Get the nodes from one active zone + config.switch_ctx(get_active_acm_index()) + active_hub_index = config.cur_index + zone = config.ENV_DATA.get("zone") + active_hub_cluster_node_objs = get_node_objs() + set_current_primary_cluster_context(self.namespace) + if config.ENV_DATA.get("zone") == zone: + managed_cluster_index = config.cur_index + managed_cluster_node_objs = get_node_objs() + else: + set_current_secondary_cluster_context(self.namespace) + managed_cluster_index = config.cur_index + managed_cluster_node_objs = get_node_objs() + external_cluster_node_roles = config.EXTERNAL_MODE.get( + "external_cluster_node_roles" + ) + ceph_node_ips = [] + for ceph_node in external_cluster_node_roles: + if ( + external_cluster_node_roles[ceph_node].get("location").get("datacenter") + != "zone-b" + ): + continue + else: + ceph_node_ips.append( + external_cluster_node_roles[ceph_node].get("ip_address") + ) + + # Rolling reboot of the nodes in all zones one at a time + wait_time = 120 + logger.info("Shutting down all the nodes from active hub zone") + nodes_multicluster[managed_cluster_index].restart_nodes( + managed_cluster_node_objs + ) + nodes_multicluster[active_hub_index].restart_nodes(active_hub_cluster_node_objs) + host = config.ENV_DATA["vsphere_server"] + user = config.ENV_DATA["vsphere_user"] + password = config.ENV_DATA["vsphere_password"] + vm_objs = vsphere.VSPHERE(host, user, password) + ceph_vms = [ + vm_objs.get_vm_by_ip(ip=each_ip, dc="None") for each_ip in ceph_node_ips + ] + vm_objs.restart_vms(vms=ceph_vms) + logger.info( + "All nodes from active hub zone are rebooted/restarted." 
+ f"Wait for {wait_time} for the nodes up" + ) + time.sleep(wait_time) + wait_for_nodes_status([node.name for node in managed_cluster_node_objs]) + wait_for_nodes_status([node.name for node in active_hub_cluster_node_objs]) + # Validate ceph health OK + ceph_health_check(tries=40, delay=30) + + # Again verify the data integrity of application + for wl in workloads: + config.switch_to_cluster_by_name(self.primary_cluster_name) + validate_data_integrity(wl.workload_namespace) + + # RHCS nodes failures + # 1 RHCS node in one zone + vm_objs.restart_vms(vms=ceph_vms[0]) + time.sleep(wait_time) + # Validate ceph health OK + ceph_health_check(tries=40, delay=30) + + # All the RHCS nodes in one zone + vm_objs.restart_vms(vms=ceph_vms) + time.sleep(wait_time) + # Validate ceph health OK + ceph_health_check(tries=40, delay=30) + + # All the RHCS nodes in one zone - Repeated to mimic Santa lucia issue + for i in range(10): + vm_objs.restart_vms(vms=ceph_vms) + logger.info( + f"Wait {wait_time} before another restart of ceph nodes from zones" + ) + time.sleep(wait_time) + # Validate ceph health OK + ceph_health_check(tries=120, delay=30) + + # Again verify the data integrity of application + for wl in workloads: + config.switch_to_cluster_by_name(self.primary_cluster_name) + validate_data_integrity(wl.workload_namespace) From f78801774ea18b40502ef441fe00a7174058cd8e Mon Sep 17 00:00:00 2001 From: Akarsha-rai Date: Tue, 16 Jul 2024 12:26:19 +0530 Subject: [PATCH 2/5] To verify that the cluster remains accessible and NO DU/DL after failures Signed-off-by: Akarsha-rai --- ...no_data_loss_and_corruption_on_failures.py | 21 +++++++------------ 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/tests/functional/disaster-recovery/metro-dr/test_no_data_loss_and_corruption_on_failures.py b/tests/functional/disaster-recovery/metro-dr/test_no_data_loss_and_corruption_on_failures.py index d6f5fdb2963..f1c8e3c0587 100644 --- a/tests/functional/disaster-recovery/metro-dr/test_no_data_loss_and_corruption_on_failures.py +++ b/tests/functional/disaster-recovery/metro-dr/test_no_data_loss_and_corruption_on_failures.py @@ -38,21 +38,14 @@ class TestNoDataLossAndDataCorruptionOnFailures: """ - @pytest.mark.polarion_id("OCS-XXXX") + @pytest.mark.polarion_id("OCS-4793") def test_no_data_loss_and_data_corruption_on_failures( - self, setup_acm_ui, nodes_multicluster, dr_workload + self, nodes_multicluster, dr_workload ): # Deploy Subscription based application - sub = dr_workload(num_of_subscription=1)[0] - self.namespace = sub.workload_namespace - self.workload_type = sub.workload_type - - # Deploy AppSet based application - appset = dr_workload(num_of_subscription=0, num_of_appset=1)[0] - - # Workloads list - workloads = [sub, appset] + workloads = dr_workload(num_of_subscription=1, num_of_appset=1) + self.namespace = workloads[0].workload_namespace # Create application on Primary managed cluster set_current_primary_cluster_context(self.namespace) @@ -66,12 +59,13 @@ def test_no_data_loss_and_data_corruption_on_failures( validate_data_integrity(wl.workload_namespace) # Noobaa pod restarts atleast 5 times and verify the data integrity - restart_pods_having_label(label=constants.NOOBAA_APP_LABEL) + for i in range(5): + restart_pods_having_label(label=constants.NOOBAA_APP_LABEL) for wl in workloads: config.switch_to_cluster_by_name(self.primary_cluster_name) validate_data_integrity(wl.workload_namespace) - # Get the nodes from one active zone + # Get the nodes from one active zone and reboot of the nodes in all 
zones config.switch_ctx(get_active_acm_index()) active_hub_index = config.cur_index zone = config.ENV_DATA.get("zone") @@ -98,7 +92,6 @@ def test_no_data_loss_and_data_corruption_on_failures( ceph_node_ips.append( external_cluster_node_roles[ceph_node].get("ip_address") ) - # Rolling reboot of the nodes in all zones one at a time wait_time = 120 logger.info("Shutting down all the nodes from active hub zone") From 8f0c41fdcb2a575acc48886e0a1c98221effb6ad Mon Sep 17 00:00:00 2001 From: Akarsha-rai Date: Tue, 16 Jul 2024 13:10:41 +0530 Subject: [PATCH 3/5] To verify that the cluster remains accessible and NO DU/DL after failures Signed-off-by: Akarsha-rai --- ...t_no_data_loss_and_corruption_on_failures.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/tests/functional/disaster-recovery/metro-dr/test_no_data_loss_and_corruption_on_failures.py b/tests/functional/disaster-recovery/metro-dr/test_no_data_loss_and_corruption_on_failures.py index f1c8e3c0587..d46f8722e61 100644 --- a/tests/functional/disaster-recovery/metro-dr/test_no_data_loss_and_corruption_on_failures.py +++ b/tests/functional/disaster-recovery/metro-dr/test_no_data_loss_and_corruption_on_failures.py @@ -61,6 +61,9 @@ def test_no_data_loss_and_data_corruption_on_failures( # Noobaa pod restarts atleast 5 times and verify the data integrity for i in range(5): restart_pods_having_label(label=constants.NOOBAA_APP_LABEL) + logger.info( + "Verify the data integrity of application after repeated failures of Noobaa pods" + ) for wl in workloads: config.switch_to_cluster_by_name(self.primary_cluster_name) validate_data_integrity(wl.workload_namespace) @@ -93,12 +96,14 @@ def test_no_data_loss_and_data_corruption_on_failures( external_cluster_node_roles[ceph_node].get("ip_address") ) # Rolling reboot of the nodes in all zones one at a time - wait_time = 120 + wait_time = 300 logger.info("Shutting down all the nodes from active hub zone") - nodes_multicluster[managed_cluster_index].restart_nodes( + nodes_multicluster[managed_cluster_index].restart_nodes_by_stop_and_start( managed_cluster_node_objs ) - nodes_multicluster[active_hub_index].restart_nodes(active_hub_cluster_node_objs) + nodes_multicluster[active_hub_index].restart_nodes_by_stop_and_start( + active_hub_cluster_node_objs + ) host = config.ENV_DATA["vsphere_server"] user = config.ENV_DATA["vsphere_user"] password = config.ENV_DATA["vsphere_password"] @@ -118,6 +123,9 @@ def test_no_data_loss_and_data_corruption_on_failures( ceph_health_check(tries=40, delay=30) # Again verify the data integrity of application + logger.info( + "Verify the data integrity of application after all nodes from active hub zone are rebooted" + ) for wl in workloads: config.switch_to_cluster_by_name(self.primary_cluster_name) validate_data_integrity(wl.workload_namespace) @@ -146,6 +154,9 @@ def test_no_data_loss_and_data_corruption_on_failures( ceph_health_check(tries=120, delay=30) # Again verify the data integrity of application + logger.info( + "Verify the data integrity of application after repeated restart of ceph nodes from zones" + ) for wl in workloads: config.switch_to_cluster_by_name(self.primary_cluster_name) validate_data_integrity(wl.workload_namespace) From 20b460aae810593d115661ee74da764351bb09d0 Mon Sep 17 00:00:00 2001 From: Akarsha-rai Date: Tue, 16 Jul 2024 19:23:11 +0530 Subject: [PATCH 4/5] To verify that the cluster remains accessible and NO DU/DL after failures Signed-off-by: Akarsha-rai --- 
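Note on this change (illustrative commentary, not part of the commit): the single-RHCS-node scenario previously passed a bare VM object, vm_objs.restart_vms(vms=ceph_vms[0]), while every other call in the test passes a list; wrapping it as [ceph_vms[0]] keeps the vSphere helper call consistent with the list-based usage the fix implies. As a rough sketch only, a caller could guard against that class of mistake with a small wrapper like the one below — restart_ceph_vms and its wait_time default are hypothetical names introduced here, and vm_objs is assumed to be the ocs_ci.utility.vsphere.VSPHERE instance the test already creates:

    import time

    def restart_ceph_vms(vm_objs, vms, wait_time=300):
        """Restart one or more Ceph VMs, then pause before health checks."""
        if not isinstance(vms, (list, tuple)):
            # Accept a bare VM object as well as a list, mirroring the
            # [ceph_vms[0]] wrapping applied in this patch.
            vms = [vms]
        vm_objs.restart_vms(vms=vms)
        time.sleep(wait_time)
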
.../metro-dr/test_no_data_loss_and_corruption_on_failures.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/functional/disaster-recovery/metro-dr/test_no_data_loss_and_corruption_on_failures.py b/tests/functional/disaster-recovery/metro-dr/test_no_data_loss_and_corruption_on_failures.py index d46f8722e61..78975fad0f4 100644 --- a/tests/functional/disaster-recovery/metro-dr/test_no_data_loss_and_corruption_on_failures.py +++ b/tests/functional/disaster-recovery/metro-dr/test_no_data_loss_and_corruption_on_failures.py @@ -132,7 +132,7 @@ def test_no_data_loss_and_data_corruption_on_failures( # RHCS nodes failures # 1 RHCS node in one zone - vm_objs.restart_vms(vms=ceph_vms[0]) + vm_objs.restart_vms(vms=[ceph_vms[0]]) time.sleep(wait_time) # Validate ceph health OK ceph_health_check(tries=40, delay=30) From 2e5c6aacf6cf803ab71bc97b198b3733b0b980dd Mon Sep 17 00:00:00 2001 From: Akarsha-rai Date: Wed, 17 Jul 2024 12:42:18 +0530 Subject: [PATCH 5/5] To verify that the cluster remains accessible and NO DU/DL after failures Signed-off-by: Akarsha-rai --- .../test_no_data_loss_and_corruption_on_failures.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/functional/disaster-recovery/metro-dr/test_no_data_loss_and_corruption_on_failures.py b/tests/functional/disaster-recovery/metro-dr/test_no_data_loss_and_corruption_on_failures.py index 78975fad0f4..4ea9adeec53 100644 --- a/tests/functional/disaster-recovery/metro-dr/test_no_data_loss_and_corruption_on_failures.py +++ b/tests/functional/disaster-recovery/metro-dr/test_no_data_loss_and_corruption_on_failures.py @@ -3,7 +3,11 @@ import time -from ocs_ci.framework.pytest_customization.marks import tier4a +from ocs_ci.framework.pytest_customization.marks import ( + tier4a, + turquoise_squad, + vsphere_platform_required, +) from ocs_ci.framework import config from ocs_ci.ocs.dr.dr_workload import validate_data_integrity from ocs_ci.ocs import constants @@ -15,7 +19,6 @@ get_current_primary_cluster_name, get_active_acm_index, ) -from ocs_ci.framework.pytest_customization.marks import turquoise_squad from ocs_ci.utility import vsphere from ocs_ci.utility.utils import ceph_health_check @@ -24,6 +27,7 @@ @tier4a @turquoise_squad +@vsphere_platform_required class TestNoDataLossAndDataCorruptionOnFailures: """ The Objective of this test cases is to make sure that the MDR cluster remains accessible
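
Taken together, the series converges on one verification pattern that the test repeats after every induced failure: restart something, wait, confirm Ceph settles back to HEALTH_OK, then re-check data integrity of both DR workloads on the primary managed cluster. The sketch below is a standalone illustration of that pattern, not code from these patches — verify_after_failure and induce_failure are hypothetical names, while the imports, ceph_health_check arguments, and per-workload integrity check mirror what the test already uses:

    import time

    from ocs_ci.framework import config
    from ocs_ci.ocs.dr.dr_workload import validate_data_integrity
    from ocs_ci.utility.utils import ceph_health_check

    def verify_after_failure(induce_failure, workloads, primary_cluster_name, wait_time=300):
        """Induce one failure scenario, then confirm Ceph health and data integrity."""
        induce_failure()  # e.g. a Noobaa pod restart, node reboot, or RHCS VM restart
        time.sleep(wait_time)
        # Ceph should return to HEALTH_OK before any data checks are attempted.
        ceph_health_check(tries=40, delay=30)
        # Data written by each DR workload must still be intact on the primary cluster.
        for wl in workloads:
            config.switch_to_cluster_by_name(primary_cluster_name)
            validate_data_integrity(wl.workload_namespace)

Under that framing, the repeated-restart scenario that mimics the Santa lucia issue is simply ten invocations of this helper with the Ceph VM restart supplied as the injected failure.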