Perform mon and OSD failure tests in a stretch cluster
Signed-off-by: Mahesh Shetty <[email protected]>
mashetty330 committed Apr 15, 2024
1 parent 78d9dc5 commit 0eeaf1b
Showing 3 changed files with 335 additions and 0 deletions.
40 changes: 40 additions & 0 deletions ocs_ci/ocs/resources/stretchcluster.py
@@ -23,6 +23,7 @@
get_mon_pods,
get_mon_pod_id,
get_pod_node,
get_osd_pods,
)

logger = logging.getLogger(__name__)
@@ -611,3 +612,42 @@ def post_failure_checks(
failure_check_map[type](
start_time, end_time, wait_for_read_completion=wait_for_read_completion
)

def get_mon_pods_in_a_zone(self, zone):
"""
Fetches mon pods in a particular zone
Args:
zone (str): Zone
Returns:
List: mon pods in a zone
"""
nodes_in_zone = [node.name for node in self.get_nodes_in_zone(zone)]
mon_pods = get_mon_pods()

mon_pods_in_zone = [
pod for pod in mon_pods if get_pod_node(pod).name in nodes_in_zone
]
return mon_pods_in_zone

def get_osd_pods_in_a_zone(self, zone):
"""
Fetches OSD pods in a particular zone
Args:
zone (str): Zone
Returns:
List: OSD pods in a zone
"""

nodes_in_zone = [node.name for node in self.get_nodes_in_zone(zone)]
osd_pods = get_osd_pods()

osd_pods_in_zone = [
pod for pod in osd_pods if get_pod_node(pod).name in nodes_in_zone
]
return osd_pods_in_zone
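
For reference, a minimal usage sketch of the two helpers added above. The zone label "data-1" is an assumption borrowed from the tests later in this commit, and error handling is omitted:

# Hypothetical usage of the new zone-scoped helpers; not part of this commit.
from ocs_ci.ocs.resources.stretchcluster import StretchCluster

sc_obj = StretchCluster()

# Collect the mon and OSD pods scheduled on nodes in the "data-1" zone
mon_pods_in_zone = sc_obj.get_mon_pods_in_a_zone("data-1")
osd_pods_in_zone = sc_obj.get_osd_pods_in_a_zone("data-1")

print([pod.name for pod in mon_pods_in_zone])
print([pod.name for pod in osd_pods_in_zone])
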
83 changes: 83 additions & 0 deletions tests/conftest.py
@@ -6948,6 +6948,15 @@ def finalizer():

@pytest.fixture()
def logwriter_cephfs_many_pvc_factory(request, pvc_factory):
return logwriter_cephfs_many_pvc(request, pvc_factory)


@pytest.fixture(scope="class")
def logwriter_cephfs_many_pvc_class(request, pvc_factory_class):
return logwriter_cephfs_many_pvc(request, pvc_factory_class)


def logwriter_cephfs_many_pvc(request, pvc_factory):
"""
Fixture to create an RWX CephFS volume
@@ -6973,8 +6982,17 @@ def setup_stretch_cluster_project(request, project_factory_session):
return project_factory_session(constants.STRETCH_CLUSTER_NAMESPACE)


@pytest.fixture(scope="class")
def logwriter_workload_class(request, teardown_factory_class):
return setup_logwriter_workload(request, teardown_factory_class)


@pytest.fixture()
def logwriter_workload_factory(request, teardown_factory):
return setup_logwriter_workload(request, teardown_factory)


def setup_logwriter_workload(request, teardown_factory):
"""
Fixture to create logwriter deployment
@@ -7022,8 +7040,17 @@ def factory(pvc, logwriter_path):
return factory


@pytest.fixture(scope="class")
def logreader_workload_class(request, teardown_factory_class):
return setup_logreader_workload(request, teardown_factory_class)


@pytest.fixture()
def logreader_workload_factory(request, teardown_factory):
return setup_logreader_workload(request, teardown_factory)


def setup_logreader_workload(request, teardown_factory):
def factory(pvc, logreader_path, duration=30):
"""
Args:
@@ -7070,6 +7097,26 @@ def factory(pvc, logreader_path, duration=30):
return factory


@pytest.fixture(scope="class")
def setup_logwriter_cephfs_workload_class(
request,
setup_stretch_cluster_project,
pvc_factory_class,
logwriter_cephfs_many_pvc_class,
logwriter_workload_class,
logreader_workload_class,
):

return setup_logwriter_cephfs_workload(
request,
setup_stretch_cluster_project,
pvc_factory_class,
logwriter_cephfs_many_pvc_class,
logwriter_workload_class,
logreader_workload_class,
)


@pytest.fixture()
def setup_logwriter_cephfs_workload_factory(
request,
@@ -7079,6 +7126,25 @@ def setup_logwriter_cephfs_workload_factory(
logwriter_workload_factory,
logreader_workload_factory,
):

return setup_logwriter_cephfs_workload(
request,
setup_stretch_cluster_project,
pvc_factory,
logwriter_cephfs_many_pvc_factory,
logwriter_workload_factory,
logreader_workload_factory,
)


def setup_logwriter_cephfs_workload(
request,
setup_stretch_cluster_project,
pvc_factory,
logwriter_cephfs_many_pvc_factory,
logwriter_workload_factory,
logreader_workload_factory,
):
"""
This fixture will create the RWX CephFS volume and call the logwriter and logreader
fixtures to complete the setup
@@ -7110,9 +7176,26 @@ def factory(read_duration=30):
return factory


@pytest.fixture(scope="class")
def setup_logwriter_rbd_workload_class(
request, setup_stretch_cluster_project, teardown_factory_class
):
return setup_logwriter_rbd_workload(
request, setup_stretch_cluster_project, teardown_factory_class
)


@pytest.fixture()
def setup_logwriter_rbd_workload_factory(
request, setup_stretch_cluster_project, teardown_factory
):
return setup_logwriter_rbd_workload(
request, setup_stretch_cluster_project, teardown_factory
)


def setup_logwriter_rbd_workload(
request, setup_stretch_cluster_project, teardown_factory
):
"""
This fixture will create the RWO RBD volume and create the logwriter StatefulSet using that volume
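
The conftest changes above all follow one pattern: each workload setup gains a class-scoped fixture next to the existing function-scoped one, and both delegate to a shared plain function so the setup logic is written only once. A minimal, self-contained sketch of that pattern, using hypothetical names rather than the real ocs-ci fixtures and assuming the existing teardown_factory fixtures shown in the diff:

import pytest


@pytest.fixture()
def workload_factory(request, teardown_factory):
    # Function scope: a fresh workload for every test
    return _setup_workload(request, teardown_factory)


@pytest.fixture(scope="class")
def workload_class(request, teardown_factory_class):
    # Class scope: one workload shared by all tests in a class
    return _setup_workload(request, teardown_factory_class)


def _setup_workload(request, teardown_factory):
    # Shared helper; the real helpers above return factory callables
    def factory():
        ...  # create the resources and register them with teardown_factory

    return factory
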
212 changes: 212 additions & 0 deletions tests/disaster-recovery/sc_arbiter/test_mon_osd_failures.py
@@ -0,0 +1,212 @@
import random
import time

import pytest
import logging

from datetime import datetime, timezone

from ocs_ci.helpers.helpers import modify_deployment_replica_count
from ocs_ci.helpers.stretchcluster_helper import recover_workload_pods_post_recovery
from ocs_ci.ocs.exceptions import UnexpectedBehaviour
from ocs_ci.ocs.resources.pod import (
get_not_running_pods,
wait_for_pods_to_be_in_statuses,
get_deployment_name,
)
from ocs_ci.ocs.resources.pvc import get_pvc_objs
from ocs_ci.ocs.resources.stretchcluster import StretchCluster
from ocs_ci.ocs import constants

logger = logging.getLogger(__name__)


@pytest.fixture(scope="class")
def setup_logwriter_workloads(
request,
setup_logwriter_cephfs_workload_class,
setup_logwriter_rbd_workload_class,
logreader_workload_class,
):
sc_obj = StretchCluster()
# Run the logwriter CephFS workloads
(
sc_obj.cephfs_logwriter_dep,
sc_obj.cephfs_logreader_job,
) = setup_logwriter_cephfs_workload_class(read_duration=0)

# Generate 2 minutes worth of logs before inducing the mon and OSD failures
logger.info("Generating 2 minutes worth of logs")
time.sleep(120)

sc_obj.get_logwriter_reader_pods(label=constants.LOGWRITER_CEPHFS_LABEL)
sc_obj.get_logwriter_reader_pods(label=constants.LOGREADER_CEPHFS_LABEL)
sc_obj.get_logwriter_reader_pods(
label=constants.LOGWRITER_RBD_LABEL, exp_num_replicas=2
)
logger.info("All the workloads pods are successfully up and running")

start_time = datetime.now(timezone.utc)
sc_obj.get_logfile_map(label=constants.LOGWRITER_CEPHFS_LABEL)
sc_obj.get_logfile_map(label=constants.LOGWRITER_RBD_LABEL)

def finalizer():
"""
Check for data loss and data corruption at the end of the tests
"""
end_time = datetime.now(timezone.utc)

try:
sc_obj.get_logwriter_reader_pods(label=constants.LOGWRITER_CEPHFS_LABEL)
sc_obj.get_logwriter_reader_pods(
label=constants.LOGREADER_CEPHFS_LABEL,
statuses=[constants.STATUS_RUNNING, constants.STATUS_COMPLETED],
)
sc_obj.get_logwriter_reader_pods(
label=constants.LOGWRITER_RBD_LABEL, exp_num_replicas=2
)
except UnexpectedBehaviour:

logger.info("some pods are not running, so trying the work-around")
pods_not_running = get_not_running_pods(
namespace=constants.STRETCH_CLUSTER_NAMESPACE
)
recover_workload_pods_post_recovery(sc_obj, pods_not_running)

sc_obj.post_failure_checks(start_time, end_time, wait_for_read_completion=False)
logger.info("Successfully verified with post failure checks for the workloads")

sc_obj.cephfs_logreader_job.delete()
logger.info(sc_obj.cephfs_logreader_pods)
for pod in sc_obj.cephfs_logreader_pods:
pod.wait_for_pod_delete(timeout=120)
logger.info("All old CephFS logreader pods are deleted")

# check for any data loss
assert sc_obj.check_for_data_loss(
constants.LOGWRITER_CEPHFS_LABEL
), "[CephFS] Data is lost"
logger.info("[CephFS] No data loss is seen")
assert sc_obj.check_for_data_loss(
constants.LOGWRITER_RBD_LABEL
), "[RBD] Data is lost"
logger.info("[RBD] No data loss is seen")

# check for data corruption
pvc = get_pvc_objs(
pvc_names=[
sc_obj.cephfs_logwriter_dep.get()["spec"]["template"]["spec"][
"volumes"
][0]["persistentVolumeClaim"]["claimName"]
],
namespace=constants.STRETCH_CLUSTER_NAMESPACE,
)[0]
logreader_workload_class(
pvc=pvc, logreader_path=constants.LOGWRITER_CEPHFS_READER, duration=5
)
sc_obj.get_logwriter_reader_pods(constants.LOGREADER_CEPHFS_LABEL)

wait_for_pods_to_be_in_statuses(
expected_statuses=constants.STATUS_COMPLETED,
pod_names=[pod.name for pod in sc_obj.cephfs_logreader_pods],
timeout=900,
namespace=constants.STRETCH_CLUSTER_NAMESPACE,
)
logger.info("[CephFS] Logreader job pods have reached 'Completed' state!")

assert sc_obj.check_for_data_corruption(
label=constants.LOGREADER_CEPHFS_LABEL
), "Data is corrupted for cephFS workloads"
logger.info("No data corruption is seen in CephFS workloads")

assert sc_obj.check_for_data_corruption(
label=constants.LOGWRITER_RBD_LABEL
), "Data is corrupted for RBD workloads"
logger.info("No data corruption is seen in RBD workloads")

request.addfinalizer(finalizer)


@pytest.mark.usefixtures("setup_logwriter_workloads")
class TestMonAndOSDFailures:
def test_single_mon_failures(self):
"""
Test mon failure with IO in the background
"""
logger.info("testing single mon failures scenario")
sc_obj = StretchCluster()

# get mon-pod of a single zone
mon_pods_in_zone = sc_obj.get_mon_pods_in_a_zone("data-1")
mon_pod_to_fail = random.choice(mon_pods_in_zone).name

# get the deployment of the mon-pod
mon_dep = get_deployment_name(mon_pod_to_fail)

# scale the deployment of mon to 0
# and wait 10 mins
logger.info(f"failing mon {mon_dep} now...")
if modify_deployment_replica_count(mon_dep, 0):
time.sleep(600)

# scale the deployment back to 1
logger.info(f"recovering mon {mon_dep} now...")
modify_deployment_replica_count(mon_dep, 1)

def test_both_mon_failure(self):
"""
Test mon failures in both data zones with IO in the background
"""
logger.info("testing mon failures at both the data-zones")
sc_obj = StretchCluster()
# Copy the zone labels so the module-level constant is not mutated
data_zones = list(constants.ZONES_LABELS)
data_zones.remove("arbiter")
mon_deps = list()
for zone in data_zones:
# get mon-pod of a single zone
mon_pods_in_zone = sc_obj.get_mon_pods_in_a_zone(zone)
mon_pod_to_fail = random.choice(mon_pods_in_zone).name

# get the deployment of the mon-pod
mon_dep = get_deployment_name(mon_pod_to_fail)

# scale the deployment of mon to 0
# and wait 10 mins
modify_deployment_replica_count(mon_dep, 0)
logger.info(f"Failing mon {mon_dep} from data-zone {zone}")
mon_deps.append(mon_dep)

time.sleep(600)

# scale the deployments back to 1
for mon_dep in mon_deps:
logger.info(f"recovering mon {mon_dep}")
modify_deployment_replica_count(mon_dep, 1)

def test_single_osd_failure(self):
"""
Test single OSD failure while IOs are running
"""
logger.info("testing single OSD failure scenario")
sc_obj = StretchCluster()

# get osd-pod of a single zone
osd_pods_in_zone = sc_obj.get_osd_pods_in_a_zone("data-1")
osd_pod_to_fail = random.choice(osd_pods_in_zone).name

# get the deployment of the osd-pod
osd_dep = get_deployment_name(osd_pod_to_fail)

# scale the deployment of osd to 0
# and wait 10 mins
logger.info(f"failing osd {osd_dep} now...")
if modify_deployment_replica_count(osd_dep, 0):
time.sleep(600)

# scale the deployment back to 1
logger.info(f"recovering osd {osd_dep} now...")
modify_deployment_replica_count(osd_dep, 1)
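
All three tests above repeat the same failure step: resolve the pod's deployment, scale it to 0 replicas, keep it down for 10 minutes while the workloads keep writing, then scale it back to 1. A hedged sketch of how that step could be factored into one helper; the helper is hypothetical and not part of this commit, and it reuses only functions already imported by the test module:

import time

from ocs_ci.helpers.helpers import modify_deployment_replica_count


def fail_and_recover_deployment(deployment_name, downtime=600):
    """
    Scale a rook-ceph mon/OSD deployment down to 0, keep it down for
    `downtime` seconds while IO continues, then scale it back to 1.
    """
    if modify_deployment_replica_count(deployment_name, 0):
        time.sleep(downtime)
    return modify_deployment_replica_count(deployment_name, 1)
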
