Skip to content

Commit

Permalink
Resiliency test scenario automation
Browse files Browse the repository at this point in the history
Signed-off-by: Parag Kamble <[email protected]>
  • Loading branch information
paraggit committed Dec 3, 2024
1 parent 7d25c99 commit c9c9978
Show file tree
Hide file tree
Showing 16 changed files with 904 additions and 1 deletion.
1 change: 1 addition & 0 deletions ocs_ci/ocs/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@
PROVIDER_CLIENT_DEPLOYMENT_DIR = os.path.join(
TEMPLATE_DIR, "provider-client-deployment"
)
RESILIENCY_DIR = os.path.join(TOP_DIR, "ocs_ci", "resiliency")

# OCP Deployment constants
CHRONY_TEMPLATE = os.path.join(
Expand Down
2 changes: 1 addition & 1 deletion ocs_ci/ocs/node.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,7 @@ def get_node_ips(node_type="worker"):
ocp = OCP(kind=constants.NODE)
if node_type == "worker":
nodes = ocp.get(selector=constants.WORKER_LABEL).get("items")
if node_type == "master:":
if node_type == "master":
nodes = ocp.get(selector=constants.MASTER_LABEL).get("items")

if config.ENV_DATA["platform"].lower() == constants.AWS_PLATFORM:
Expand Down
182 changes: 182 additions & 0 deletions ocs_ci/resiliency/cluster_failures.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
import logging
from ocs_ci.ocs.node import get_node_ips
from abc import ABC, abstractmethod
from ocs_ci.framework import config
from ocs_ci.utility.vsphere import VSPHERE
from ocs_ci.ocs import constants
import random
import time

log = logging.getLogger(__name__)


class ClusterFailures(ABC):
    """Abstract interface for injecting node/network failures into a cluster.

    Concrete subclasses implement the platform-specific mechanics
    (vSphere, AWS, IBM Cloud, bare metal).
    """

    def __init__(self, cluster_name):
        # Human-readable platform label used in log messages.
        self.cluster_name = cluster_name

    def random_node_ip(self, node_type="worker"):
        """Pick one IP at random from the nodes of the given type."""
        candidate_ips = get_node_ips(node_type=node_type)
        return random.choice(candidate_ips)

    @abstractmethod
    def shutdown_node(self, node_ip=None, node_type="worker"):
        """Power off a node (platform-specific)."""

    @abstractmethod
    def change_node_network_interface_state(
        self, node_ip=None, node_type="worker", interface_name=None, connect=False
    ):
        """Connect or disconnect a node's network interface (platform-specific)."""

    @abstractmethod
    def network_split(self, nodes):
        """Partition the network between the given nodes (platform-specific)."""


class VsphereClusterFailures(ClusterFailures):
    """Failure injection for clusters running on vSphere.

    Uses the vCenter API (via ``VSPHERE``) to stop/start VMs and toggle
    their network interfaces, resolving VMs by node IP.
    """

    def __init__(self):
        super().__init__(cluster_name="vSphere")
        # vCenter connection details come from the run configuration.
        self.vsphere_host = config.ENV_DATA["vsphere_server"]
        self.vsphere_password = config.ENV_DATA["vsphere_password"]
        self.vsphere_username = config.ENV_DATA["vsphere_user"]
        self.dc = config.ENV_DATA["vsphere_datacenter"]
        self.vsobj = VSPHERE(
            self.vsphere_host, self.vsphere_username, self.vsphere_password
        )

    def shutdown_node(self, node_ip=None, node_type="worker"):
        """Power off the VM backing the node with ``node_ip``.

        If ``node_ip`` is not given, a random node of ``node_type`` is picked.
        """
        if not node_ip:
            node_ip = self.random_node_ip(node_type=node_type)
        log.info(f"Shutting down node {node_ip} on vSphere cluster {self.cluster_name}")
        vm = self.vsobj.get_vm_by_ip(node_ip, self.dc)
        self.vsobj.stop_vms([vm])
        log.info(f"Node {node_ip} VM instance stopped.")

    def reboot_node(self, node_ip=None, node_type="worker"):
        """Hard-reboot a node VM: stop it, wait briefly, then start it again."""
        if not node_ip:
            node_ip = self.random_node_ip(node_type=node_type)
        vm = self.vsobj.get_vm_by_ip(node_ip, self.dc)
        vm_name = vm.name
        self.vsobj.stop_vms([vm])
        log.info(f"VM instance {vm_name} is stopped.")
        # Give the platform time to register the power-off before restarting.
        time.sleep(20)
        self.vsobj.start_vms([vm])
        log.info(f"VM instance {vm_name} is started.")

    def change_node_network_interface_state(
        self, node_ip=None, node_type="worker", interface_name=None, connect=False
    ):
        """Connect (``connect=True``) or disconnect a node VM's network interface."""
        if not node_ip:
            node_ip = self.random_node_ip(node_type=node_type)
        # Bug fix: the f-string continuation was missing a leading space,
        # so the message rendered as "interfaceof node".
        log.info(
            f"{'Connecting' if connect else 'Disconnecting'} network interface"
            f" of node {node_ip} on vSphere cluster {self.cluster_name}"
        )
        self.vsobj.change_vm_network_state(node_ip, self.dc, connect=connect)

    def network_split(self, nodes):
        """Not implemented for vSphere."""
        log.warning("Function 'network_split' is not implemented.")
        raise NotImplementedError("Function 'network_split' is not implemented.")


class IbmCloudClusterFailures(ClusterFailures):
    """Failure injection stubs for clusters running on IBM Cloud."""

    def __init__(self):
        super().__init__(cluster_name="IBM Cloud")

    def shutdown_node(self, node_ip=None, node_type="worker"):
        """Shut down a node (IBM Cloud logic not yet implemented)."""
        target_ip = node_ip or self.random_node_ip(node_type=node_type)
        log.info(
            f"Shutting down node {target_ip} on IBM Cloud cluster {self.cluster_name}"
        )
        raise NotImplementedError("IBM Cloud shutdown logic is not implemented.")

    def change_node_network_interface_state(
        self, node_ip=None, node_type="worker", interface_name=None, connect=False
    ):
        """Toggle a node's network interface (IBM Cloud logic pending)."""
        target_ip = node_ip or self.random_node_ip(node_type=node_type)
        action = "Connecting" if connect else "Disconnecting"
        log.info(
            f"{action} network interface"
            f" of node {target_ip} on IBM Cloud cluster {self.cluster_name}"
        )
        # Add IBM Cloud-specific logic here

    def network_split(self, nodes):
        """Simulate a network split between the given nodes (logic pending)."""
        log.info(
            f"Simulating network split on nodes {nodes} in IBM Cloud cluster {self.cluster_name}"
        )
        # Add IBM Cloud-specific network split logic


class AwsClusterFailures(ClusterFailures):
    """Failure injection stubs for clusters running on AWS."""

    def __init__(self):
        super().__init__(cluster_name="AWS")

    def shutdown_node(self, node_ip=None, node_type="worker"):
        """Shut down a node (AWS-specific logic pending).

        If ``node_ip`` is not given, a random node of ``node_type`` is picked.
        """
        if not node_ip:
            node_ip = self.random_node_ip(node_type=node_type)
        log.info(f"Shutting down node {node_ip} on AWS cluster {self.cluster_name}")
        # Add AWS-specific shutdown logic

    def change_node_network_interface_state(
        self, node_ip=None, node_type="worker", interface_name=None, connect=False
    ):
        """Toggle a node's network interface (AWS-specific logic pending)."""
        if not node_ip:
            node_ip = self.random_node_ip(node_type=node_type)
        # Bug fix: the f-string continuation was missing a leading space,
        # so the message rendered as "interfaceof node".
        log.info(
            f"{'Connecting' if connect else 'Disconnecting'} network interface"
            f" of node {node_ip} on AWS cluster {self.cluster_name}"
        )
        # Add AWS-specific logic here

    def network_split(self, nodes):
        """Simulate a network split between the given nodes (logic pending)."""
        log.info(
            f"Simulating network split on nodes {nodes} in AWS cluster {self.cluster_name}"
        )
        # Add AWS-specific network split logic


class BaremetalClusterFailures(ClusterFailures):
    """Failure injection stubs for bare-metal clusters."""

    def __init__(self):
        super().__init__(cluster_name="Bare Metal")

    def shutdown_node(self, node_ip=None, node_type="worker"):
        """Shut down a node (bare-metal logic pending)."""
        target_ip = node_ip or self.random_node_ip(node_type=node_type)
        log.info(
            f"Shutting down node {target_ip} on Bare Metal cluster {self.cluster_name}"
        )
        # Add bare metal-specific shutdown logic

    def change_node_network_interface_state(
        self, node_ip=None, node_type="worker", interface_name=None, connect=False
    ):
        """Toggle a node's network interface (bare-metal logic pending)."""
        target_ip = node_ip or self.random_node_ip(node_type=node_type)
        action = "Connecting" if connect else "Disconnecting"
        log.info(
            f"{action} network interface"
            f" of node {target_ip} on Bare Metal cluster {self.cluster_name}"
        )
        # Add bare metal-specific logic here

    def network_split(self, nodes):
        """Simulate a network split between the given nodes (logic pending)."""
        log.info(
            f"Simulating network split on nodes {nodes} in Bare Metal cluster {self.cluster_name}"
        )
        # Add bare metal-specific network split logic


def get_cluster_object():
    """Return the ClusterFailures implementation for the configured platform.

    Raises:
        ValueError: if the configured platform is not supported.
    """
    platform = config.ENV_DATA["platform"].lower()
    # Dispatch table: platform constant -> failure-injection class.
    factories = {
        constants.VSPHERE_PLATFORM: VsphereClusterFailures,
        constants.AWS_PLATFORM: AwsClusterFailures,
        constants.IBMCLOUD_PLATFORM: IbmCloudClusterFailures,
        constants.BAREMETAL_PLATFORM: BaremetalClusterFailures,
    }
    try:
        return factories[platform]()
    except KeyError:
        raise ValueError(f"Unsupported platform: {platform}") from None
20 changes: 20 additions & 0 deletions ocs_ci/resiliency/conf/network_failures.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Scenario configuration for the NETWORK_FAILURES resiliency scenario.
# The keys under FAILURES match NetworkFailures.FAILURE_METHODS in
# ocs_ci/resiliency/network_failures.py.
# NOTE(review): nesting/indentation appears lost in this rendering —
# confirm the structure against the committed file.
NETWORK_FAILURES:
# Wait for nodes to rejoin the cluster after the failure is reverted.
WAIT_TILL_NODE_JOIN: true
FAILURES:
# Handled by NetworkFailures._run_node_network_failure.
- NODE_NETWORK_DOWN:
# Seconds the interfaces stay disconnected — TODO confirm the runner reads this.
NETWORK_FAILURE_DURATION: 30
node_selector:
- labels: []
# Presumably a node-count range; verify against the consumer.
num_reboot_nodes: "0-3"
# Handled by NetworkFailures._run_pod_network_failures (currently a stub).
- POD_NETWORK_FAILURE:
node_selector:
- labels: []
# Workloads to run while the failure is active.
WORKLOAD:
FIO:
- CEPHFS:
template: "fio_cephfs_template.yaml"
name: "cephfs-fio-workload"
- BLOCK:
template: "fio_block_template.yaml"
name: "block-fio-workload"

21 changes: 21 additions & 0 deletions ocs_ci/resiliency/conf/node_failures.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Scenario configuration for the NODE_FAILURES resiliency scenario.
# The keys under FAILURES match NodeFailures.FAILURE_METHODS in
# ocs_ci/resiliency/node_failures.py.
# NOTE(review): nesting/indentation appears lost in this rendering —
# confirm the structure against the committed file.
NODE_FAILURES:
# Wait for nodes to rejoin the cluster after the failure is reverted.
WAIT_TILL_NODE_JOIN: true
FAILURES:
# Handled by NodeFailures._run_poweroff_node: each ITERATION picks a random
# NODE_TYPE and reboots one node of that type.
- POWEROFF_NODE:
NODE_TYPE:
- "master"
- "worker"
node_selector:
- labels: []
ITERATION: 10
# Handled by NodeFailures._run_node_drain (currently a stub).
- NODE_DRAIN:
node_selector:
- labels: []
# Workloads to run while the failure is active.
WORKLOAD:
FIO:
- CEPHFS:
template: "fio_cephfs_template.yaml"
name: "cephfs-fio-workload"
- BLOCK:
template: "fio_block_template.yaml"
name: "block-fio-workload"
7 changes: 7 additions & 0 deletions ocs_ci/resiliency/conf/resiliency.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Top-level resiliency run configuration.
RESILIENCY:
RUN_CONFIG:
# Abort the run when Ceph reports an unhealthy state.
STOP_WHEN_CEPH_UNHEALTHY: true
# NOTE(review): "True" vs "true" — both parse as boolean in YAML 1.1,
# but normalize to lowercase for consistency with the other keys.
ITERATE_SCENARIOS: True
# Each entry names a scenario class SCENARIO_NAME
# (NodeFailures / NetworkFailures).
FAILURE_SCENARIOS:
- NODE_FAILURES
- NETWORK_FAILURES
59 changes: 59 additions & 0 deletions ocs_ci/resiliency/network_failures.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import logging
import time

from ocs_ci.resiliency.cluster_failures import get_cluster_object

log = logging.getLogger(__name__)


class NetworkFailures:
    """Network failure scenarios (scenario key: ``NETWORK_FAILURES``).

    ``failure_data`` is a mapping whose first key names the failure case to
    run, e.g. ``{"NODE_NETWORK_DOWN": {...}}``.
    """

    SCENARIO_NAME = "NETWORK_FAILURES"
    # Maps a failure-case key from the scenario config to its handler method.
    FAILURE_METHODS = {
        "POD_NETWORK_FAILURE": "_run_pod_network_failures",
        "NODE_NETWORK_DOWN": "_run_node_network_failure",
    }

    def __init__(self, failure_data):
        self.scenario_name = self.SCENARIO_NAME
        self.failure_data = failure_data
        # Platform-specific failure injector (vSphere, AWS, ...).
        self.cluster_obj = get_cluster_object()

    def failure_case(self):
        """Get the first failure case key from failure_data.

        Raises:
            ValueError: if ``failure_data`` is empty.
        """
        if not self.failure_data:
            raise ValueError("No failure case provided in failure_data.")
        return next(iter(self.failure_data))

    def run(self):
        """Dynamically call the appropriate method based on the failure case.

        Raises:
            NotImplementedError: if the failure case has no handler method.
        """
        case = self.failure_case()
        method_name = self.FAILURE_METHODS.get(case)
        if method_name and hasattr(self, method_name):
            method = getattr(self, method_name)
            method()
        else:
            raise NotImplementedError(
                f"Failure method for case '{case}' is not implemented."
            )

    def _run_pod_network_failures(self):
        """Handle Pod Network Failure scenario."""
        log.info("Bringing down Pod network interface.")
        # Implement pod network failure logic here

    def _run_node_network_failure(self):
        """Bring down one master and one worker interface, then restore them.

        Bug fix: previously only the node disconnected last was reconnected in
        the ``finally`` block, leaving the other node's interface down. All
        disconnected interfaces are now tracked and restored.
        """
        log.info("Bringing down Node network interfaces.")
        disconnected = []
        try:
            for node_type in ("master", "worker"):
                node_ip = self.cluster_obj.random_node_ip(node_type)
                self.cluster_obj.change_node_network_interface_state(
                    node_ip=node_ip, node_type=node_type, connect=False
                )
                disconnected.append((node_ip, node_type))
            time.sleep(60)  # Simulate network being down
        finally:
            # Restore every interface we managed to bring down, even if a
            # later disconnect or the sleep was interrupted.
            for node_ip, node_type in disconnected:
                self.cluster_obj.change_node_network_interface_state(
                    node_ip=node_ip, node_type=node_type, connect=True
                )
                log.info(f"Network interface on node {node_ip} restored.")
69 changes: 69 additions & 0 deletions ocs_ci/resiliency/node_failures.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import logging
import random
from ocs_ci.utility.utils import ceph_health_check
from ocs_ci.resiliency.cluster_failures import get_cluster_object

log = logging.getLogger(__name__)


class NodeFailures:
    """Node failure scenarios (scenario key: ``NODE_FAILURES``).

    ``failure_data`` is a mapping whose first key names the failure case to
    run, e.g. ``{"POWEROFF_NODE": {...}}``.
    """

    SCENARIO_NAME = "NODE_FAILURES"
    # Maps a failure-case key from the scenario config to its handler method.
    FAILURE_METHODS = {
        "POWEROFF_NODE": "_run_poweroff_node",
        "NODE_DRAIN": "_run_node_drain",
    }

    def __init__(self, failure_data):
        self.failure_data = failure_data
        self.failure_case_name = self._get_failure_case()
        self.scenario_name = self.SCENARIO_NAME
        # Platform-specific failure injector (vSphere, AWS, ...).
        self.cluster_obj = get_cluster_object()

    def _get_failure_case(self):
        """Retrieve the failure case name from the provided failure data."""
        if not self.failure_data:
            log.error("Failure data is empty.")
            return None
        return next(iter(self.failure_data))

    def run(self):
        """Run the failure scenario based on the failure case.

        Raises:
            NotImplementedError: if the failure case has no handler method.
        """
        if not self.failure_case_name:
            log.error("No valid failure case name found. Exiting run method.")
            return

        handler_name = self.FAILURE_METHODS.get(self.failure_case_name)
        if not (handler_name and hasattr(self, handler_name)):
            raise NotImplementedError(
                f"Failure method for '{self.failure_case_name}' is not implemented."
            )
        getattr(self, handler_name)()
        self._post_scenario_checks()

    def _run_poweroff_node(self):
        """Simulate the reboot of nodes."""
        log.info("Running Failure Case: POWEROFF_NODE.")
        case_config = self.failure_data[self.failure_case_name]
        node_types = case_config.get("NODE_TYPE", [])
        iterations = case_config.get("ITERATION", 0)

        # Each iteration reboots one randomly chosen node type.
        for _ in range(iterations):
            chosen_type = random.choice(node_types)
            log.info(f"Rebooting {chosen_type} node.")
            self.cluster_obj.reboot_node(node_type=chosen_type)
            log.info(f"{chosen_type.capitalize()} node rebooted.")

    def _run_node_drain(self):
        """Simulate draining of nodes."""
        log.info("Running Failure Case: NODE_DRAIN.")
        # Implement node drain logic here
        log.info("Draining node...")

    def _post_scenario_checks(self):
        """Perform post-scenario checks to ensure the cluster is healthy."""
        log.info(f"Running post-scenario checks for {self.scenario_name}.")
        log.info("Verifying that Ceph health is OK (retrying if necessary).")
        ceph_health_check(tries=45, delay=60)
Loading

0 comments on commit c9c9978

Please sign in to comment.