diff --git a/ocs_ci/ocs/constants.py b/ocs_ci/ocs/constants.py index 3331cfb3f9a..1123a55f808 100644 --- a/ocs_ci/ocs/constants.py +++ b/ocs_ci/ocs/constants.py @@ -91,6 +91,7 @@ PROVIDER_CLIENT_DEPLOYMENT_DIR = os.path.join( TEMPLATE_DIR, "provider-client-deployment" ) +RESILIENCY_DIR = os.path.join(TOP_DIR, "ocs_ci", "resiliency") # OCP Deployment constants CHRONY_TEMPLATE = os.path.join( diff --git a/ocs_ci/ocs/node.py b/ocs_ci/ocs/node.py index 76964b854e7..242b3d29639 100644 --- a/ocs_ci/ocs/node.py +++ b/ocs_ci/ocs/node.py @@ -334,7 +334,7 @@ def get_node_ips(node_type="worker"): ocp = OCP(kind=constants.NODE) if node_type == "worker": nodes = ocp.get(selector=constants.WORKER_LABEL).get("items") - if node_type == "master:": + if node_type == "master": nodes = ocp.get(selector=constants.MASTER_LABEL).get("items") if config.ENV_DATA["platform"].lower() == constants.AWS_PLATFORM: diff --git a/ocs_ci/resiliency/cluster_failures.py b/ocs_ci/resiliency/cluster_failures.py new file mode 100644 index 00000000000..a1feb189c17 --- /dev/null +++ b/ocs_ci/resiliency/cluster_failures.py @@ -0,0 +1,182 @@ +import logging +from ocs_ci.ocs.node import get_node_ips +from abc import ABC, abstractmethod +from ocs_ci.framework import config +from ocs_ci.utility.vsphere import VSPHERE +from ocs_ci.ocs import constants +import random +import time + +log = logging.getLogger(__name__) + + +class ClusterFailures(ABC): + def __init__(self, cluster_name): + self.cluster_name = cluster_name + + def random_node_ip(self, node_type="worker"): + """Return a random node IP of a given node type.""" + ips = get_node_ips(node_type=node_type) + return random.choice(ips) + + @abstractmethod + def shutdown_node(self, node_ip=None, node_type="worker"): + pass + + @abstractmethod + def change_node_network_interface_state( + self, node_ip=None, node_type="worker", interface_name=None, connect=False + ): + pass + + @abstractmethod + def network_split(self, nodes): + pass + + +class VsphereClusterFailures(ClusterFailures): + def __init__(self): + super().__init__(cluster_name="vSphere") + self.vsphere_host = config.ENV_DATA["vsphere_server"] + self.vsphere_password = config.ENV_DATA["vsphere_password"] + self.vsphere_username = config.ENV_DATA["vsphere_user"] + self.dc = config.ENV_DATA["vsphere_datacenter"] + self.vsobj = VSPHERE( + self.vsphere_host, self.vsphere_username, self.vsphere_password + ) + + def shutdown_node(self, node_ip=None, node_type="worker"): + if not node_ip: + node_ip = self.random_node_ip(node_type=node_type) + log.info(f"Shutting down node {node_ip} on vSphere cluster {self.cluster_name}") + vm = self.vsobj.get_vm_by_ip(node_ip, self.dc) + self.vsobj.stop_vms([vm]) + log.info(f"Node {node_ip} VM instance stopped.") + + def reboot_node(self, node_ip=None, node_type="worker"): + if not node_ip: + node_ip = self.random_node_ip(node_type=node_type) + vm = self.vsobj.get_vm_by_ip(node_ip, self.dc) + vm_name = vm.name + self.vsobj.stop_vms([vm]) + log.info(f"VM instance {vm_name} is stopped.") + time.sleep(20) + self.vsobj.start_vms([vm]) + log.info(f"VM instance {vm_name} is started.") + + def change_node_network_interface_state( + self, node_ip=None, node_type="worker", interface_name=None, connect=False + ): + if not node_ip: + node_ip = self.random_node_ip(node_type=node_type) + log.info( + f"{'Connecting' if connect else 'Disconnecting'} network interface" + f"of node {node_ip} on vSphere cluster {self.cluster_name}" + ) + self.vsobj.change_vm_network_state(node_ip, self.dc, connect=connect) + + def network_split(self, nodes): + log.warning("Function 'network_split' is not implemented.") + raise NotImplementedError("Function 'network_split' is not implemented.") + + +class IbmCloudClusterFailures(ClusterFailures): + def __init__(self): + super().__init__(cluster_name="IBM Cloud") + + def shutdown_node(self, node_ip=None, node_type="worker"): + if not node_ip: + node_ip = self.random_node_ip(node_type=node_type) + log.info( + f"Shutting down node {node_ip} on IBM Cloud cluster {self.cluster_name}" + ) + raise NotImplementedError("IBM Cloud shutdown logic is not implemented.") + + def change_node_network_interface_state( + self, node_ip=None, node_type="worker", interface_name=None, connect=False + ): + if not node_ip: + node_ip = self.random_node_ip(node_type=node_type) + log.info( + f"{'Connecting' if connect else 'Disconnecting'} network interface" + f" of node {node_ip} on IBM Cloud cluster {self.cluster_name}" + ) + # Add IBM Cloud-specific logic here + + def network_split(self, nodes): + log.info( + f"Simulating network split on nodes {nodes} in IBM Cloud cluster {self.cluster_name}" + ) + # Add IBM Cloud-specific network split logic + + +class AwsClusterFailures(ClusterFailures): + def __init__(self): + super().__init__(cluster_name="AWS") + + def shutdown_node(self, node_ip=None, node_type="worker"): + if not node_ip: + node_ip = self.random_node_ip(node_type=node_type) + log.info(f"Shutting down node {node_ip} on AWS cluster {self.cluster_name}") + # Add AWS-specific shutdown logic + + def change_node_network_interface_state( + self, node_ip=None, node_type="worker", interface_name=None, connect=False + ): + if not node_ip: + node_ip = self.random_node_ip(node_type=node_type) + log.info( + f"{'Connecting' if connect else 'Disconnecting'} network interface" + f"of node {node_ip} on AWS cluster {self.cluster_name}" + ) + # Add AWS-specific logic here + + def network_split(self, nodes): + log.info( + f"Simulating network split on nodes {nodes} in AWS cluster {self.cluster_name}" + ) + # Add AWS-specific network split logic + + +class BaremetalClusterFailures(ClusterFailures): + def __init__(self): + super().__init__(cluster_name="Bare Metal") + + def shutdown_node(self, node_ip=None, node_type="worker"): + if not node_ip: + node_ip = self.random_node_ip(node_type=node_type) + log.info( + f"Shutting down node {node_ip} on Bare Metal cluster {self.cluster_name}" + ) + # Add bare metal-specific shutdown logic + + def change_node_network_interface_state( + self, node_ip=None, node_type="worker", interface_name=None, connect=False + ): + if not node_ip: + node_ip = self.random_node_ip(node_type=node_type) + log.info( + f"{'Connecting' if connect else 'Disconnecting'} network interface" + f" of node {node_ip} on Bare Metal cluster {self.cluster_name}" + ) + # Add bare metal-specific logic here + + def network_split(self, nodes): + log.info( + f"Simulating network split on nodes {nodes} in Bare Metal cluster {self.cluster_name}" + ) + # Add bare metal-specific network split logic + + +def get_cluster_object(): + platform = config.ENV_DATA["platform"].lower() + if platform == constants.VSPHERE_PLATFORM: + return VsphereClusterFailures() + elif platform == constants.AWS_PLATFORM: + return AwsClusterFailures() + elif platform == constants.IBMCLOUD_PLATFORM: + return IbmCloudClusterFailures() + elif platform == constants.BAREMETAL_PLATFORM: + return BaremetalClusterFailures() + else: + raise ValueError(f"Unsupported platform: {platform}") diff --git a/ocs_ci/resiliency/conf/network_failures.yaml b/ocs_ci/resiliency/conf/network_failures.yaml new file mode 100644 index 00000000000..ad252fbc3a7 --- /dev/null +++ b/ocs_ci/resiliency/conf/network_failures.yaml @@ -0,0 +1,20 @@ +NETWORK_FAILURES: + WAIT_TILL_NODE_JOIN: true + FAILURES: + - NODE_NETWORK_DOWN: + NETWORK_FAILURE_DURATION: 30 + node_selector: + - labels: [] + num_reboot_nodes: "0-3" + - POD_NETWORK_FAILURE: + node_selector: + - labels: [] + WORKLOAD: + FIO: + - CEPHFS: + template: "fio_cephfs_template.yaml" + name: "cephfs-fio-workload" + - BLOCK: + template: "fio_block_template.yaml" + name: "block-fio-workload" + diff --git a/ocs_ci/resiliency/conf/node_failures.yaml b/ocs_ci/resiliency/conf/node_failures.yaml new file mode 100644 index 00000000000..55c978cbb26 --- /dev/null +++ b/ocs_ci/resiliency/conf/node_failures.yaml @@ -0,0 +1,21 @@ +NODE_FAILURES: + WAIT_TILL_NODE_JOIN: true + FAILURES: + - POWEROFF_NODE: + NODE_TYPE: + - "master" + - "worker" + node_selector: + - labels: [] + ITERATION: 10 + - NODE_DRAIN: + node_selector: + - labels: [] + WORKLOAD: + FIO: + - CEPHFS: + template: "fio_cephfs_template.yaml" + name: "cephfs-fio-workload" + - BLOCK: + template: "fio_block_template.yaml" + name: "block-fio-workload" \ No newline at end of file diff --git a/ocs_ci/resiliency/conf/resiliency.yaml b/ocs_ci/resiliency/conf/resiliency.yaml new file mode 100644 index 00000000000..dbecf322822 --- /dev/null +++ b/ocs_ci/resiliency/conf/resiliency.yaml @@ -0,0 +1,7 @@ +RESILIENCY: + RUN_CONFIG: + STOP_WHEN_CEPH_UNHEALTHY: true + ITERATE_SCENARIOS: True + FAILURE_SCENARIOS: + - NODE_FAILURES + - NETWORK_FAILURES diff --git a/ocs_ci/resiliency/network_failures.py b/ocs_ci/resiliency/network_failures.py new file mode 100644 index 00000000000..ef6dd3c9ca3 --- /dev/null +++ b/ocs_ci/resiliency/network_failures.py @@ -0,0 +1,59 @@ +import logging +import time + +from ocs_ci.resiliency.cluster_failures import get_cluster_object + +log = logging.getLogger(__name__) + + +class NetworkFailures: + SCENARIO_NAME = "NETWORK_FAILURES" + FAILURE_METHODS = { + "POD_NETWORK_FAILURE": "_run_pod_network_failures", + "NODE_NETWORK_DOWN": "_run_node_network_failure", + } + + def __init__(self, failure_data): + self.scenario_name = self.SCENARIO_NAME + self.failure_data = failure_data + self.cluster_obj = get_cluster_object() + + def failure_case(self): + """Get the first failure case key from failure_data.""" + if not self.failure_data: + raise ValueError("No failure case provided in failure_data.") + return next(iter(self.failure_data)) + + def run(self): + """Dynamically call the appropriate method based on the failure case.""" + case = self.failure_case() + method_name = self.FAILURE_METHODS.get(case) + if method_name and hasattr(self, method_name): + method = getattr(self, method_name) + method() + else: + raise NotImplementedError( + f"Failure method for case '{case}' is not implemented." + ) + + def _run_pod_network_failures(self): + """Handle Pod Network Failure scenario.""" + log.info("Bringing down Pod network interface.") + # Implement pod network failure logic here + + def _run_node_network_failure(self): + """Handle Node Network Failure scenario.""" + log.info("Bringing down Node network interfaces.") + node_types = ["master", "worker"] + for node_type in node_types: + node_ip = self.cluster_obj.random_node_ip(node_type) + self.cluster_obj.change_node_network_interface_state( + node_ip=node_ip, node_type=node_type, connect=False + ) + try: + time.sleep(60) # Simulate network being down + finally: + self.cluster_obj.change_node_network_interface_state( + node_ip=node_ip, node_type=node_type, connect=True + ) + log.info(f"Network interface on node {node_ip} restored.") diff --git a/ocs_ci/resiliency/node_failures.py b/ocs_ci/resiliency/node_failures.py new file mode 100644 index 00000000000..b134fe47ff1 --- /dev/null +++ b/ocs_ci/resiliency/node_failures.py @@ -0,0 +1,69 @@ +import logging +import random +from ocs_ci.utility.utils import ceph_health_check +from ocs_ci.resiliency.cluster_failures import get_cluster_object + +log = logging.getLogger(__name__) + + +class NodeFailures: + SCENARIO_NAME = "NODE_FAILURES" + FAILURE_METHODS = { + "POWEROFF_NODE": "_run_poweroff_node", + "NODE_DRAIN": "_run_node_drain", + } + + def __init__(self, failure_data): + self.failure_data = failure_data + self.failure_case_name = self._get_failure_case() + self.scenario_name = self.SCENARIO_NAME + self.cluster_obj = get_cluster_object() + + def _get_failure_case(self): + """Retrieve the failure case name from the provided failure data.""" + if not self.failure_data: + log.error("Failure data is empty.") + return None + return next(iter(self.failure_data)) + + def run(self): + """Run the failure scenario based on the failure case.""" + if not self.failure_case_name: + log.error("No valid failure case name found. Exiting run method.") + return + + method_name = self.FAILURE_METHODS.get(self.failure_case_name) + if method_name and hasattr(self, method_name): + failure_method = getattr(self, method_name) + failure_method() + self._post_scenario_checks() + else: + raise NotImplementedError( + f"Failure method for '{self.failure_case_name}' is not implemented." + ) + + def _run_poweroff_node(self): + """Simulate the reboot of nodes.""" + log.info("Running Failure Case: POWEROFF_NODE.") + node_types = self.failure_data[self.failure_case_name].get("NODE_TYPE", []) + poweroff_iteration = self.failure_data[self.failure_case_name].get( + "ITERATION", 0 + ) + + for _ in range(poweroff_iteration): + node_type = random.choice(node_types) + log.info(f"Rebooting {node_type} node.") + self.cluster_obj.reboot_node(node_type=node_type) + log.info(f"{node_type.capitalize()} node rebooted.") + + def _run_node_drain(self): + """Simulate draining of nodes.""" + log.info("Running Failure Case: NODE_DRAIN.") + # Implement node drain logic here + log.info("Draining node...") + + def _post_scenario_checks(self): + """Perform post-scenario checks to ensure the cluster is healthy.""" + log.info(f"Running post-scenario checks for {self.scenario_name}.") + log.info("Verifying that Ceph health is OK (retrying if necessary).") + ceph_health_check(tries=45, delay=60) diff --git a/ocs_ci/resiliency/resiliency_helper.py b/ocs_ci/resiliency/resiliency_helper.py new file mode 100644 index 00000000000..5500fb2c2a8 --- /dev/null +++ b/ocs_ci/resiliency/resiliency_helper.py @@ -0,0 +1,169 @@ +import yaml +import os +import logging +from ocs_ci.ocs import constants +from ocs_ci.resiliency.node_failures import NodeFailures +from ocs_ci.resiliency.network_failures import NetworkFailures +from ocs_ci.helpers.sanity_helpers import Sanity + +log = logging.getLogger(__name__) + + +class ResiliencyConfig: + """Handles loading and parsing of the resiliency configuration.""" + + CONFIG_FILE = os.path.join(constants.RESILIENCY_DIR, "conf", "resiliency.yaml") + + def __init__(self): + self.data = self.load_yaml(self.CONFIG_FILE) + resiliency = self.data.get("RESILIENCY", {}) + self.run_config = resiliency.get("RUN_CONFIG", {}) + self.stop_when_ceph_unhealthy = self.run_config.get( + "STOP_WHEN_CEPH_UNHEALTHY", False + ) + self.iterate_scenarios = self.run_config.get("ITERATE_SCENARIOS", False) + self.failure_scenarios = resiliency.get("FAILURE_SCENARIOS", []) + + @staticmethod + def load_yaml(file_path): + """Load and parse the YAML file.""" + try: + with open(file_path, "r") as file: + return yaml.safe_load(file) or {} + except FileNotFoundError: + log.error(f"YAML file not found: {file_path}") + return {} + except yaml.YAMLError as exc: + log.error(f"Error parsing YAML file {file_path}: {exc}") + return {} + + def __repr__(self): + """Representation of the ResiliencyConfig object.""" + return ( + f"ResiliencyConfig(" + f"stop_when_ceph_unhealthy={self.stop_when_ceph_unhealthy}, " + f"iterate_scenarios={self.iterate_scenarios}, " + f"failure_scenarios={self.failure_scenarios})" + ) + + +class ResiliencyFailures: + """Handles loading failure cases from the configuration and iterating over them.""" + + SCENARIO_DIR = os.path.join(constants.RESILIENCY_DIR, "conf") + + def __init__(self, scenario_name, failure_method=None): + self.scenario_name = scenario_name + self.failure_method = failure_method + self.failure_cases_data = self.get_failure_cases_data() + self.workload = self.failure_cases_data.get("WORKLOAD", "") + self.failure_list = self.get_failure_list() + self._iterator = iter(self.failure_list) + + def get_failure_cases_data(self): + """Load the YAML file containing failure case details for the given scenario.""" + log.info( + f"Searching for scenario '{self.scenario_name}' in directory: {self.SCENARIO_DIR}" + ) + scenario_file = f"{self.scenario_name.lower()}.yaml" + file_path = os.path.join(self.SCENARIO_DIR, scenario_file) + + if os.path.isfile(file_path): + data = ResiliencyConfig.load_yaml(file_path) + if self.scenario_name in data: + log.info( + f"Found scenario '{self.scenario_name}' in file: {scenario_file}" + ) + return data[self.scenario_name] + else: + log.error( + f"Scenario '{self.scenario_name}' not found in file: {scenario_file}" + ) + else: + log.error( + f"Scenario file '{scenario_file}' not found in directory: {self.SCENARIO_DIR}" + ) + return {} + + def get_failure_list(self): + """Retrieve and optionally filter the failure list based on the failure method.""" + failures = self.failure_cases_data.get("FAILURES", []) + if self.failure_method: + # Filter the failures to include only those matching the failure method + filtered_failures = [ + {self.failure_method: failure[self.failure_method]} + for failure in failures + if self.failure_method in failure + ] + if not filtered_failures: + log.warning( + f"No failures found for failure method '{self.failure_method}' in scenario '{self.scenario_name}'." + ) + return filtered_failures + return failures + + def __iter__(self): + """Return an iterator over the failure list.""" + self._iterator = iter(self.failure_list) + return self._iterator + + +class Resiliency: + """Main class for running resiliency tests.""" + + def __init__(self, scenario, failure_method=None): + self.scenario_name = scenario + self.resiliency_failures = ResiliencyFailures(scenario, failure_method) + self.sanity_helpers = Sanity() + + def post_scenario_check(self): + """Perform post-scenario checks like Ceph health and logs.""" + log.info("Checking Ceph health...") + self.sanity_helpers.health_check(tries=40) + log.info("Running must-gather logs...") + + def start(self): + """Iterate over and inject the failures one by one.""" + for failure_case in self.resiliency_failures: + self.inject_failure(failure_case) + self.post_scenario_check() + + def inject_failure(self, failure): + """Inject the failure into the system.""" + log.info(f"Running failure case for scenario '{self.scenario_name}': {failure}") + failure_obj = InjectFailures(self.scenario_name, failure) + failure_obj.run_failure_case() + + def cleanup(self): + """Cleanup method after the scenario is completed.""" + log.info("Cleaning up after the scenario...") + + +class InjectFailures: + """Handles the actual injection of failures based on the scenario.""" + + SCENARIO_CLASSES = { + NetworkFailures.SCENARIO_NAME: NetworkFailures, + NodeFailures.SCENARIO_NAME: NodeFailures, + } + + def __init__(self, scenario, failure_case): + self.scenario = scenario + self.failure_case = failure_case + + def failure_object(self): + scenario_class = self.SCENARIO_CLASSES.get(self.scenario) + if scenario_class: + return scenario_class(self.failure_case) + else: + raise NotImplementedError( + f"No implementation for scenario '{self.scenario}'" + ) + + def run_failure_case(self): + """Inject the failure into the cluster.""" + log.info( + f"Injecting failure into the cluster for scenario '{self.scenario}'..." + ) + failure_obj = self.failure_object() + failure_obj.run() diff --git a/ocs_ci/resiliency/resiliency_workload.py b/ocs_ci/resiliency/resiliency_workload.py new file mode 100644 index 00000000000..0b5a3b59c9f --- /dev/null +++ b/ocs_ci/resiliency/resiliency_workload.py @@ -0,0 +1,154 @@ +import logging +import os + +from abc import ABC, abstractmethod +from jinja2 import Environment, FileSystemLoader +import fauxfactory + +from ocs_ci.ocs import constants +from ocs_ci.utility.utils import run_cmd + +log = logging.getLogger(__name__) + + +class Workload(ABC): + """ + Abstract Base Class for all workloads. + """ + + def __init__(self, namespace="default", image=None): + self.namespace = namespace + self.image = image + self.template_dir = os.path.join(constants.RESILIENCY_DIR, "workloads") + self.workload_env = Environment(loader=FileSystemLoader(self.template_dir)) + + @abstractmethod + def start_workload(self): + pass + + @abstractmethod + def scale_up_pods(self, desired_count): + pass + + @abstractmethod + def scale_down_pods(self, desired_count): + pass + + @abstractmethod + def stop_workload(self): + pass + + @abstractmethod + def cleanup_workload(self): + pass + + +class FioWorkload(Workload): + """ + FIO-specific implementation of Workload. + """ + + def __init__(self, pvc): + super().__init__(namespace=pvc.namespace) + self.pvc = pvc + self.pvc.reload() + self.deployment_name = f"fio-app-{fauxfactory.gen_alpha(8).lower()}" + self.volume_mode = self.pvc.data["spec"]["volumeMode"] + self.template_file = ( + "fio_fs_workload_template.yaml" + if self.volume_mode == "Filesystem" + else "fio_block_workload_template.yaml" + ) + self.template = self.workload_env.get_template(self.template_file) + self.output_file = f"/tmp/{fauxfactory.gen_alpha(8).lower()}.yaml" + self.render_template() + + def start_workload(self): + log.info("Starting FIO workload.") + run_cmd(f"oc create -f {self.output_file}") + log.info("FIO workload started.") + + def scale_up_pods(self, desired_count): + log.info(f"Scaling up FIO pods to {desired_count}.") + # Implement logic to scale up pods + + def scale_down_pods(self, desired_count): + log.info(f"Scaling down FIO pods to {desired_count}.") + # Implement logic to scale down pods + + def stop_workload(self): + log.info("Stopping FIO workload.") + run_cmd(f"oc delete -f {self.output_file}") + log.info("FIO workload stopped.") + + def cleanup_workload(self): + log.info("Cleaning up FIO workload.") + # Implement cleanup logic, e.g., deleting all pods in the workload + + def render_template(self): + rendered_yaml = self.template.render( + fio_name=self.deployment_name, + namespace=self.namespace, + pvc_claim_name=self.pvc.name, + ) + with open(self.output_file, "w") as f: + f.write(rendered_yaml) + log.info("Rendered FIO workload template.") + + +class SmallFilesWorkload(Workload): + """ + SmallFiles-specific implementation of Workload. + """ + + def __init__(self, namespace="default", image="smallfiles-image:latest"): + super().__init__(namespace, image) + + def start_workload(self): + log.info(f"Starting SmallFiles workload in namespace: {self.namespace}.") + # Implement pod creation logic + + def scale_up_pods(self, desired_count): + log.info(f"Scaling up SmallFiles pods to {desired_count}.") + # Implement logic to scale up pods + + def scale_down_pods(self, desired_count): + log.info(f"Scaling down SmallFiles pods to {desired_count}.") + # Implement logic to scale down pods + + def stop_workload(self): + log.info("Stopping SmallFiles workload.") + # Implement pod deletion logic + + def cleanup_workload(self): + log.info("Cleaning up SmallFiles workload.") + # Implement cleanup logic + + +class VdbenchWorkload(Workload): + """ + Vdbench-specific implementation of Workload. + """ + + def __init__(self, namespace="default", image="vdbench-image:latest"): + super().__init__(namespace, image) + + def start_workload(self): + log.info(f"Starting Vdbench workload in namespace: {self.namespace}.") + # Implement pod creation logic + + def scale_up_pods(self, desired_count): + log.info(f"Scaling up Vdbench pods to {desired_count}.") + # Implement logic to scale up pods + + def scale_down_pods(self, desired_count): + log.info(f"Scaling down Vdbench pods to {desired_count}.") + # Implement logic to scale down pods + + def stop_workload(self): + log.info("Stopping Vdbench workload.") + # Implement pod deletion logic + + def cleanup_workload(self): + log.info("Cleaning up Vdbench workload.") + # Implement cleanup logic diff --git a/ocs_ci/resiliency/workloads/fio_block_workload_template.yaml b/ocs_ci/resiliency/workloads/fio_block_workload_template.yaml new file mode 100644 index 00000000000..a844ff3445e --- /dev/null +++ b/ocs_ci/resiliency/workloads/fio_block_workload_template.yaml @@ -0,0 +1,42 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ fio_name }} # FIO workload name + namespace: {{ namespace }} # OpenShift namespace +spec: + replicas: 1 + selector: + matchLabels: + app: {{ fio_name }} + template: + metadata: + labels: + app: {{ fio_name }} + spec: + replicas: 1 + containers: + - name: fio + image: quay.io/ocsci/nginx:fio + command: + - fio + args: + - "--name={{ fio_name }}" + - "--rw=randwrite" + - "--size=4G" + - "--bs=4k" + - "--numjobs=4" + - "--runtime=0" + - "--group_reporting" + - "--time_based" + - --output=/path/to/your/result_file.txt + - "--filename=/dev/rbdblock" # Block device path + volumeDevices: # Using volumeDevices for Block mode + - name: fio-volume + devicePath: /dev/rbdblock # Path to block device inside the container + restartPolicy: Always + volumes: + - name: fio-volume + persistentVolumeClaim: + claimName: {{ pvc_claim_name }} # PVC claim name for block device + strategy: + type: Recreate \ No newline at end of file diff --git a/ocs_ci/resiliency/workloads/fio_fs_workload_template.yaml b/ocs_ci/resiliency/workloads/fio_fs_workload_template.yaml new file mode 100644 index 00000000000..9645f7b20f3 --- /dev/null +++ b/ocs_ci/resiliency/workloads/fio_fs_workload_template.yaml @@ -0,0 +1,41 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ fio_name }} # FIO workload name + namespace: {{ namespace }} # OpenShift namespace +spec: + replicas: 1 + selector: + matchLabels: + app: {{ fio_name }} + template: + metadata: + labels: + app: {{ fio_name }} + spec: + replicas: 1 + containers: + - name: fio + image: quay.io/ocsci/nginx:fio + command: + - fio + args: + - "--name={{ fio_name }}" + - "--rw=randwrite" + - "--size=4G" + - "--bs=4k" + - "--numjobs=4" + - "--runtime=0" + - "--group_reporting" + - "--time_based" + - "--filename=/mnt/fio_file_workload" + volumeMounts: + - name: fio-volume + mountPath: /mnt + restartPolicy: Always + volumes: + - name: fio-volume + persistentVolumeClaim: + claimName: {{ pvc_claim_name }} # PVC claim name + strategy: + type: Recreate \ No newline at end of file diff --git a/ocs_ci/utility/vsphere.py b/ocs_ci/utility/vsphere.py index d2a3b2de3ef..7cf3ef822aa 100644 --- a/ocs_ci/utility/vsphere.py +++ b/ocs_ci/utility/vsphere.py @@ -1742,3 +1742,37 @@ def get_volume_path(self, volume_id, datastore_name, datacenter_name): volume_path = vstorage_object.config.backing.filePath logger.debug(f"File path for volume {volume_id} is `{volume_path}`") return volume_path + + def wait_for_vm_status(self, vm, desired_status, timeout=300, interval=10): + """ + Wait for the VM to reach the desired status. + + :param vm: The virtual machine object + :param desired_status: The desired status (e.g., 'poweredOn', 'poweredOff') + :param timeout: Maximum time (in seconds) to wait for the VM to reach the desired status + :param interval: Time (in seconds) between each status check + :return: True if the VM reaches the desired status, False otherwise + """ + import time + + start_time = time.time() + + while time.time() - start_time < timeout: + current_status = vm.runtime.powerState + + # Check if the VM has reached the desired status + if current_status == desired_status: + logger.info( + f"VM {vm.name} has reached the desired status: {desired_status}" + ) + return True + + logger.info( + f"Current status of VM {vm.name} is {current_status}, waiting for {desired_status}..." + ) + time.sleep(interval) + + logger.info( + f"VM {vm.name} did not reach the desired status {desired_status} within the timeout period." + ) + return False diff --git a/tests/conftest.py b/tests/conftest.py index da8fc5f43d5..495660be7f9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -8553,6 +8553,27 @@ def finalizer(): return repo_dir +@pytest.fixture +def fio_resiliency_workload(request): + """ + Pytest fixture to start and stop an FIO workload for resiliency testing. + """ + from ocs_ci.resiliency.resiliency_workload import FioWorkload + + def factory(pvc_obj): + fio = FioWorkload(pvc_obj) + fio.start_workload() + # yield + + def finalizer(): + fio.stop_workload() + + request.addfinalizer(finalizer) + return fio # Return the FioWorkload instance if needed + + return factory + + @pytest.fixture(scope="session", autouse=True) def run_description(): """ diff --git a/tests/resiliency/test_network_failures_scenarios.py b/tests/resiliency/test_network_failures_scenarios.py new file mode 100644 index 00000000000..71cfda32bb0 --- /dev/null +++ b/tests/resiliency/test_network_failures_scenarios.py @@ -0,0 +1,41 @@ +from ocs_ci.resiliency.resiliency_helper import Resiliency + +# from ocs_ci.resiliency.resiliency_workload import workload_object +import logging +from ocs_ci.ocs import constants +from ocs_ci.framework.pytest_customization.marks import green_squad + +log = logging.getLogger(__name__) + + +@green_squad +class TestNetworkFailures: + def test_node_network_failure(self, multi_pvc_factory, fio_resiliency_workload): + """ """ + scenario = "NETWORK_FAILURES" + failure_method = "NODE_NETWORK_DOWN" + + # Create pvcs with different access_modes + size = 5 + access_modes = [constants.ACCESS_MODE_RWO] + cephfs_pvc_objs = multi_pvc_factory( + interface=constants.CEPHFILESYSTEM, + access_modes=access_modes, + size=size, + num_of_pvc=2, + ) + + rbd_pvc_objs = multi_pvc_factory( + interface=constants.CEPHBLOCKPOOL, + access_modes=access_modes, + size=size, + num_of_pvc=2, + ) + + # Starting Workload on the cluster + for pv_obj in cephfs_pvc_objs + rbd_pvc_objs: + fio_resiliency_workload(pv_obj) + + node_failures = Resiliency(scenario, failure_method=failure_method) + node_failures.start() + node_failures.cleanup() diff --git a/tests/resiliency/test_node_failure_scenarios.py b/tests/resiliency/test_node_failure_scenarios.py new file mode 100644 index 00000000000..30380705a0c --- /dev/null +++ b/tests/resiliency/test_node_failure_scenarios.py @@ -0,0 +1,42 @@ +from ocs_ci.resiliency.resiliency_helper import Resiliency + +# from ocs_ci.resiliency.resiliency_workload import workload_object +import logging +from ocs_ci.ocs import constants +from ocs_ci.framework.pytest_customization.marks import green_squad + +log = logging.getLogger(__name__) + + +@green_squad +class TestResiliencyNodeFailures: + def test_node_poweroff(self, multi_pvc_factory, fio_resiliency_workload): + """Resiliency tests""" + scenario = "NODE_FAILURES" + failure_method = "POWEROFF_NODE" + + # Create pvcs with different access_modes + size = 5 + access_modes = [constants.ACCESS_MODE_RWO] + cephfs_pvc_objs = multi_pvc_factory( + interface=constants.CEPHFILESYSTEM, + access_modes=access_modes, + size=size, + num_of_pvc=2, + ) + + rbd_pvc_objs = multi_pvc_factory( + interface=constants.CEPHBLOCKPOOL, + access_modes=access_modes, + size=size, + num_of_pvc=2, + ) + + # Starting Workload on the cluster + for pv_obj in cephfs_pvc_objs + rbd_pvc_objs: + fio_resiliency_workload(pv_obj) + + # Injecting Resiliency failures + node_failures = Resiliency(scenario, failure_method=failure_method) + node_failures.start() + node_failures.cleanup()