-
Notifications
You must be signed in to change notification settings - Fork 170
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Resiliency tests scenarios automation
Signed-off-by: Parag Kamble <[email protected]>
- Loading branch information
Showing
16 changed files
with
904 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,182 @@ | ||
import logging | ||
from ocs_ci.ocs.node import get_node_ips | ||
from abc import ABC, abstractmethod | ||
from ocs_ci.framework import config | ||
from ocs_ci.utility.vsphere import VSPHERE | ||
from ocs_ci.ocs import constants | ||
import random | ||
import time | ||
|
||
log = logging.getLogger(__name__) | ||
|
||
|
||
class ClusterFailures(ABC):
    """Abstract interface for injecting failures into a cluster platform.

    Concrete subclasses supply the platform-specific mechanics for
    powering nodes off, toggling node network interfaces, and splitting
    the network between node groups.
    """

    def __init__(self, cluster_name):
        # Human-readable platform label, used in log messages.
        self.cluster_name = cluster_name

    def random_node_ip(self, node_type="worker"):
        """Return a random node IP of a given node type."""
        candidate_ips = get_node_ips(node_type=node_type)
        return random.choice(candidate_ips)

    @abstractmethod
    def shutdown_node(self, node_ip=None, node_type="worker"):
        """Power off one node (a random node of *node_type* when no IP given)."""

    @abstractmethod
    def change_node_network_interface_state(
        self, node_ip=None, node_type="worker", interface_name=None, connect=False
    ):
        """Connect or disconnect a node's network interface."""

    @abstractmethod
    def network_split(self, nodes):
        """Partition the network between the given groups of nodes."""
|
||
|
||
class VsphereClusterFailures(ClusterFailures):
    """Failure injection for clusters running on vSphere."""

    def __init__(self):
        super().__init__(cluster_name="vSphere")
        # vCenter connection details come from the run configuration.
        self.vsphere_host = config.ENV_DATA["vsphere_server"]
        self.vsphere_password = config.ENV_DATA["vsphere_password"]
        self.vsphere_username = config.ENV_DATA["vsphere_user"]
        self.dc = config.ENV_DATA["vsphere_datacenter"]
        self.vsobj = VSPHERE(
            self.vsphere_host, self.vsphere_username, self.vsphere_password
        )

    def shutdown_node(self, node_ip=None, node_type="worker"):
        """Stop the VM backing a node.

        Args:
            node_ip (str): IP of the node to stop; a random node of
                *node_type* is chosen when omitted.
            node_type (str): Node role used for random selection.
        """
        if not node_ip:
            node_ip = self.random_node_ip(node_type=node_type)
        log.info(f"Shutting down node {node_ip} on vSphere cluster {self.cluster_name}")
        vm = self.vsobj.get_vm_by_ip(node_ip, self.dc)
        self.vsobj.stop_vms([vm])
        log.info(f"Node {node_ip} VM instance stopped.")

    def reboot_node(self, node_ip=None, node_type="worker"):
        """Hard-reboot a node by stopping and restarting its VM."""
        if not node_ip:
            node_ip = self.random_node_ip(node_type=node_type)
        vm = self.vsobj.get_vm_by_ip(node_ip, self.dc)
        vm_name = vm.name
        self.vsobj.stop_vms([vm])
        log.info(f"VM instance {vm_name} is stopped.")
        # Give vSphere time to settle before powering the VM back on.
        time.sleep(20)
        self.vsobj.start_vms([vm])
        log.info(f"VM instance {vm_name} is started.")

    def change_node_network_interface_state(
        self, node_ip=None, node_type="worker", interface_name=None, connect=False
    ):
        """Connect or disconnect the VM network adapter of a node.

        NOTE(review): ``interface_name`` is currently ignored — the VM's
        default adapter is toggled; confirm whether per-interface control
        is needed.
        """
        if not node_ip:
            node_ip = self.random_node_ip(node_type=node_type)
        log.info(
            # Fixed: a space was missing between "interface" and "of",
            # producing "interfaceof node ..." in the log.
            f"{'Connecting' if connect else 'Disconnecting'} network interface"
            f" of node {node_ip} on vSphere cluster {self.cluster_name}"
        )
        self.vsobj.change_vm_network_state(node_ip, self.dc, connect=connect)

    def network_split(self, nodes):
        """Not supported on vSphere yet."""
        log.warning("Function 'network_split' is not implemented.")
        raise NotImplementedError("Function 'network_split' is not implemented.")
|
||
|
||
class IbmCloudClusterFailures(ClusterFailures):
    """Failure injection for clusters running on IBM Cloud (mostly stubs)."""

    def __init__(self):
        super().__init__(cluster_name="IBM Cloud")

    def shutdown_node(self, node_ip=None, node_type="worker"):
        """Log the intended shutdown, then fail: not implemented yet."""
        node_ip = node_ip or self.random_node_ip(node_type=node_type)
        log.info(
            f"Shutting down node {node_ip} on IBM Cloud cluster {self.cluster_name}"
        )
        raise NotImplementedError("IBM Cloud shutdown logic is not implemented.")

    def change_node_network_interface_state(
        self, node_ip=None, node_type="worker", interface_name=None, connect=False
    ):
        """Toggle a node's network interface (placeholder)."""
        node_ip = node_ip or self.random_node_ip(node_type=node_type)
        log.info(
            f"{'Connecting' if connect else 'Disconnecting'} network interface"
            f" of node {node_ip} on IBM Cloud cluster {self.cluster_name}"
        )
        # Add IBM Cloud-specific logic here

    def network_split(self, nodes):
        """Simulate a network split between node groups (placeholder)."""
        log.info(
            f"Simulating network split on nodes {nodes} in IBM Cloud cluster {self.cluster_name}"
        )
        # Add IBM Cloud-specific network split logic
|
||
|
||
class AwsClusterFailures(ClusterFailures):
    """Failure injection for clusters running on AWS (mostly stubs)."""

    def __init__(self):
        super().__init__(cluster_name="AWS")

    def shutdown_node(self, node_ip=None, node_type="worker"):
        """Shut down a node (placeholder — AWS logic not implemented yet)."""
        if not node_ip:
            node_ip = self.random_node_ip(node_type=node_type)
        log.info(f"Shutting down node {node_ip} on AWS cluster {self.cluster_name}")
        # Add AWS-specific shutdown logic

    def change_node_network_interface_state(
        self, node_ip=None, node_type="worker", interface_name=None, connect=False
    ):
        """Toggle a node's network interface (placeholder)."""
        if not node_ip:
            node_ip = self.random_node_ip(node_type=node_type)
        log.info(
            # Fixed: a space was missing between "interface" and "of",
            # producing "interfaceof node ..." in the log.
            f"{'Connecting' if connect else 'Disconnecting'} network interface"
            f" of node {node_ip} on AWS cluster {self.cluster_name}"
        )
        # Add AWS-specific logic here

    def network_split(self, nodes):
        """Simulate a network split between node groups (placeholder)."""
        log.info(
            f"Simulating network split on nodes {nodes} in AWS cluster {self.cluster_name}"
        )
        # Add AWS-specific network split logic
|
||
|
||
class BaremetalClusterFailures(ClusterFailures):
    """Failure injection for bare-metal clusters (mostly stubs)."""

    def __init__(self):
        super().__init__(cluster_name="Bare Metal")

    def shutdown_node(self, node_ip=None, node_type="worker"):
        """Shut down a node (placeholder — bare-metal logic not implemented)."""
        node_ip = node_ip or self.random_node_ip(node_type=node_type)
        log.info(
            f"Shutting down node {node_ip} on Bare Metal cluster {self.cluster_name}"
        )
        # Add bare metal-specific shutdown logic

    def change_node_network_interface_state(
        self, node_ip=None, node_type="worker", interface_name=None, connect=False
    ):
        """Toggle a node's network interface (placeholder)."""
        node_ip = node_ip or self.random_node_ip(node_type=node_type)
        log.info(
            f"{'Connecting' if connect else 'Disconnecting'} network interface"
            f" of node {node_ip} on Bare Metal cluster {self.cluster_name}"
        )
        # Add bare metal-specific logic here

    def network_split(self, nodes):
        """Simulate a network split between node groups (placeholder)."""
        log.info(
            f"Simulating network split on nodes {nodes} in Bare Metal cluster {self.cluster_name}"
        )
        # Add bare metal-specific network split logic
|
||
|
||
def get_cluster_object():
    """Return the ClusterFailures implementation for the configured platform.

    Raises:
        ValueError: when the configured platform is not supported.
    """
    platform = config.ENV_DATA["platform"].lower()
    # Dispatch table instead of an if/elif chain.
    factories = {
        constants.VSPHERE_PLATFORM: VsphereClusterFailures,
        constants.AWS_PLATFORM: AwsClusterFailures,
        constants.IBMCLOUD_PLATFORM: IbmCloudClusterFailures,
        constants.BAREMETAL_PLATFORM: BaremetalClusterFailures,
    }
    factory = factories.get(platform)
    if factory is None:
        raise ValueError(f"Unsupported platform: {platform}")
    return factory()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
# Resiliency scenario: network failures, paired with FIO workloads.
NETWORK_FAILURES:
  # Wait for nodes to rejoin the cluster after connectivity is restored.
  WAIT_TILL_NODE_JOIN: true
  FAILURES:
    - NODE_NETWORK_DOWN:
        # Seconds the node network stays down before being restored.
        NETWORK_FAILURE_DURATION: 30
        # Empty labels list means no node-label filtering.
        node_selector:
          - labels: []
        # Range of nodes to affect, expressed as "min-max".
        num_reboot_nodes: "0-3"
    - POD_NETWORK_FAILURE:
        node_selector:
          - labels: []
# Workloads to keep running while failures are injected.
WORKLOAD:
  FIO:
    - CEPHFS:
        template: "fio_cephfs_template.yaml"
        name: "cephfs-fio-workload"
    - BLOCK:
        template: "fio_block_template.yaml"
        name: "block-fio-workload"
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
# Resiliency scenario: node failures, paired with FIO workloads.
NODE_FAILURES:
  # Wait for nodes to rejoin the cluster after each failure.
  WAIT_TILL_NODE_JOIN: true
  FAILURES:
    - POWEROFF_NODE:
        # Node roles eligible for power-off; one is picked per iteration.
        NODE_TYPE:
          - "master"
          - "worker"
        # Empty labels list means no node-label filtering.
        node_selector:
          - labels: []
        # Number of power-off/reboot cycles to run.
        ITERATION: 10
    - NODE_DRAIN:
        node_selector:
          - labels: []
# Workloads to keep running while failures are injected.
WORKLOAD:
  FIO:
    - CEPHFS:
        template: "fio_cephfs_template.yaml"
        name: "cephfs-fio-workload"
    - BLOCK:
        template: "fio_block_template.yaml"
        name: "block-fio-workload"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
# Top-level resiliency run configuration.
RESILIENCY:
  RUN_CONFIG:
    # Abort the run as soon as Ceph reports an unhealthy state.
    STOP_WHEN_CEPH_UNHEALTHY: true
    # NOTE(review): "True" vs "true" — both parse as boolean in YAML 1.1,
    # but casing is inconsistent with the other config files.
    ITERATE_SCENARIOS: True
  # Scenario config files to execute, in order.
  FAILURE_SCENARIOS:
    - NODE_FAILURES
    - NETWORK_FAILURES
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
import logging | ||
import time | ||
|
||
from ocs_ci.resiliency.cluster_failures import get_cluster_object | ||
|
||
log = logging.getLogger(__name__) | ||
|
||
|
||
class NetworkFailures:
    """Run network failure scenarios described by a failure-data mapping."""

    SCENARIO_NAME = "NETWORK_FAILURES"
    # Failure-case key -> name of the bound method that handles it.
    FAILURE_METHODS = {
        "POD_NETWORK_FAILURE": "_run_pod_network_failures",
        "NODE_NETWORK_DOWN": "_run_node_network_failure",
    }

    def __init__(self, failure_data):
        self.scenario_name = self.SCENARIO_NAME
        self.failure_data = failure_data
        self.cluster_obj = get_cluster_object()

    def failure_case(self):
        """Return the first failure case key from failure_data."""
        if not self.failure_data:
            raise ValueError("No failure case provided in failure_data.")
        return next(iter(self.failure_data))

    def run(self):
        """Look up and invoke the handler for the configured failure case."""
        case = self.failure_case()
        handler_name = self.FAILURE_METHODS.get(case)
        handler = getattr(self, handler_name, None) if handler_name else None
        if handler is None:
            raise NotImplementedError(
                f"Failure method for case '{case}' is not implemented."
            )
        handler()

    def _run_pod_network_failures(self):
        """Handle Pod Network Failure scenario."""
        log.info("Bringing down Pod network interface.")
        # Implement pod network failure logic here

    def _run_node_network_failure(self):
        """Take one node of each role offline for a minute, then restore it."""
        log.info("Bringing down Node network interfaces.")
        for node_type in ("master", "worker"):
            node_ip = self.cluster_obj.random_node_ip(node_type)
            self.cluster_obj.change_node_network_interface_state(
                node_ip=node_ip, node_type=node_type, connect=False
            )
            try:
                time.sleep(60)  # Simulate network being down
            finally:
                # Always restore connectivity, even if interrupted mid-sleep.
                self.cluster_obj.change_node_network_interface_state(
                    node_ip=node_ip, node_type=node_type, connect=True
                )
                log.info(f"Network interface on node {node_ip} restored.")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
import logging | ||
import random | ||
from ocs_ci.utility.utils import ceph_health_check | ||
from ocs_ci.resiliency.cluster_failures import get_cluster_object | ||
|
||
log = logging.getLogger(__name__) | ||
|
||
|
||
class NodeFailures:
    """Run node-level failure scenarios (power off, drain) from failure data."""

    SCENARIO_NAME = "NODE_FAILURES"
    # Failure-case key -> name of the bound method that handles it.
    FAILURE_METHODS = {
        "POWEROFF_NODE": "_run_poweroff_node",
        "NODE_DRAIN": "_run_node_drain",
    }

    def __init__(self, failure_data):
        self.failure_data = failure_data
        self.failure_case_name = self._get_failure_case()
        self.scenario_name = self.SCENARIO_NAME
        self.cluster_obj = get_cluster_object()

    def _get_failure_case(self):
        """Return the first failure-case key, or None when data is empty."""
        if not self.failure_data:
            log.error("Failure data is empty.")
            return None
        return next(iter(self.failure_data))

    def run(self):
        """Execute the configured failure scenario, then run post-checks."""
        if not self.failure_case_name:
            log.error("No valid failure case name found. Exiting run method.")
            return

        handler_name = self.FAILURE_METHODS.get(self.failure_case_name)
        handler = getattr(self, handler_name, None) if handler_name else None
        if handler is None:
            raise NotImplementedError(
                f"Failure method for '{self.failure_case_name}' is not implemented."
            )
        handler()
        self._post_scenario_checks()

    def _run_poweroff_node(self):
        """Reboot randomly chosen nodes for the configured iteration count."""
        log.info("Running Failure Case: POWEROFF_NODE.")
        case_config = self.failure_data[self.failure_case_name]
        node_types = case_config.get("NODE_TYPE", [])
        iterations = case_config.get("ITERATION", 0)

        # NOTE(review): reboot_node is only defined on the vSphere cluster
        # implementation — confirm support before running on other platforms.
        for _ in range(iterations):
            node_type = random.choice(node_types)
            log.info(f"Rebooting {node_type} node.")
            self.cluster_obj.reboot_node(node_type=node_type)
            log.info(f"{node_type.capitalize()} node rebooted.")

    def _run_node_drain(self):
        """Simulate draining of nodes."""
        log.info("Running Failure Case: NODE_DRAIN.")
        # Implement node drain logic here
        log.info("Draining node...")

    def _post_scenario_checks(self):
        """Perform post-scenario checks to ensure the cluster is healthy."""
        log.info(f"Running post-scenario checks for {self.scenario_name}.")
        log.info("Verifying that Ceph health is OK (retrying if necessary).")
        ceph_health_check(tries=45, delay=60)
Oops, something went wrong.