From 544f687d384ab721c20afde6d744b2d2a1756d16 Mon Sep 17 00:00:00 2001 From: "Liu, Xiaopeng (133)" Date: Fri, 8 May 2020 15:40:13 +0800 Subject: [PATCH] Add ec2_os actions, enable os level burn_cpu,network,kill process function via aws ssm Signed-off-by: Liu, Xiaopeng (133) --- MANIFEST.in | 3 +- chaosaws/__init__.py | 2 + chaosaws/ec2_os/__init__.py | 57 ++ chaosaws/ec2_os/actions.py | 717 ++++++++++++++++++ chaosaws/ec2_os/constants.py | 16 + chaosaws/ec2_os/probes.py | 201 +++++ chaosaws/ec2_os/scripts/burn_io.sh | 26 + chaosaws/ec2_os/scripts/cpu_stress_test.ps1 | 18 + chaosaws/ec2_os/scripts/cpu_stress_test.sh | 21 + .../ec2_os/scripts/ensure_tc_installed.sh | 10 + .../ec2_os/scripts/ensure_tc_uninstalled.sh | 11 + chaosaws/ec2_os/scripts/fill_disk.ps1 | 13 + chaosaws/ec2_os/scripts/fill_disk.sh | 13 + chaosaws/ec2_os/scripts/grep_pid.sh | 3 + chaosaws/ec2_os/scripts/kill_process.sh | 16 + chaosaws/ec2_os/scripts/killall_processes.sh | 16 + chaosaws/ec2_os/scripts/network_advanced.sh | 44 ++ chaosaws/ec2_os/scripts/network_latency.sh | 10 + chaosaws/ec2_os/scripts/run_cmd.ps1 | 12 + chaosaws/ec2_os/scripts/run_cmd.sh | 7 + setup.py | 1 + tests/ec2_os/test_ec2_os_actions.py | 563 ++++++++++++++ tests/ec2_os/test_ec2_os_probes.py | 185 +++++ 23 files changed, 1964 insertions(+), 1 deletion(-) create mode 100644 chaosaws/ec2_os/__init__.py create mode 100644 chaosaws/ec2_os/actions.py create mode 100644 chaosaws/ec2_os/constants.py create mode 100644 chaosaws/ec2_os/probes.py create mode 100644 chaosaws/ec2_os/scripts/burn_io.sh create mode 100644 chaosaws/ec2_os/scripts/cpu_stress_test.ps1 create mode 100644 chaosaws/ec2_os/scripts/cpu_stress_test.sh create mode 100644 chaosaws/ec2_os/scripts/ensure_tc_installed.sh create mode 100644 chaosaws/ec2_os/scripts/ensure_tc_uninstalled.sh create mode 100644 chaosaws/ec2_os/scripts/fill_disk.ps1 create mode 100644 chaosaws/ec2_os/scripts/fill_disk.sh create mode 100644 chaosaws/ec2_os/scripts/grep_pid.sh create mode 
# chaosaws/ec2_os/__init__.py
def construct_script_content(action: str = None,
                             os_type: str = None,
                             parameters: dict = None):
    """
    Build the text of the script that is sent to an instance through SSM.

    The result is made of three parts joined by newlines: one
    ``name='value'`` assignment per entry of ``parameters`` (names are
    prefixed with ``$`` on Windows), the inline command lines of a
    ``run_cmd`` action, and the content of the bundled file
    ``scripts/<action>.sh`` (Linux) or ``scripts/<action>.ps1`` (Windows).

    :param action: base name of the script stored in the ``scripts`` folder
    :param os_type: { OS_LINUX | OS_WINDOWS }
    :param parameters: values exposed to the script as variables. For the
        ``run_cmd`` action it must hold a ``cmd`` entry with the command
        lines to run. The mapping is left untouched.
    :return: the script content, as a single string
    :raises FailedActivity: when ``os_type`` is not supported, or when the
        ``run_cmd`` action is requested without a ``cmd`` entry
    """
    if os_type == OS_LINUX:
        file_suffix = ".sh"
        p_delimiter = ""
        cmdline_delimiter = " && "
    elif os_type == OS_WINDOWS:
        file_suffix = ".ps1"
        p_delimiter = "$"
        cmdline_delimiter = "\n"
    else:
        raise FailedActivity(
            "Cannot find corresponding script for {} on OS: {}".format(
                action, os_type))

    # Work on a copy so that removing 'cmd' below never alters the
    # dictionary owned by the caller.
    variables = dict(parameters) if parameters is not None else None

    cmdline_param = ""
    if action == "run_cmd":
        commands = variables.pop('cmd', None) if variables else None
        if commands is None:
            raise FailedActivity(
                "The run_cmd action requires a 'cmd' parameter holding "
                "the command lines to run")
        if isinstance(commands, str):
            # A single command given as a plain string must not be joined
            # character by character.
            commands = [commands]
        cmdline_param = cmdline_delimiter.join(commands)

    cmd_param = ""
    if variables is not None:
        # NOTE(review): values are wrapped in single quotes without any
        # escaping, so a value containing a quote changes the generated
        # script. Confirm that parameters only come from trusted
        # experiment files.
        cmd_param = '\n'.join(
            "{}{}='{}'".format(p_delimiter, key, value)
            for key, value in variables.items())
    else:
        logger.info("No parameter parsed, return default script content")

    script_name = action + file_suffix
    script_path = os.path.join(os.path.dirname(__file__),
                               "scripts", script_name)
    with open(script_path) as file:
        script_content = file.read()

    # Variables first, then the inline commands, then the script itself.
    return cmd_param + "\n" + cmdline_param + "\n" + script_content


# chaosaws/ec2_os/actions.py
def burn_cpu(instance_ids: List[str] = None,
             execution_duration: str = "60",
             configuration: Configuration = None,
             secrets: Secrets = None) -> List[AWSResponse]:
    """
    Run the CPU stress script on every given instance through AWS SSM.

    Parameters
    ----------
    instance_ids : List[str]
        Ids of the EC2 instances to stress. Required.
    execution_duration : str, optional
        Duration of the stress test (in seconds) that generates high CPU
        usage. Defaults to 60 seconds.
    configuration : Configuration
        Chaostoolkit Configuration
    secrets : Secrets
        Chaostoolkit Secrets

    Returns
    -------
    List[AWSResponse]
        The result returned for each instance, in the order of
        ``instance_ids``.

    Raises
    ------
    FailedActivity
        When ``instance_ids`` is missing or when the script cannot be run
        on one of the instances.
    """

    logger.debug(
        "Start burn_cpu: configuration='{}', instance_ids='{}'".format(
            configuration, instance_ids))
    if instance_ids is None:
        # Fail with a clear message instead of iterating over None.
        raise FailedActivity("you must specify the instance_ids")

    response = []
    try:
        for instance in instance_ids:
            param = {
                "duration": execution_duration,
                "instance_id": instance,
            }
            response.append(
                __linux_from_default(instance_id=instance,
                                     configuration=configuration,
                                     secrets=secrets,
                                     action=BURN_CPU,
                                     parameters=param)
            )
        return response
    except Exception as x:
        raise FailedActivity(
            "failed issuing a execute of shell script via AWS SSM {}".format(
                str(x)
            )) from x


def fill_disk(instance_ids: List[str] = None,
              execution_duration: str = "120",
              size: str = "1000",
              configuration: Configuration = None,
              secrets: Secrets = None) -> List[AWSResponse]:
    """
    Run the fill-disk script on every given instance through AWS SSM.

    Parameters
    ----------
    instance_ids : List[str]
        Ids of the EC2 instances whose disk is filled. Required.
    execution_duration : str, optional
        Lifetime of the file created. Defaults to 120 seconds.
    size : str
        Size of the file created on the disk. Defaults to 1000 MB.
    configuration : Configuration
        Chaostoolkit Configuration
    secrets : Secrets
        Chaostoolkit Secrets

    Returns
    -------
    List[AWSResponse]
        The result returned for each instance, in the order of
        ``instance_ids``.

    Raises
    ------
    FailedActivity
        When ``instance_ids`` is missing or when the script cannot be run
        on one of the instances.
    """

    logger.debug(
        "Start fill_disk: configuration='{}', instance_ids='{}'".format(
            configuration, instance_ids))
    if instance_ids is None:
        # Fail with a clear message instead of iterating over None.
        raise FailedActivity("you must specify the instance_ids")

    response = []
    try:
        for instance in instance_ids:
            param = {
                "duration": execution_duration,
                "instance_id": instance,
                "size": size,
            }
            response.append(
                __linux_from_default(instance_id=instance,
                                     configuration=configuration,
                                     secrets=secrets,
                                     action=FILL_DISK,
                                     parameters=param)
            )
        return response
    except Exception as x:
        raise FailedActivity(
            "failed issuing a execute of shell script via AWS SSM {}".format(
                str(x)
            )) from x
def network_advanced(instance_ids: List[str] = None,
                     execution_duration: str = "60",
                     command: str = "",
                     device: str = "eth0",
                     configuration: Configuration = None,
                     secrets: Secrets = None) -> List[AWSResponse]:
    """
    Run a customized Linux tc operation on every given instance.
    For windows, no solution as for now.

    Parameters
    ----------
    instance_ids : List[str]
        Ids of the EC2 instances to act on. Required.
    execution_duration : str, optional
        Duration in seconds, handed to the script as ``duration``.
        Defaults to 60 seconds.
    command : str, optional
        advanced command line in tc, e.g. loss 5% or corrupt 10% etc.
        It is handed to the script as ``param``.
    device : str, optional
        default to eth0, or specify the device name, e.g. enps0
    configuration : Configuration
        Chaostoolkit Configuration
    secrets : Secrets
        Chaostoolkit Secrets

    Returns
    -------
    List[AWSResponse]
        The result returned for each instance, in the order of
        ``instance_ids``.

    Raises
    ------
    FailedActivity
        When ``instance_ids`` is missing or when the script cannot be run
        on one of the instances.
    """

    logger.debug(
        "Start network_advanced: configuration='{}', instance_ids='{}'".format(
            configuration, instance_ids))
    if instance_ids is None:
        # Fail with a clear message instead of iterating over None.
        raise FailedActivity("you must specify the instance_ids")

    response = []
    try:
        for instance in instance_ids:
            param = {
                "duration": execution_duration,
                "instance_id": instance,
                "param": command,
                "device": device,
            }
            response.append(
                __linux_from_default(instance_id=instance,
                                     configuration=configuration,
                                     secrets=secrets,
                                     action=NETWORK_UTIL,
                                     parameters=param)
            )
        return response
    except Exception as x:
        raise FailedActivity(
            "failed issuing a execute of shell script via AWS SSM {}".format(
                str(x)
            )) from x
def network_corruption(instance_ids: List[str] = None,
                       execution_duration: str = "60",
                       device: str = "eth0",
                       corruption_ratio: str = "5%",
                       configuration: Configuration = None,
                       secrets: Secrets = None) -> List[AWSResponse]:
    """
    Run a network corruption operation on every given instance via
    Linux - TC. For windows, no solution as for now.

    Parameters
    ----------
    instance_ids : List[str]
        Ids of the EC2 instances to act on. Required.
    execution_duration : str, optional
        Duration in seconds, handed to the script as ``duration``.
        Defaults to 60 seconds.
    device : str, optional
        default to eth0, or specify the device name, e.g. enps0
    corruption_ratio : str
        Ratio appended to the tc ``corrupt`` keyword, e.g. "30%".
        Defaults to "5%".
    configuration : Configuration
        Chaostoolkit Configuration
    secrets : Secrets
        Chaostoolkit Secrets

    Returns
    -------
    List[AWSResponse]
        The result returned for each instance, in the order of
        ``instance_ids``.

    Raises
    ------
    FailedActivity
        When ``instance_ids`` is missing or when the script cannot be run
        on one of the instances.
    """

    logger.debug(
        "Start network_corruption: configuration='{}', "
        "instance_ids='{}'".format(configuration, instance_ids))
    if instance_ids is None:
        # Fail with a clear message instead of iterating over None.
        raise FailedActivity("you must specify the instance_ids")

    response = []
    try:
        for instance in instance_ids:
            param = {
                "duration": execution_duration,
                "instance_id": instance,
                "param": "corrupt " + corruption_ratio,
                "device": device,
            }
            response.append(
                __linux_from_default(instance_id=instance,
                                     configuration=configuration,
                                     secrets=secrets,
                                     action=NETWORK_UTIL,
                                     parameters=param)
            )
        return response
    except Exception as x:
        raise FailedActivity(
            "failed issuing a execute of shell script via AWS SSM {}".format(
                str(x)
            )) from x


def network_latency(instance_ids: List[str] = None,
                    execution_duration: str = "60",
                    device: str = "eth0",
                    delay: str = "1000ms",
                    variance: str = "500ms",
                    ratio: str = "",
                    configuration: Configuration = None,
                    secrets: Secrets = None) -> List[AWSResponse]:
    """
    Increases the response time of the virtual machine.

    Parameters
    ----------
    instance_ids : List[str]
        Ids of the EC2 instances to act on. Required.
    execution_duration : str, optional
        Duration in seconds, handed to the script as ``duration``.
        Defaults to 60 seconds.
    device : str, optional
        default to eth0, or specify the device name, e.g. enps0
    delay : str
        Added delay in ms. Defaults to 1000ms.
    variance : str
        Variance of the delay in ms. Defaults to 500ms.
    ratio : str, optional
        Value appended after the variance in the tc ``delay`` command,
        e.g. "5%". Defaults to "".
        NOTE(review): presumably the netem correlation percentage -
        confirm against the tc documentation.
    configuration : Configuration
        Chaostoolkit Configuration
    secrets : Secrets
        Chaostoolkit Secrets

    Returns
    -------
    List[AWSResponse]
        The result returned for each instance, in the order of
        ``instance_ids``.

    Raises
    ------
    FailedActivity
        When ``instance_ids`` is missing or when the script cannot be run
        on one of the instances.
    """
    logger.debug(
        "Start network_latency: configuration='{}', instance_ids='{}'".format(
            configuration, instance_ids))
    if instance_ids is None:
        # Fail with a clear message instead of iterating over None.
        raise FailedActivity("you must specify the instance_ids")

    response = []
    try:
        for instance in instance_ids:
            param = {
                "duration": execution_duration,
                "instance_id": instance,
                "param": "delay " + delay + " " + variance + " " + ratio,
                "device": device,
            }
            response.append(
                __linux_from_default(instance_id=instance,
                                     configuration=configuration,
                                     secrets=secrets,
                                     action=NETWORK_UTIL,
                                     parameters=param)
            )
        return response
    except Exception as x:
        raise FailedActivity(
            "failed issuing a execute of shell script via AWS SSM {}".format(
                str(x)
            )) from x
def kill_process(instance_ids: List[str] = None,
                 execution_duration: str = "1",
                 process: str = None,
                 signal: str = "",
                 configuration: Configuration = None,
                 secrets: Secrets = None) -> List[AWSResponse]:
    """
    kill -s [signal_as_below] [processname]
    HUP INT QUIT ILL TRAP ABRT EMT FPE KILL BUS SEGV SYS PIPE ALRM TERM URG
    STOP TSTP CONT CHLD TTIN TTOU IO XCPU XFSZ VTALRM PROF WINCH INFO USR1 USR2

    Parameters
    ----------
    instance_ids : List[str]
        Ids of the EC2 instances to act on. Required.
    execution_duration : str, optional default to 1 second
        Rarely useful since the process is usually killed without any
        delay. A larger value lets the action wait longer, which extends
        the experiment when more time is needed to watch the observation
        metrics.
    process : str
        process or pid that the kill command accepts
    signal : str , default to ""
        The signal of kill command, use kill -l for help
    configuration : Configuration
        Chaostoolkit Configuration
    secrets : Secrets
        Chaostoolkit Secrets

    Returns
    -------
    List[AWSResponse]
        The result returned for each instance, in the order of
        ``instance_ids``.

    Raises
    ------
    FailedActivity
        When ``instance_ids`` is missing or when the script cannot be run
        on one of the instances.
    """
    # The message used to name network_latency, a copy-paste leftover.
    logger.debug(
        "Start kill_process: configuration='{}', instance_ids='{}'".format(
            configuration, instance_ids))
    if instance_ids is None:
        # Fail with a clear message instead of iterating over None.
        raise FailedActivity("you must specify the instance_ids")

    response = []
    try:
        for instance in instance_ids:
            param = {
                "duration": execution_duration,
                "instance_id": instance,
                "process_name": process,
                "signal": signal,
            }
            response.append(
                __linux_from_default(instance_id=instance,
                                     configuration=configuration,
                                     secrets=secrets,
                                     action=KILL_PROCESS,
                                     parameters=param)
            )
        return response
    except Exception as x:
        raise FailedActivity(
            "failed issuing a execute of shell script via AWS SSM {}".format(
                str(x)
            )) from x


def run_cmd(instance_ids: List[str] = None,
            execution_duration: str = "60",
            cmd: List[str] = None,
            configuration: Configuration = None,
            secrets: Secrets = None) -> List[AWSResponse]:
    """
    Run the given command lines on every given instance through AWS SSM.

    Parameters
    ----------
    instance_ids : List[str]
        Ids of the EC2 instances to act on. Required.
    execution_duration : str, optional
        Duration in seconds, handed to the script as ``duration``.
        Defaults to 60 seconds.
    cmd : List[str]
        Lines of your commands. Required.
    configuration : Configuration
        Chaostoolkit Configuration
    secrets : Secrets
        Chaostoolkit Secrets

    Returns
    -------
    List[AWSResponse]
        The result returned for each instance, in the order of
        ``instance_ids``.

    Raises
    ------
    FailedActivity
        When ``instance_ids`` or ``cmd`` is missing, or when the commands
        cannot be run on one of the instances.
    """

    logger.debug(
        "Start run_cmd: configuration='{}', instance_ids='{}'".format(
            configuration, instance_ids))
    if instance_ids is None:
        # Fail with a clear message instead of iterating over None.
        raise FailedActivity("you must specify the instance_ids")
    if cmd is None:
        # Without this check the failure only shows up later, as an
        # obscure error raised while joining the command lines.
        raise FailedActivity("you must specify the cmd")

    response = []
    try:
        for instance in instance_ids:
            param = {
                "duration": execution_duration,
                "instance_id": instance,
                "cmd": cmd,
            }
            response.append(
                __linux_from_default(instance_id=instance,
                                     configuration=configuration,
                                     secrets=secrets,
                                     action=RUN_CMD,
                                     parameters=param)
            )
        return response
    except Exception as x:
        raise FailedActivity(
            "failed issuing a execute of shell script via AWS SSM {}".format(
                str(x)
            )) from x
###############################################################################
# Private helper functions
###############################################################################
def __linux_from_default(instance_id: str = None,
                         action: str = None,
                         parameters: Dict[str, Any] = None,
                         configuration: Configuration = None,
                         secrets: Secrets = None) -> AWSResponse:
    """
    Run one of the bundled scripts on a single instance through AWS SSM
    and wait for its result.

    The script is built by ``construct_script_content`` for the OS type
    reported by ``describe_os_type``. The command is then polled until it
    leaves its in-flight statuses, or until ``parameters['duration']``
    seconds plus ``SSMDEFAULTNETWORKLAGACY`` have been spent waiting.

    :param instance_id: id of the EC2 instance that runs the script
    :param action: base name of the bundled script
    :param parameters: variables handed to the script; ``duration`` is
        required and must be convertible to an integer
    :param configuration: Chaostoolkit Configuration
    :param secrets: Chaostoolkit Secrets
    :return: the output of the script plugin, without trailing newlines,
        or None when the invocation reports no such plugin
    :raises FailedActivity: when ``instance_id`` is missing, when the wait
        exceeds the time budget, or when any AWS call fails
    """
    if not instance_id:
        raise FailedActivity(
            "you must specify the instance_id"
        )

    default_timeout = int(parameters['duration'])
    max_wait = default_timeout + SSMDEFAULTNETWORKLAGACY
    # Statuses reported while a command has not reached a final state.
    # "Pending" must be waited on as well, otherwise the output is read
    # before the script has even started.
    in_flight = ("Pending", "InProgress", "Delayed")
    client = aws_client("ssm", configuration, secrets)
    try:
        platform = describe_os_type(instance_id, configuration, secrets)
        if str(platform).lower() == "windows":
            # A PowerShell script needs the PowerShell document.
            os_type = OS_WINDOWS
            document_name = "AWS-RunPowerShellScript"
            plugin_name = "aws:runPowerShellScript"
        else:
            os_type = OS_LINUX
            document_name = "AWS-RunShellScript"
            plugin_name = "aws:runShellScript"

        res_send_command = client.send_command(
            InstanceIds=[instance_id],
            DocumentName=document_name,
            Parameters={
                'commands':
                    [construct_script_content(action, os_type, parameters)]
            },
        )
        cmd_id = res_send_command["Command"]["CommandId"]
        logger.info("ssm run command is sent, id {}".format(cmd_id))

        totalwait = 0
        # Start with long pauses and halve them down to one second.
        interval = default_timeout / 2
        while True:
            res_list = client.list_command_invocations(
                CommandId=cmd_id,
                Details=True
            )
            try:
                cp = res_list['CommandInvocations'][0]['CommandPlugins'][0]
            except IndexError:
                # The invocation is not listed yet: retry shortly.
                pause = 1
            else:
                if cp['Status'] not in in_flight:
                    break
                pause = interval
                interval = interval / 2 if interval > 1 else 1
            time.sleep(pause)
            totalwait += pause
            if totalwait > max_wait:
                raise FailedActivity(
                    "Script exceeded default timeout {}"
                    .format(default_timeout))

        for command_invocation in res_list['CommandInvocations']:
            for invocation in command_invocation['CommandPlugins']:
                if invocation['Name'] == plugin_name:
                    output = invocation['Output'].rstrip('\n')
                    logger.info("ssm run command status {}"
                                .format(invocation['Status']))
                    logger.info("ssm rum command result \n{}"
                                .format(output))
                    return output
    except FailedActivity:
        # Already explicit (e.g. the timeout above): do not wrap it again.
        raise
    except Exception as x:
        raise FailedActivity(
            "failed issuing a execute of shell script:\n{}".format(x)) from x


def __linux_from_internet(instance_ids: List[str] = None,
                          source_info: str = None,
                          command_line: List[str] = None,
                          execution_timeout: str = "60",
                          configuration: Configuration = None,
                          secrets: Secrets = None) -> List[AWSResponse]:
    """
    Download a script and run it on the given instances with the
    AWS-RunRemoteScript SSM document.

    :param instance_ids: ids of the EC2 instances that run the script
    :param source_info: location of the script; it is always sent with
        the ``S3`` source type
    :param command_line: command lines that start the downloaded script
    :param execution_timeout: time in seconds granted to the script, also
        used (plus ``SSMDEFAULTNETWORKLAGACY``) as the waiting budget
    :param configuration: Chaostoolkit Configuration
    :param secrets: Chaostoolkit Secrets
    :return: the output of every ``runShellScript`` plugin, without
        trailing newlines
    :raises FailedActivity: when ``instance_ids`` is missing, when the
        wait exceeds the time budget, or when any AWS call fails
    """
    import json

    if not instance_ids:
        raise FailedActivity(
            "you must specify the instance_id"
        )

    ssm_document = 'AWS-RunRemoteScript'
    params = {"sourceType": ["S3"],
              # json.dumps keeps the value valid JSON whatever the path
              # contains, unlike swapping quotes in the repr of a dict.
              "sourceInfo": [json.dumps({"path": source_info})],
              "executionTimeout": [execution_timeout],
              "commandLine": command_line}
    max_wait = int(execution_timeout) + SSMDEFAULTNETWORKLAGACY
    # Statuses reported while a command has not reached a final state.
    in_flight = ("Pending", "InProgress", "Delayed")
    response = []
    client = aws_client("ssm", configuration, secrets)
    try:
        res_send_command = client.send_command(
            InstanceIds=instance_ids,
            DocumentName=ssm_document,
            Parameters=params
        )
        cmd_id = res_send_command['Command']['CommandId']
        logger.info("ssm run command is sent, id {}".format(cmd_id))

        waited = 0
        while True:
            res_list = client.list_command_invocations(
                CommandId=cmd_id,
                Details=True
            )
            invocations = res_list['CommandInvocations']
            # Wait for the invocations to finish, not merely to be listed,
            # otherwise the collected output is still empty.
            if invocations and all(
                    item['Status'] not in in_flight for item in invocations):
                break
            time.sleep(1)
            waited += 1
            if waited > max_wait:
                raise FailedActivity(
                    "Script exceeded default timeout {}"
                    .format(execution_timeout))

        for command_invocation in invocations:
            for invocation in command_invocation['CommandPlugins']:
                # ===============================================
                # TODO if in Windows
                # if invocation['Name'] == 'runPowerShellScript':
                # ===============================================
                if invocation['Name'] == 'runShellScript':
                    output = invocation['Output'].rstrip('\n')
                    logger.warning(output)
                    response.append(output)
        return response
    except FailedActivity:
        # Already explicit (e.g. the timeout above): do not wrap it again.
        raise
    except Exception as x:
        raise FailedActivity(
            "failed issuing a execute of shell script:\n{}".format(x)) from x
def describe_os_type(instance_id, configuration, secrets):
    """
    Return the platform of an EC2 instance, in lower case.

    The value of the ``Platform`` key of the instance description is
    returned. When the description has no such key, "linux" is returned.

    :param instance_id: id of the EC2 instance to describe
    :param configuration: Chaostoolkit Configuration
    :param secrets: Chaostoolkit Secrets
    :return: the lower-cased platform name, "linux" by default
    :raises FailedActivity: when the description holds no instance
    """
    res = describe_instance(instance_id, configuration, secrets)
    try:
        instance = res['Reservations'][0]['Instances'][0]
    except (KeyError, IndexError) as x:
        raise FailedActivity(
            "no instance found for id {}".format(instance_id)) from x

    try:
        platform = instance['Platform']
    except KeyError:
        logger.warning("No Platform key, so it is Linux")
        platform = "linux"
    # Lower-cased so that callers comparing with "windows" always match.
    return str(platform).lower()


def describe_instance(instance_id: str,
                      configuration: Configuration = None,
                      secrets: Secrets = None) -> AWSResponse:
    """
    Return the EC2 ``describe_instances`` response for one instance.

    :param instance_id: id of the EC2 instance to describe
    :param configuration: Chaostoolkit Configuration
    :param secrets: Chaostoolkit Secrets
    :return: the response of the EC2 client, unchanged
    """
    client = aws_client('ec2', configuration, secrets)

    return client.describe_instances(InstanceIds=[
        instance_id,
    ])


def ensure_tc_installed(instance_ids: List[str] = None,
                        configuration: Configuration = None,
                        secrets: Secrets = None) -> List[AWSResponse]:
    """
    Run the ``ensure_tc_installed`` script on every given instance.

    :param instance_ids: ids of the EC2 instances to act on. Required.
    :param configuration: Chaostoolkit Configuration
    :param secrets: Chaostoolkit Secrets
    :return: the output of the script for each instance
    :raises FailedActivity: when ``instance_ids`` is missing, or when the
        output of the script contains
        "Install iproute-tc package failed."
    """
    if instance_ids is None:
        # Fail with a clear message instead of iterating over None.
        raise FailedActivity("you must specify the instance_ids")

    return [
        __simple_ssm_helper(
            instance_id=instance_id,
            configuration=configuration,
            secrets=secrets,
            default_timeout=30,
            action="ensure_tc_installed",
            failure_matcher="Install iproute-tc package failed."
        )
        for instance_id in instance_ids
    ]


def ensure_tc_uninstalled(instance_ids: List[str] = None,
                          configuration: Configuration = None,
                          secrets: Secrets = None) -> List[AWSResponse]:
    """
    Run the ``ensure_tc_uninstalled`` script on every given instance.

    :param instance_ids: ids of the EC2 instances to act on. Required.
    :param configuration: Chaostoolkit Configuration
    :param secrets: Chaostoolkit Secrets
    :return: the output of the script for each instance
    :raises FailedActivity: when ``instance_ids`` is missing, or when the
        output of the script contains
        "Remove iproute-tc package failed."
    """
    if instance_ids is None:
        # Fail with a clear message instead of iterating over None.
        raise FailedActivity("you must specify the instance_ids")

    return [
        __simple_ssm_helper(
            instance_id=instance_id,
            configuration=configuration,
            secrets=secrets,
            default_timeout=30,
            action="ensure_tc_uninstalled",
            failure_matcher="Remove iproute-tc package failed."
        )
        for instance_id in instance_ids
    ]


def grep_process_exist(instance_ids: List[str] = None,
                       process_name: str = None,
                       configuration: Configuration = None,
                       secrets: Secrets = None) -> List[AWSResponse]:
    """
    Grep pid of process name

    Parameters
    ----------
    instance_ids : List[str]
        Ids of the EC2 instances to act on. Required.
    process_name : str
        Name of the process to look for
    configuration : Configuration
        Chaostoolkit Configuration
    secrets : Secrets
        Chaostoolkit Secrets

    Returns
    -------
    List[AWSResponse]
        The output of the script for each instance, in the order of
        ``instance_ids``.

    Raises
    ------
    FailedActivity
        When ``instance_ids`` is missing or when the script cannot be run
        on one of the instances.
    """
    # The message used to name network_latency, a copy-paste leftover.
    logger.debug(
        "Start grep_process_exist: configuration='{}', "
        "instance_ids='{}'".format(configuration, instance_ids))
    if instance_ids is None:
        # Fail with a clear message instead of iterating over None.
        raise FailedActivity("you must specify the instance_ids")

    response = []
    try:
        for instance in instance_ids:
            param = {
                "duration": "1",
                "instance_id": instance,
                "process_name": process_name,
            }
            response.append(
                __simple_ssm_helper(instance_id=instance,
                                    configuration=configuration,
                                    secrets=secrets,
                                    action=GREP_PROCESS,
                                    parameters=param)
            )
        return response
    except Exception as x:
        raise FailedActivity(
            "failed issuing a execute of shell script via AWS SSM {}".format(
                str(x)
            )) from x


###############################################################################
# Private helper functions
###############################################################################
def __simple_ssm_helper(instance_id: str,
                        configuration: Configuration = None,
                        secrets: Secrets = None,
                        default_timeout: int = 30,
                        action: str = None,
                        parameters: Dict[str, Any] = None,
                        failure_matcher: str = "failed") -> AWSResponse:
    """
    Run a bundled shell script on one Linux instance through AWS SSM and
    return its output.

    :param instance_id: id of the EC2 instance that runs the script
    :param configuration: Chaostoolkit Configuration
    :param secrets: Chaostoolkit Secrets
    :param default_timeout: seconds spent waiting for the command
    :param action: base name of the bundled script
    :param parameters: variables handed to the script
    :param failure_matcher: text whose presence in the output marks the
        run as failed
    :return: the output of the shell plugin, without trailing newlines,
        or None when the invocation reports no such plugin
    :raises FailedActivity: when ``instance_id`` is missing, when the
        instance runs Windows, when the wait exceeds ``default_timeout``,
        when the output contains ``failure_matcher``, or when any AWS
        call fails
    """
    if not instance_id:
        raise FailedActivity(
            "you must specify the instance_id"
        )

    # Statuses reported while a command has not reached a final state.
    # "Pending" must be waited on as well, otherwise the output is read
    # before the script has even started.
    in_flight = ("Pending", "InProgress", "Delayed")
    client = aws_client("ssm", configuration, secrets)
    try:
        platform = describe_os_type(instance_id, configuration, secrets)
        if str(platform).lower() == "windows":
            # TODO with PowerShell
            # Fail explicitly rather than sending a command with an empty
            # document name.
            raise FailedActivity(
                "{} is not supported on Windows instance {}".format(
                    action, instance_id))
        os_type = OS_LINUX
        document_name = "AWS-RunShellScript"

        res_send_command = client.send_command(
            InstanceIds=[instance_id],
            DocumentName=document_name,
            Parameters={
                'commands':
                    [construct_script_content(action, os_type, parameters)]
            },
        )
        cmd_id = res_send_command["Command"]["CommandId"]
        logger.info("ssm run command is sent, id {}".format(cmd_id))

        totalwait = 0
        interval = 1
        while True:
            res_list = client.list_command_invocations(
                CommandId=cmd_id,
                Details=True
            )
            try:
                cp = res_list['CommandInvocations'][0]['CommandPlugins'][0]
            except IndexError:
                # The invocation is not listed yet: retry shortly.
                pass
            else:
                if cp['Status'] not in in_flight:
                    break
            time.sleep(interval)
            totalwait += interval
            if totalwait > default_timeout:
                raise FailedActivity(
                    "Script exceeded default timeout {}".format(
                        default_timeout
                    )
                )

        for command_invocation in res_list['CommandInvocations']:
            for invocation in command_invocation['CommandPlugins']:
                if invocation['Name'] == 'aws:runShellScript':
                    if failure_matcher in invocation['Output']:
                        raise FailedActivity(
                            "The result of command failed as:\n{}".format(
                                failure_matcher
                            )
                        )
                    output = invocation['Output'].rstrip('\n')
                    logger.info("ssm run command status {}"
                                .format(invocation['Status']))
                    logger.info("ssm rum command result \n{}"
                                .format(output))
                    return output
    except FailedActivity:
        # Already explicit: do not wrap it a second time.
        raise
    except Exception as x:
        raise FailedActivity(
            "failed issuing a execute of shell script:\n{}".format(x)) from x
+if [[ $ret -eq 0 || $ret -eq 124 ]]; then + echo "experiment burnio -> <$instance_id>: success" +else + echo "experiment brunio -> <$instance_id>: fail" +fi + +sudo rm /root/burn +sudo rm /tmp/loop.sh diff --git a/chaosaws/ec2_os/scripts/cpu_stress_test.ps1 b/chaosaws/ec2_os/scripts/cpu_stress_test.ps1 new file mode 100644 index 0000000..61c320c --- /dev/null +++ b/chaosaws/ec2_os/scripts/cpu_stress_test.ps1 @@ -0,0 +1,18 @@ +Param([parameter(mandatory=$true)] [int]$duration) + +$CPUs = (Get-WMIObject win32_processor | Measure-Object NumberofLogicalProcessors -sum).sum +Write-Output "Stressing $Cpus CPUs for $duration seconds." + +ForEach ($Number in 1..$CPUs){ + Start-Job -ScriptBlock{ + param ($duration) + $stopwatch = [system.diagnostics.stopwatch]::StartNew() + $result = 1; + while ($stopwatch.Elapsed.TotalSeconds -lt $duration) { + $result = $result * $number + } + } -Arg $duration + +} + +Get-Job | Wait-Job \ No newline at end of file diff --git a/chaosaws/ec2_os/scripts/cpu_stress_test.sh b/chaosaws/ec2_os/scripts/cpu_stress_test.sh new file mode 100644 index 0000000..e2b7a79 --- /dev/null +++ b/chaosaws/ec2_os/scripts/cpu_stress_test.sh @@ -0,0 +1,21 @@ +# get CPU counts +cpus=$(cat /proc/cpuinfo | awk "/^processor/{print $3}" | wc -l) +pids="" +echo "Stressing $instance_id $cpus CPUs for $duration seconds." +trap 'for p in $pids; do kill $p; done' 0 + +for i in $cpus +do + while : + do : + done & pids="$pids $!"; +done +sleep $duration +echo "Stressing $cpus CPUs for $duration seconds. Done" + +ret=$? 
+if [ $ret -eq 0 ]; then + echo "experiment strees_cpu <$instance_id> -> success" +else + echo "experiment strees_cpu <$instance_id> -> fail" +fi \ No newline at end of file diff --git a/chaosaws/ec2_os/scripts/ensure_tc_installed.sh b/chaosaws/ec2_os/scripts/ensure_tc_installed.sh new file mode 100644 index 0000000..30e53fc --- /dev/null +++ b/chaosaws/ec2_os/scripts/ensure_tc_installed.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +if [ -z "`rpm -qa iproute-tc`" ]; then + yum install -y iproute-tc 2>&1 >/dev/null +fi + +if [ $? -ne 0 ];then + echo "Install iproute-tc package failed." + exit 1 +fi \ No newline at end of file diff --git a/chaosaws/ec2_os/scripts/ensure_tc_uninstalled.sh b/chaosaws/ec2_os/scripts/ensure_tc_uninstalled.sh new file mode 100644 index 0000000..f6fcf7f --- /dev/null +++ b/chaosaws/ec2_os/scripts/ensure_tc_uninstalled.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +if [ -n "`rpm -qa iproute-tc`" ]; then + yum remove -y iproute-tc 2>&1 >/dev/null +fi + +ret3=$? +if [ $ret3 -ne 0 ];then + echo "Remove iproute-tc package failed." + exit 1 +fi \ No newline at end of file diff --git a/chaosaws/ec2_os/scripts/fill_disk.ps1 b/chaosaws/ec2_os/scripts/fill_disk.ps1 new file mode 100644 index 0000000..84c6966 --- /dev/null +++ b/chaosaws/ec2_os/scripts/fill_disk.ps1 @@ -0,0 +1,13 @@ +#Script for FillDisk Chaos Monkey + + +Param([parameter(mandatory=$true)] [int]$duration, +[parameter(mandatory=$true)] [int]$size) + +Write-Host "Filling disk with $size MB of random data for $duration seconds." + +$Msize = $size*1024000 + +fsutil file createnew C:/burn $Msize +Start-Sleep -s $duration +rm C:/burn \ No newline at end of file diff --git a/chaosaws/ec2_os/scripts/fill_disk.sh b/chaosaws/ec2_os/scripts/fill_disk.sh new file mode 100644 index 0000000..490edb1 --- /dev/null +++ b/chaosaws/ec2_os/scripts/fill_disk.sh @@ -0,0 +1,13 @@ +echo "Filling Disk with $size MB of random data for $duration seconds." 
+ +nohup dd if=/dev/urandom of=/root/burn bs=1M count=$size iflag=fullblock +sleep $duration + +ret=$? +if [ $ret -eq 0 ]; then + echo "experiment fill_disk -> <$instance_id>: success" +else + echo "experiment fill_disk -> <$instance_id>: fail" +fi + +rm /root/burn \ No newline at end of file diff --git a/chaosaws/ec2_os/scripts/grep_pid.sh b/chaosaws/ec2_os/scripts/grep_pid.sh new file mode 100644 index 0000000..988bc6d --- /dev/null +++ b/chaosaws/ec2_os/scripts/grep_pid.sh @@ -0,0 +1,3 @@ +#!bin/bash + +ps -ef | grep $process_name | grep -v 'grep' | awk '{ print $2 }' \ No newline at end of file diff --git a/chaosaws/ec2_os/scripts/kill_process.sh b/chaosaws/ec2_os/scripts/kill_process.sh new file mode 100644 index 0000000..b6f8116 --- /dev/null +++ b/chaosaws/ec2_os/scripts/kill_process.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +if [ -z "$process_name" ]; then + echo "Please provide process name." + exit 1 +fi + +experiment=$(kill $signal $process_name) +ret=$? +if [ $ret -eq 0 ]; then + echo "experiment kill_process -> process <$process_name> on <$instance_id>: success" +else + echo "experiment kill_process -> process <$process_name> on <$instance_id>: fail" +fi +#Sleep $duration +sleep $duration diff --git a/chaosaws/ec2_os/scripts/killall_processes.sh b/chaosaws/ec2_os/scripts/killall_processes.sh new file mode 100644 index 0000000..0aac1be --- /dev/null +++ b/chaosaws/ec2_os/scripts/killall_processes.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +if [ -z "$process_name" ]; then + echo "Please provide process name." + exit 1 +fi + +experiment=$(killall $signal $process_name) +ret=$? 
+if [ $ret -eq 0 ]; then + echo "experiment killall_processes -> processes <$process_name> on <$instance_id>: success" +else + echo "experiment killall_processes -> processes <$process_name> on <$instance_id>: fail" +fi +#Sleep $duration +sleep $duration diff --git a/chaosaws/ec2_os/scripts/network_advanced.sh b/chaosaws/ec2_os/scripts/network_advanced.sh new file mode 100644 index 0000000..c816226 --- /dev/null +++ b/chaosaws/ec2_os/scripts/network_advanced.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +#Mandatory parameters: +#param='delay 1000ms 500ms' or param='delay 1000ms 500ms' or param='corrupt 15%' +#duration=60 + +#Check if it is a normal setting +check=$(tc -s qdisc show dev $device |grep pfifo_fast) +ret=$? + +#if not, recovery +if [ $ret -ne 0 ]; then + tc qdisc del dev $device root 2>&1 >/dev/null +fi + +#Do experiments +experiment=$(tc qdisc add dev $device root netem $param) +ret=$? +if [ $ret -eq 0 ]; then + echo "experiment network ($param) -> <$instance_id>: success" +else + echo "experiment network ($param) -> <$instance_id>: fail" +fi + +#Sleep $duration +#Mandatory +sleep $duration + +#recovery +tc -s qdisc show dev $device |grep pfifo_fast 2>&1 >/dev/null +ret1=$? +#if not, recovery +if [ $ret1 -ne 0 ]; then + tc qdisc del dev $device root 2>&1 >/dev/null +fi + +#Check if it is a normal setting +tc -s qdisc show dev $device |grep pfifo_fast 2>&1 >/dev/null +ret2=$? 
+if [ $ret2 -eq 0 ]; then + echo "recover network ($param) -> <$instance_id>: success" +else + echo "recover network ($param) -> <$instance_id>: fail" +fi \ No newline at end of file diff --git a/chaosaws/ec2_os/scripts/network_latency.sh b/chaosaws/ec2_os/scripts/network_latency.sh new file mode 100644 index 0000000..55df40a --- /dev/null +++ b/chaosaws/ec2_os/scripts/network_latency.sh @@ -0,0 +1,10 @@ +# Script for NetworkLatency Chaos Monkey + +# Adds ${delay}ms +- ${jitter}ms of latency to each packet for $duration seconds +sudo tc qdisc add dev eth0 root netem delay ${delay}ms ${jitter}ms +sleep $duration +sudo tc qdisc del dev eth0 root + + + + diff --git a/chaosaws/ec2_os/scripts/run_cmd.ps1 b/chaosaws/ec2_os/scripts/run_cmd.ps1 new file mode 100644 index 0000000..e4ea4f6 --- /dev/null +++ b/chaosaws/ec2_os/scripts/run_cmd.ps1 @@ -0,0 +1,12 @@ +Try +{ + +} +Catch +{ + +} +Finally +{ + +} \ No newline at end of file diff --git a/chaosaws/ec2_os/scripts/run_cmd.sh b/chaosaws/ec2_os/scripts/run_cmd.sh new file mode 100644 index 0000000..de9d4c2 --- /dev/null +++ b/chaosaws/ec2_os/scripts/run_cmd.sh @@ -0,0 +1,7 @@ + +ret=$? 
+if [ $ret -eq 0 ]; then + echo "experiment run_cmd -> <$instance_id>: success" +else + echo "experiment run_cmd -> <$instance_id>: fail" +fi \ No newline at end of file diff --git a/setup.py b/setup.py index 7eef5e8..fd46b0a 100644 --- a/setup.py +++ b/setup.py @@ -49,6 +49,7 @@ def get_version_from_package() -> str: 'chaosaws', 'chaosaws.ecs', 'chaosaws.ec2', + 'chaosaws.ec2_os', 'chaosaws.eks', 'chaosaws.elasticache', 'chaosaws.iam', diff --git a/tests/ec2_os/test_ec2_os_actions.py b/tests/ec2_os/test_ec2_os_actions.py new file mode 100644 index 0000000..33e4790 --- /dev/null +++ b/tests/ec2_os/test_ec2_os_actions.py @@ -0,0 +1,563 @@ +# -*- coding: utf-8 -*- +import pytest +from unittest.mock import MagicMock, patch, mock_open, call + +from chaosaws.ec2_os.actions import burn_cpu, fill_disk, network_latency, \ + burn_io, network_loss, network_corruption, network_advanced, \ + os_advanced_internet_scripts, killall_processes, kill_process + + +class AnyStringWith(str): + def __eq__(self, other): + return self in other + + +CONFIG = { + "aws": { + "aws_region": "cn-north-1" + } +} + +SECRETS = { + "aws": { + "aws_access_key_id": "abcdefghijklmn", + "aws_secret_access_key": "opqrstuvwxyz", + "aws_session_token": "abcdefghijklmnopqrstuvwxyz", + } +} + +INSTANCE_IDS = ["i-04cf7749ff48ca517"] + +CMD_ID_RETURN = { + 'Command': { + 'CommandId': '6ff4cc59-dac6-4d2e-8952-8ac53930627a', + 'DocumentName': 'AWS-RunShellScript', + 'DocumentVersion': '', + 'Comment': '' + } +} + +CMD_LIST_INVOCATION_RETURN = { + 'CommandInvocations': [{ + 'CommandId': '6ff4cc59-dac6-4d2e-8952-8ac53930627a', + 'InstanceId': 'i-04cf7749ff48ca517', + 'InstanceName': '', + 'Comment': '', + 'DocumentName': 'AWS-RunShellScript', + 'DocumentVersion': '', + 'RequestedDateTime': None, + 'Status': 'Success', + 'StatusDetails': 'Success', + 'StandardOutputUrl': '', + 'StandardErrorUrl': '', + 'CommandPlugins': [{ + 'Name': 'aws:runShellScript', + 'Status': 'Success', + 'StatusDetails': 'Success', + 
'ResponseCode': 0, + 'ResponseStartDateTime': None, + 'ResponseFinishDateTime': None, + 'Output': 'Stressing i-04cf7749ff48ca517 2 CPUs for 90 seconds.\nStressing 2 CPUs for 90 seconds. Done\nexperiment strees_cpu -> success\n', + 'StandardOutputUrl': '', + 'StandardErrorUrl': '', + 'OutputS3Region': 'cn-north-1', + 'OutputS3BucketName': '', + 'OutputS3KeyPrefix': '' + }], + 'ServiceRole': '', + 'NotificationConfig': { + 'NotificationArn': '', + 'NotificationEvents': [], + 'NotificationType': '' + }, + 'CloudWatchOutputConfig':{ + 'CloudWatchLogGroupName': '', + 'CloudWatchOutputEnabled': False + } + }], + 'ResponseMetadata': { + 'RequestId': 'b3f724e7-8ca2-4198-9483-d5b2dff3119a', + 'HTTPStatusCode': 200, + 'HTTPHeaders': { + 'x-amzn-requestid': 'b3f724e7-8ca2-4198-9483-d5b2dff3119a', + 'content-type': 'application/x-amz-json-1.1', + 'content-length': '998', + 'date': 'Wed, 23 Oct 2019 05:01:48 GMT' + }, + 'RetryAttempts': 0 + } +} + +CMD_LIST_INVOCATION_REMOTE_RETURN = { + 'CommandInvocations': [{ + 'CommandId': '6ff4cc59-dac6-4d2e-8952-8ac53930627a', + 'InstanceId': 'i-04cf7749ff48ca517', + 'InstanceName': '', + 'Comment': '', + 'DocumentName': 'AWS-RunRemoteScript', + 'DocumentVersion': '', + 'RequestedDateTime': None, + 'Status': 'Success', + 'StatusDetails': 'Success', + 'StandardOutputUrl': '', + 'StandardErrorUrl': '', + 'CommandPlugins': [{ + 'Name': 'aws:runShellScript', + 'Status': 'Success', + 'StatusDetails': 'Success', + 'ResponseCode': 0, + 'ResponseStartDateTime': None, + 'ResponseFinishDateTime': None, + 'Output': 'Stressing i-04cf7749ff48ca517 2 CPUs for 90 seconds.\nStressing 2 CPUs for 90 seconds. 
Done\nexperiment strees_cpu -> success\n', + 'StandardOutputUrl': '', + 'StandardErrorUrl': '', + 'OutputS3Region': 'cn-north-1', + 'OutputS3BucketName': '', + 'OutputS3KeyPrefix': '' + }], + 'ServiceRole': '', + 'NotificationConfig': { + 'NotificationArn': '', + 'NotificationEvents': [], + 'NotificationType': '' + }, + 'CloudWatchOutputConfig':{ + 'CloudWatchLogGroupName': '', + 'CloudWatchOutputEnabled': False + } + }], + 'ResponseMetadata': { + 'RequestId': 'b3f724e7-8ca2-4198-9483-d5b2dff3119a', + 'HTTPStatusCode': 200, + 'HTTPHeaders': { + 'x-amzn-requestid': 'b3f724e7-8ca2-4198-9483-d5b2dff3119a', + 'content-type': 'application/x-amz-json-1.1', + 'content-length': '998', + 'date': 'Wed, 23 Oct 2019 05:01:48 GMT' + }, + 'RetryAttempts': 0 + } +} + +CMD_RETURN_DISK = { + 'CommandInvocations': [{ + 'CommandId': '6ff4cc59-dac6-4d2e-8952-8ac53930627a', + 'InstanceId': 'i-04cf7749ff48ca517', + 'CommandPlugins': [{ + 'Name': 'aws:runShellScript', + 'Status': 'Success', + 'StatusDetails': 'Success', + 'ResponseCode': 0, + 'ResponseStartDateTime': None, + 'ResponseFinishDateTime': None, + 'Output': 'experiment fill_disk -> : success', + 'StandardOutputUrl': '', + 'StandardErrorUrl': '', + 'OutputS3Region': 'cn-north-1', + 'OutputS3BucketName': '', + 'OutputS3KeyPrefix': '' + }] + }] +} + +CMD_RETURN_NETWORK = { + 'CommandInvocations': [{ + 'CommandId': '6ff4cc59-dac6-4d2e-8952-8ac53930627a', + 'InstanceId': 'i-04cf7749ff48ca517', + 'CommandPlugins': [{ + 'Name': 'aws:runShellScript', + 'Status': 'Success', + 'StatusDetails': 'Success', + 'ResponseCode': 0, + 'ResponseStartDateTime': None, + 'ResponseFinishDateTime': None, + 'Output': 'experiment network (loss 5%) -> : success', + 'StandardOutputUrl': '', + 'StandardErrorUrl': '', + 'OutputS3Region': 'cn-north-1', + 'OutputS3BucketName': '', + 'OutputS3KeyPrefix': '' + }] + }] +} + +CMD_RETURN_PROCESSES = { + 'CommandInvocations': [{ + 'CommandId': '6ff4cc59-dac6-4d2e-8952-8ac53930627a', + 'InstanceId': 
'i-04cf7749ff48ca517', + 'CommandPlugins': [{ + 'Name': 'aws:runShellScript', + 'Status': 'Success', + 'StatusDetails': 'Success', + 'ResponseCode': 0, + 'ResponseStartDateTime': None, + 'ResponseFinishDateTime': None, + 'Output': 'experiment killall_processes -> processes on : success', + 'StandardOutputUrl': '', + 'StandardErrorUrl': '', + 'OutputS3Region': 'cn-north-1', + 'OutputS3BucketName': '', + 'OutputS3KeyPrefix': '' + }] + }] +} + +CMD_RETURN_PROCESS = { + 'CommandInvocations': [{ + 'CommandId': '6ff4cc59-dac6-4d2e-8952-8ac53930627a', + 'InstanceId': 'i-04cf7749ff48ca517', + 'CommandPlugins': [{ + 'Name': 'aws:runShellScript', + 'Status': 'Success', + 'StatusDetails': 'Success', + 'ResponseCode': 0, + 'ResponseStartDateTime': None, + 'ResponseFinishDateTime': None, + 'Output': 'experiment kill_process -> process on : success', + 'StandardOutputUrl': '', + 'StandardErrorUrl': '', + 'OutputS3Region': 'cn-north-1', + 'OutputS3BucketName': '', + 'OutputS3KeyPrefix': '' + }] + }] +} + +CMD_RETURN_FAIL = { + 'CommandInvocations': [{ + 'CommandId': '6ff4cc59-dac6-4d2e-8952-8ac53930627a', + 'InstanceId': 'i-04cf7749ff48ca517', + 'CommandPlugins': [{ + 'Name': 'aws:runShellScript', + 'Status': 'Success', + 'StatusDetails': 'Success', + 'ResponseCode': 0, + 'ResponseStartDateTime': None, + 'ResponseFinishDateTime': None, + 'Output': 'experiment network (corruption 15%) -> : fail', + 'StandardOutputUrl': '', + 'StandardErrorUrl': '', + 'OutputS3Region': 'cn-north-1', + 'OutputS3BucketName': '', + 'OutputS3KeyPrefix': '' + }] + }] +} + +CMD_RETURN_IN_PROGRESS = { + 'CommandInvocations': [{ + 'CommandId': '015c2722-9004-40cb-a4d2-871fec1e00d7', + 'InstanceId': 'i-04cf7749ff48ca517', + 'InstanceName': '', + 'Comment': '', + 'DocumentName': 'AWS-RunShellScript', + 'DocumentVersion': '', + 'RequestedDateTime': None, + 'Status': 'InProgress', + 'StatusDetails': 'InProgress', + 'StandardOutputUrl': '', + 'StandardErrorUrl': '', + 'CommandPlugins': [{ + 'Name': 
'aws:runShellScript', + 'Status': 'InProgress', + 'StatusDetails': 'InProgress', + 'ResponseCode': -1, + 'Output': '', + 'StandardOutputUrl': '', + 'StandardErrorUrl': '', + 'OutputS3Region': '', + 'OutputS3BucketName': '', + 'OutputS3KeyPrefix': '' + }], + 'ServiceRole': '', + 'NotificationConfig': { + 'NotificationArn': '', + 'NotificationEvents': [], + 'NotificationType': '' + }, + 'CloudWatchOutputConfig': { + 'CloudWatchLogGroupName': '', + 'CloudWatchOutputEnabled': False + }}], + 'ResponseMetadata': { + 'RequestId': 'fe1562ae-1e37-499a-9159-7bcff451d431', + 'HTTPStatusCode': 200, + 'HTTPHeaders': { + 'x-amzn-requestid': 'fe1562ae-1e37-499a-9159-7bcff451d431', + 'content-type': 'application/x-amz-json-1.1', + 'content-length': '769', + 'date': 'Wed, 23 Oct 2019 05:29:03 GMT' + }, 'RetryAttempts': 0 + }} + +CMD_REMOTE = { + "CommandInvocations": [ + { + "CommandId": "e2296cff-b479-4240-8645-f1e84350d054", + "InstanceId": "i-04cf7749ff48ca517", + "InstanceName": "u7000nec2oapk8s0002", + "Comment": "", + "DocumentName": "AWS-RunRemoteScript", + "DocumentVersion": "", + "RequestedDateTime": 1571984830.578, + "Status": "Success", + "StatusDetails": "Success", + "StandardOutputUrl": "", + "StandardErrorUrl": "", + "CommandPlugins": [ + { + "Name": "downloadContent", + "Status": "Success", + "StatusDetails": "Success", + "ResponseCode": 0, + "ResponseStartDateTime": 1571984831.032, + "ResponseFinishDateTime": 1571984831.226, + "Output": "", + "StandardOutputUrl": "", + "StandardErrorUrl": "", + "OutputS3Region": "cn-north-1", + "OutputS3BucketName": "", + "OutputS3KeyPrefix": "" + }, + { + "Name": "runPowerShellScript", + "Status": "Success", + "StatusDetails": "Success", + "ResponseCode": 0, + "ResponseStartDateTime": 1571984831.226, + "ResponseFinishDateTime": 1571984831.226, + "Output": "Step execution skipped due to incompatible platform. 
Step name: runPowerShellScript", + "StandardOutputUrl": "", + "StandardErrorUrl": "", + "OutputS3Region": "cn-north-1", + "OutputS3BucketName": "", + "OutputS3KeyPrefix": "" + }, + { + "Name": "runShellScript", + "Status": "Success", + "StatusDetails": "Success", + "ResponseCode": 0, + "ResponseStartDateTime": 1571984831.227, + "ResponseFinishDateTime": 1571984831.236, + "Output": "experiment steady state -> : success", + "StandardOutputUrl": "", + "StandardErrorUrl": "", + "OutputS3Region": "cn-north-1", + "OutputS3BucketName": "", + "OutputS3KeyPrefix": "" + } + ], + "ServiceRole": "", + "NotificationConfig": { + "NotificationArn": "", + "NotificationEvents": [], + "NotificationType": "" + }, + "CloudWatchOutputConfig": { + "CloudWatchLogGroupName": "", + "CloudWatchOutputEnabled": False + } + } + ] +} + + +@patch("builtins.open", new_callable=mock_open, read_data="script") +@patch('chaosaws.ec2_os.actions.describe_os_type', autospec=True) +@patch('chaosaws.ec2_os.actions.aws_client', autospec=True) +def test_burn_cpu(aws_client, os_type, open): + client = MagicMock() + aws_client.return_value = client + client.send_command.return_value = CMD_ID_RETURN + client.list_command_invocations.return_value = CMD_LIST_INVOCATION_RETURN + + os_type.return_value = "Linux" + + list_res = burn_cpu(instance_ids=INSTANCE_IDS, execution_duration="30", + configuration=CONFIG, secrets=SECRETS) + + open.assert_called_with(AnyStringWith("cpu_stress_test.sh")) + assert "success" in list_res[0] + assert INSTANCE_IDS[0] in list_res[0] + assert "experiment strees_cpu" in list_res[0] + + +@patch("builtins.open", new_callable=mock_open, read_data="script") +@patch('chaosaws.ec2_os.actions.describe_os_type', autospec=True) +@patch('chaosaws.ec2_os.actions.aws_client', autospec=True) +def test_fill_disk(aws_client, os_type, open): + client = MagicMock() + aws_client.return_value = client + client.send_command.return_value = CMD_ID_RETURN + client.list_command_invocations.return_value = 
CMD_RETURN_DISK + + os_type.return_value = "Linux" + + list_res = fill_disk(instance_ids=INSTANCE_IDS, execution_duration="30", + size="5000", configuration=CONFIG, secrets=SECRETS) + + open.assert_called_with(AnyStringWith("fill_disk.sh")) + assert "success" in list_res[0] + assert INSTANCE_IDS[0] in list_res[0] + assert "experiment fill_disk" in list_res[0] + + +@patch("builtins.open", new_callable=mock_open, read_data="script") +@patch('chaosaws.ec2_os.actions.describe_os_type', autospec=True) +@patch('chaosaws.ec2_os.actions.aws_client', autospec=True) +def test_kill_process(aws_client, os_type, open): + client = MagicMock() + aws_client.return_value = client + client.send_command.return_value = CMD_ID_RETURN + client.list_command_invocations.return_value = CMD_RETURN_PROCESS + + os_type.return_value = "Linux" + + list_res = kill_process(instance_ids=INSTANCE_IDS, + execution_duration="30", + signal="-9", + process="java", + configuration=CONFIG, + secrets=SECRETS) + + open.assert_called_with(AnyStringWith("kill_process.sh")) + assert "success" in list_res[0] + assert INSTANCE_IDS[0] in list_res[0] + assert "experiment kill_process" in list_res[0] + + +@patch("builtins.open", new_callable=mock_open, read_data="script") +@patch('chaosaws.ec2_os.actions.describe_os_type', autospec=True) +@patch('chaosaws.ec2_os.actions.aws_client', autospec=True) +def test_network(aws_client, os_type, open): + client = MagicMock() + aws_client.return_value = client + client.send_command.return_value = CMD_ID_RETURN + client.list_command_invocations.return_value = CMD_RETURN_NETWORK + + os_type.return_value = "Linux" + + list_res = network_latency(instance_ids=INSTANCE_IDS, + execution_duration="30", + configuration=CONFIG, + delay="1000ms", + variance="500ms", + ratio="50%", + secrets=SECRETS) + + open.assert_called_with(AnyStringWith("network_advanced.sh")) + assert "success" in list_res[0] + assert INSTANCE_IDS[0] in list_res[0] + assert "experiment network" in list_res[0] + + 
+@patch("builtins.open", new_callable=mock_open, read_data="script") +@patch('chaosaws.ec2_os.actions.describe_os_type', autospec=True) +@patch('chaosaws.ec2_os.actions.aws_client', autospec=True) +def test_network_fail(aws_client, os_type, open): + client = MagicMock() + aws_client.return_value = client + client.send_command.return_value = CMD_ID_RETURN + client.list_command_invocations.return_value = CMD_RETURN_FAIL + + os_type.return_value = "Linux" + + list_res = network_corruption(instance_ids=INSTANCE_IDS, + execution_duration="30", + configuration=CONFIG, + corruption_ratio="50%", + secrets=SECRETS) + + open.assert_called_with(AnyStringWith("network_advanced.sh")) + assert "fail" in list_res[0] + assert INSTANCE_IDS[0] in list_res[0] + assert "experiment network" in list_res[0] + + +@patch("builtins.open", new_callable=mock_open, read_data="script") +@patch('chaosaws.ec2_os.actions.describe_os_type', autospec=True) +@patch('chaosaws.ec2_os.actions.aws_client', autospec=True) +def test_burn_io_timeout(aws_client, os_type, open): + client = MagicMock() + aws_client.return_value = client + client.send_command.return_value = CMD_ID_RETURN + client.list_command_invocations.return_value = CMD_RETURN_IN_PROGRESS + + os_type.return_value = "Linux" + + with pytest.raises(Exception) as ex: + list_res = burn_io(instance_ids=INSTANCE_IDS, + execution_duration="2", + configuration=CONFIG, + secrets=SECRETS) + assert 'Script exceeded default timeout' in str(ex.value) + open.assert_called_with(AnyStringWith("burn_io.sh")) + + +@patch("builtins.open", new_callable=mock_open, read_data="script") +@patch('chaosaws.ec2_os.actions.describe_os_type', autospec=True) +@patch('chaosaws.ec2_os.actions.aws_client', autospec=True) +def test_network_exception(aws_client, os_type, open): + client = MagicMock() + aws_client.return_value = client + client.send_command.return_value = CMD_ID_RETURN + client.list_command_invocations.return_value = CMD_RETURN_IN_PROGRESS + + 
os_type.return_value = "Linux" + + with pytest.raises(Exception) as ex: + list_res = network_advanced(instance_ids=INSTANCE_IDS, + execution_duration="2", + configuration=CONFIG, + command="loss 100%", + device="eth0", + secrets=SECRETS) + assert 'failed issuing a execute of shell script via AWS SSM' in str(ex.value) + open.assert_called_with(AnyStringWith("network_advanced.sh")) + + +@patch('chaosaws.ec2_os.actions.describe_os_type', autospec=True) +@patch('chaosaws.ec2_os.actions.aws_client', autospec=True) +def test_os_advanced_internet_scripts(aws_client, os_type): + client = MagicMock() + aws_client.return_value = client + client.send_command.return_value = CMD_ID_RETURN + client.list_command_invocations.return_value = CMD_REMOTE + + os_type.return_value = "Linux" + + list_res = os_advanced_internet_scripts(instance_ids=INSTANCE_IDS, + source_info="https://s3.aws/test.sh", + command_line=["sh test.sh"], + configuration=CONFIG, + execution_timeout="5", + secrets=SECRETS) + assert "success" in list_res[0] + assert INSTANCE_IDS[0] in list_res[0] + assert "experiment steady state" in list_res[0] + + +@patch("builtins.open", new_callable=mock_open, read_data="script") +@patch('chaosaws.ec2_os.actions.describe_os_type', autospec=True) +@patch('chaosaws.ec2_os.actions.aws_client', autospec=True) +def test_killall_processes(aws_client, os_type, open): + client = MagicMock() + aws_client.return_value = client + client.send_command.return_value = CMD_ID_RETURN + client.list_command_invocations.return_value = CMD_RETURN_PROCESSES + + os_type.return_value = "Linux" + + list_res = killall_processes(instance_ids=INSTANCE_IDS, + execution_duration="30", + process_name="java", + signal="-9", + configuration=CONFIG, + secrets=SECRETS) + + open.assert_called_with(AnyStringWith("killall_processes.sh")) + assert "success" in list_res[0] + assert "java" in list_res[0] + assert INSTANCE_IDS[0] in list_res[0] + assert "experiment killall_processes" in list_res[0] \ No newline at end 
of file diff --git a/tests/ec2_os/test_ec2_os_probes.py b/tests/ec2_os/test_ec2_os_probes.py new file mode 100644 index 0000000..15bd1b8 --- /dev/null +++ b/tests/ec2_os/test_ec2_os_probes.py @@ -0,0 +1,185 @@ +# -*- coding: utf-8 -*- +import pytest +from unittest.mock import MagicMock, patch, mock_open, call + +from chaosaws.ec2_os.probes import describe_instance, describe_os_type, \ + ensure_tc_installed, ensure_tc_uninstalled + + +CONFIG = { + "aws": { + "aws_region": "cn-north-1" + } +} + +INSTANCE_IDS = ["i-04cf7749ff48ca517"] + +SECRETS = { + "aws": { + "aws_access_key_id": "abcdefghijklmn", + "aws_secret_access_key": "opqrstuvwxyz", + "aws_session_token": "abcdefghijklmnopqrstuvwxyz", + } +} + +CMD_ID_RETURN = { + 'Command': { + 'CommandId': '6ff4cc59-dac6-4d2e-8952-8ac53930627a', + 'DocumentName': 'AWS-RunShellScript', + 'DocumentVersion': '', + 'Comment': '' + } +} + +INSTANCE_DESC = { + 'Reservations': [{ + 'Instances': [{ + 'InstanceId': 'i-1234567890abcdef0', + 'InstanceLifecycle': 'spot', + 'SpotInstanceRequestId': 'sir-abcdef01'}]}]} + +INSTANCE_DESC_WINDOWS = { + 'Reservations': [{ + 'Instances': [{ + 'InstanceId': 'i-1234567890abcdef0', + 'Platform': 'Windows', + 'InstanceLifecycle': 'spot', + 'SpotInstanceRequestId': 'sir-abcdef01'}]}]} + +CMD_LIST_INVOCATION_RETURN = { + 'CommandInvocations': [{ + 'CommandId': '6ff4cc59-dac6-4d2e-8952-8ac53930627a', + 'InstanceId': 'i-04cf7749ff48ca517', + 'InstanceName': '', + 'Comment': '', + 'DocumentName': 'AWS-RunShellScript', + 'DocumentVersion': '', + 'RequestedDateTime': None, + 'Status': 'Success', + 'StatusDetails': 'Success', + 'StandardOutputUrl': '', + 'StandardErrorUrl': '', + 'CommandPlugins': [{ + 'Name': 'aws:runShellScript', + 'Status': 'Success', + 'StatusDetails': 'Success', + 'ResponseCode': 0, + 'ResponseStartDateTime': None, + 'ResponseFinishDateTime': None, + 'Output': '', + 'StandardOutputUrl': '', + 'StandardErrorUrl': '', + 'OutputS3Region': 'cn-north-1', + 'OutputS3BucketName': '', + 
'OutputS3KeyPrefix': '' + }], + 'ServiceRole': '', + 'NotificationConfig': { + 'NotificationArn': '', + 'NotificationEvents': [], + 'NotificationType': '' + }, + 'CloudWatchOutputConfig':{ + 'CloudWatchLogGroupName': '', + 'CloudWatchOutputEnabled': False + } + }], + 'ResponseMetadata': { + 'RequestId': 'b3f724e7-8ca2-4198-9483-d5b2dff3119a', + 'HTTPStatusCode': 200, + 'HTTPHeaders': { + 'x-amzn-requestid': 'b3f724e7-8ca2-4198-9483-d5b2dff3119a', + 'content-type': 'application/x-amz-json-1.1', + 'content-length': '998', + 'date': 'Wed, 23 Oct 2019 05:01:48 GMT' + }, + 'RetryAttempts': 0 + } +} + + +class AnyStringWith(str): + def __eq__(self, other): + return self in other + + +@patch('chaosaws.ec2_os.probes.aws_client', autospec=True) +def test_describe_os_type(aws_client): + client = MagicMock() + aws_client.return_value = client + client.describe_instances.return_value = INSTANCE_DESC + + os = describe_os_type(instance_id='i-1234567890abcdef0', secrets=SECRETS, + configuration=CONFIG) + assert "linux" == os + + +@patch('chaosaws.ec2_os.probes.aws_client', autospec=True) +def test_describe_os_type_windows(aws_client): + client = MagicMock() + aws_client.return_value = client + client.describe_instances.return_value = INSTANCE_DESC_WINDOWS + + os = describe_os_type(instance_id='i-1234567890abcdef0', secrets=SECRETS, + configuration=CONFIG) + assert "Windows" == os + + +@patch('chaosaws.ec2_os.probes.aws_client', autospec=True) +def test_describe_instances_windows(aws_client): + client = MagicMock() + aws_client.return_value = client + client.describe_instances.return_value = INSTANCE_DESC_WINDOWS + + response = describe_instance(instance_id='i-1234567890abcdef0', + secrets=SECRETS, + configuration=CONFIG) + assert INSTANCE_DESC_WINDOWS == response + + +@patch('chaosaws.ec2_os.probes.aws_client', autospec=True) +def test_describe_instances(aws_client): + client = MagicMock() + aws_client.return_value = client + client.describe_instances.return_value = INSTANCE_DESC 
+ + response = describe_instance(instance_id='i-1234567890abcdef0', + secrets=SECRETS, + configuration=CONFIG) + assert INSTANCE_DESC == response + + +@patch("builtins.open", new_callable=mock_open, read_data="script") +@patch('chaosaws.ec2_os.probes.describe_os_type', autospec=True) +@patch('chaosaws.ec2_os.probes.aws_client', autospec=True) +def test_ensure_tc_installed(aws_client, os_type, open): + client = MagicMock() + aws_client.return_value = client + client.send_command.return_value = CMD_ID_RETURN + client.list_command_invocations.return_value = CMD_LIST_INVOCATION_RETURN + + os_type.return_value = "Linux" + + list_res = ensure_tc_installed(instance_ids=INSTANCE_IDS, + configuration=CONFIG, + secrets=SECRETS) + open.assert_called_with(AnyStringWith("ensure_tc_installed.sh")) + assert "failed" not in list_res[0] + + +@patch("builtins.open", new_callable=mock_open, read_data="script") +@patch('chaosaws.ec2_os.probes.describe_os_type', autospec=True) +@patch('chaosaws.ec2_os.probes.aws_client', autospec=True) +def test_ensure_tc_uninstalled(aws_client, os_type, open): + client = MagicMock() + aws_client.return_value = client + client.send_command.return_value = CMD_ID_RETURN + client.list_command_invocations.return_value = CMD_LIST_INVOCATION_RETURN + + os_type.return_value = "Linux" + + list_res = ensure_tc_uninstalled(instance_ids=INSTANCE_IDS, + configuration=CONFIG, + secrets=SECRETS) + open.assert_called_with(AnyStringWith("ensure_tc_uninstalled.sh")) + assert "failed" not in list_res[0]