eclipse-bluechi · engelmi · Mar 4, 2024 · Mar 1, 2024 · Mar 1, 2024 · Mar 1, 2024
@@ -3,7 +3,7 @@
 import logging
 
 from enum import Enum
-from typing import Tuple, Iterator, Any, Optional, Union
+from typing import Tuple, Iterator, Any, Optional, Union, Dict, List
 
 from bluechi_test.client import Client
 
@@ -24,6 +24,8 @@ class BluechiCtl():
     def __init__(self, client: Client) -> None:
         self.client = client
 
+        self.tracked_services: Dict[str, List[str]] = dict()
+
     def _run(self, log_txt: str, cmd: str, check_result: bool, expected_result: int) \
             -> Tuple[Optional[int], Union[Iterator[bytes], Any, Tuple[bytes, bytes]]]:
         LOGGER.debug(log_txt)
@@ -78,6 +80,11 @@ def get_node_status(self, node_name: str = None, check_result: bool = True, expe
 
     def start_unit(self, node_name: str, unit_name: str, check_result: bool = True, expected_result: int = 0) \
             -> Tuple[Optional[int], Union[Iterator[bytes], Any, Tuple[bytes, bytes]]]:
+        # track started units to stop and reset failures on cleanup
+        if node_name not in self.tracked_services:
+            self.tracked_services[node_name] = []
+        self.tracked_services[node_name].append(unit_name)
+
         return self._run(
             f"Starting unit '{unit_name}' on node '{node_name}'",
             f"start {node_name} {unit_name}",

@@ -76,7 +76,7 @@ def _read_topology() -> Dict[str, Any]:
     if tmt_yaml_file is None or tmt_yaml_file == "":
         return get_primary_ip()
 
-    topology = ""
+    topology = dict()
     with open(tmt_yaml_file, "r") as f:
         topology = yaml.safe_load(f.read())
     return topology
@@ -94,9 +94,9 @@ def bluechi_ctrl_host_ip() -> str:
     if "guests" not in topology:
         return get_primary_ip()
 
-    for guest in topology["guests"]:
-        if "name" in guest and "controller" in guest["role"]:
-            return guest["hostname"]
+    for _, values in topology["guests"].items():
+        if values["role"] == "controller":
+            return values["hostname"]
 
     return get_primary_ip()
 
@@ -215,7 +215,6 @@ def bluechi_test(
         return BluechiSSHTest(available_hosts,
                               machines_ssh_user,
                               machines_ssh_password,
-                              bluechi_ctrl_svc_port,
                               tmt_test_serial_number,
                               tmt_test_data_dir,
                               run_with_valgrind,

@@ -5,9 +5,9 @@
 import time
 import traceback
 
-from typing import Any, Iterator, Optional, Tuple, Union
+from typing import Any, Iterator, Optional, Tuple, Union, List
 
-from bluechi_test.client import Client, ContainerClient, SSHClient
+from bluechi_test.client import Client
 from bluechi_test.config import BluechiAgentConfig, BluechiControllerConfig
 from bluechi_test.systemctl import SystemCtl
 from bluechi_test.bluechictl import BluechiCtl
@@ -18,15 +18,23 @@
 
 class BluechiMachine():
 
+    valgrind_log_directory = '/var/log/valgrind'
     valgrind_log_path_controller = '/var/log/valgrind/bluechi-controller-valgrind.log'
     valgrind_log_path_agent = '/var/log/valgrind/bluechi-agent-valgrind.log'
 
+    gcda_file_location = '/var/tmp/bluechi-coverage'
+
+    backup_file_suffix = '.backup'
+
     def __init__(self, name: str, client: Client) -> None:
         self.name = name
         self.client = client
 
         self.systemctl = SystemCtl(client)
 
+        self.created_files: List[str] = []
+        self.changed_files: List[str] = []
+
     def create_file(self, target_dir: str, file_name: str, content: str) -> None:
         target_file = os.path.join(target_dir, file_name)
         try:
@@ -36,6 +44,27 @@ def create_file(self, target_dir: str, file_name: str, content: str) -> None:
             traceback.print_exc()
             return
 
+        # keep track of the created file for later cleanup
+        self.created_files.append(os.path.join(target_dir, file_name))
+
+    def _track_changed_file(self, target_dir: str, file_name: str) -> None:
+        target_file = os.path.join(target_dir, file_name)
+        try:
+            # back up the original only once: a repeated cp would overwrite the backup with a possibly edited file
+            if target_file in self.changed_files:
+                return
+
+            LOGGER.debug(f"Creating backup of file '{target_file}'...")
+            backup_file = target_file + BluechiMachine.backup_file_suffix
+            result, output = self.client.exec_run(f"cp {target_file} {backup_file}")
+            if result != 0:
+                raise Exception(output)
+            self.changed_files.append(target_file)
+        except Exception as ex:
+            LOGGER.error(f"Failed to create backup of file '{target_file}': {ex}")
+            traceback.print_exc()
+            return
+
     def get_file(self, machine_path: str, local_path: str) -> None:
         self.client.get_file(machine_path, local_path)
 
@@ -77,9 +106,12 @@ def copy_systemd_service(self, service_file_name: str, source_dir: str, target_d
 
         LOGGER.debug(f"Copy local systemd service '{source_path}' to container path '{target_dir}'\
              with content:\n{content}")
-        self.client.create_file(target_dir, service_file_name, content)
+        self.create_file(target_dir, service_file_name, content)
         self.systemctl.daemon_reload()
 
+        # keep track of created service file to potentially stop it in later cleanup
+        self.systemctl.tracked_services.append(service_file_name)
+
     def copy_container_script(self, script_file_name: str):
         curr_dir = os.getcwd()
         source_path = os.path.join(curr_dir, "..", "..", "..", "bluechi_test", "container_scripts", script_file_name)
@@ -89,14 +121,18 @@ def copy_container_script(self, script_file_name: str):
 
         LOGGER.info(f"Copy container script '{source_path}' to container path '{curr_dir}'\
              with content:\n{content}")
-        self.client.create_file(target_dir, script_file_name, content)
+        self.create_file(target_dir, script_file_name, content)
 
     def restart_with_config_file(self, config_file_location, service):
+        unit_dir = "/usr/lib/systemd/system"
+        service_file = f"{service}.service"
+
+        self._track_changed_file(unit_dir, service_file)
         self.client.exec_run(f"sed -i '/ExecStart=/c\\ExecStart=/usr/libexec/{service} -c "
                              f"{config_file_location}' "
-                             f"/usr/lib/systemd/system/{service}.service")
+                             f"{os.path.join(unit_dir, service_file)}")
         self.systemctl.daemon_reload()
-        self.systemctl.restart_unit(f"{service}.service")
+        self.systemctl.restart_unit(service_file)
 
     def wait_for_bluechi_agent(self):
         should_wait = True
@@ -109,13 +145,21 @@ def wait_for_bluechi_controller(self):
             should_wait = not self.systemctl.service_is_active("bluechi-controller")
 
     def enable_valgrind(self) -> None:
+        unit_dir = "/usr/lib/systemd/system"
+        controller_service = "bluechi-controller.service"
+        agent_service = "bluechi-agent.service"
+
+        self._track_changed_file(unit_dir, controller_service)
         self.client.exec_run(f"sed -i '/ExecStart=/c\\ExecStart=/usr/bin/valgrind -s --leak-check=yes "
                              f"--log-file={BluechiMachine.valgrind_log_path_controller} "
-                             f"/usr/libexec/bluechi-controller' /usr/lib/systemd/system/bluechi-controller.service")
+                             f"/usr/libexec/bluechi-controller' {os.path.join(unit_dir, controller_service)}")
+
+        self._track_changed_file(unit_dir, agent_service)
         self.client.exec_run(f"sed -i '/ExecStart=/c\\ExecStart=/usr/bin/valgrind -s --leak-check=yes "
                              f"--log-file={BluechiMachine.valgrind_log_path_agent} /usr/libexec/bluechi-agent' "
-                             f"/usr/lib/systemd/system/bluechi-agent.service")
-        self.client.exec_run("mkdir -p /var/log/valgrind")
+                             f"{os.path.join(unit_dir, agent_service)}")
+
+        self.client.exec_run(f"mkdir -p {BluechiMachine.valgrind_log_directory}")
         self.systemctl.daemon_reload()
 
     def run_python(self, python_script_path: str) -> \
@@ -124,6 +168,9 @@ def run_python(self, python_script_path: str) -> \
         target_file_dir = os.path.join("/", "tmp")
         target_file_name = get_random_name(10)
         content = read_file(python_script_path)
+
+        # directly call create_file on client to bypass cleanup as
+        # the script file will be removed after running it
         self.client.create_file(target_file_dir, target_file_name, content)
 
         target_file_path = os.path.join(target_file_dir, target_file_name)
@@ -136,43 +183,30 @@ def run_python(self, python_script_path: str) -> \
         finally:
             return result, output
 
-    def cleanup(self):
-        if isinstance(self.client, ContainerClient):
-            if self.client.container.status == 'running':
-                kw_params = {'timeout': 0}
-                self.client.container.stop(**kw_params)
-            self.client.container.remove()
-        elif isinstance(self.client, SSHClient):
-            # TODO: implement proper cleanup (removing all added files etc.)
-            pass
-
     def gather_valgrind_logs(self, data_dir: str) -> None:
-        bluechi_controller_valgrind_filename = f"bluechi-controller-valgrind-{self.name}.log"
-        bluechi_agent_valgrind_filename = f"bluechi-agent-valgrind-{self.name}.log"
-        bluechi_controller_valgrind_log_target_path = f"/tmp/{bluechi_controller_valgrind_filename}"
-        bluechi_agent_valgrind_log_target_path = f"/tmp/{bluechi_agent_valgrind_filename}"
-
-        # Collect valgrind logs by copying log files to the data directory
-        result, _ = self.client.exec_run(
-            f'cp -f {BluechiMachine.valgrind_log_path_controller} {bluechi_controller_valgrind_log_target_path}')
-        if result == 0:
-            self.client.get_file(bluechi_controller_valgrind_log_target_path, data_dir)
-        result, _ = self.client.exec_run(
-            f'cp -f {BluechiMachine.valgrind_log_path_agent} {bluechi_agent_valgrind_log_target_path}')
-        if result == 0:
-            self.client.get_file(bluechi_agent_valgrind_log_target_path, data_dir)
+        try:
+            self.client.get_file(BluechiMachine.valgrind_log_path_controller, data_dir)
+        except Exception as ex:
+            LOGGER.debug(f"Failed to get valgrind logs for controller: {ex}")
+
+        try:
+            self.client.get_file(BluechiMachine.valgrind_log_path_agent, data_dir)
+        except Exception as ex:
+            LOGGER.debug(f"Failed to get valgrind logs for agent: {ex}")
 
     def gather_journal_logs(self, data_dir: str) -> None:
         log_file = f"/tmp/journal-{self.name}.log"
 
         self.client.exec_run(
             f'bash -c "journalctl --no-pager > {log_file}"', tty=True)
 
+        # track created logfile for later cleanup
+        self.created_files.append(log_file)
+
         self.client.get_file(log_file, data_dir)
 
     def gather_coverage(self, data_coverage_dir: str) -> None:
-        gcda_file_location = "/var/tmp/bluechi-coverage"
-        coverage_file = f"{gcda_file_location}/coverage-{self.name}.info"
+        coverage_file = f"{BluechiMachine.gcda_file_location}/coverage-{self.name}.info"
 
         LOGGER.info(f"Generating info file '{coverage_file}' started")
         result, output = self.client.exec_run(
@@ -183,6 +217,17 @@ def gather_coverage(self, data_coverage_dir: str) -> None:
 
         self.client.get_file(f"{coverage_file}", data_coverage_dir)
 
+    def cleanup_valgrind_logs(self):
+        self.client.exec_run(f"rm -f {BluechiMachine.valgrind_log_path_controller}")
+        self.client.exec_run(f"rm -f {BluechiMachine.valgrind_log_path_agent}")
+
+    def cleanup_journal_logs(self):
+        self.client.exec_run("journalctl --flush --rotate")
+        self.client.exec_run("journalctl --vacuum-time=1s")
+
+    def cleanup_coverage(self):
+        self.client.exec_run(f"rm -rf {BluechiMachine.gcda_file_location}/*")
+
 
 class BluechiAgentMachine(BluechiMachine):
 
@@ -192,7 +237,7 @@ def __init__(self, name: str, client: Client, agent_config: BluechiAgentConfig)
         self.config = agent_config
 
         # add confd file to container
-        self.client.create_file(self.config.get_confd_dir(), self.config.file_name, self.config.serialize())
+        self.create_file(self.config.get_confd_dir(), self.config.file_name, self.config.serialize())
 
 
 class BluechiControllerMachine(BluechiMachine):
@@ -205,4 +250,4 @@ def __init__(self, name: str, client: Client, ctrl_config: BluechiControllerConf
         self.config = ctrl_config
 
         # add confd file to container
-        self.client.create_file(self.config.get_confd_dir(), self.config.file_name, self.config.serialize())
+        self.create_file(self.config.get_confd_dir(), self.config.file_name, self.config.serialize())
@@ -2,7 +2,7 @@
 
 import logging
 
-from typing import Any, Iterator, Optional, Set, Tuple, Union
+from typing import Any, Iterator, Optional, Set, Tuple, Union, List
 
 from bluechi_test.client import Client
 
@@ -16,6 +16,8 @@ class SystemCtl():
     def __init__(self, client: Client) -> None:
         self.client = client
 
+        self.tracked_services: List[str] = []
+
     def _do_operation_on_unit(self, unit_name: str, operation: str, check_result: bool, expected_result: int) \
             -> Tuple[Optional[int], Union[Iterator[bytes], Any, Tuple[bytes, bytes]]]:
 
@@ -36,6 +38,9 @@ def _do_operation(self, operation: str, check_result: bool, expected_result: int
 
     def start_unit(self, unit_name: str, check_result: bool = True, expected_result: int = 0)  \
             -> Tuple[Optional[int], Union[Iterator[bytes], Any, Tuple[bytes, bytes]]]:
+        # track started units to stop and reset failures on cleanup
+        self.tracked_services.append(unit_name)
+
         return self._do_operation_on_unit(unit_name, "start", check_result, expected_result)
 
     def stop_unit(self,
@@ -63,6 +68,9 @@ def stop_unit(self,
 
     def restart_unit(self, unit_name: str, check_result: bool = True, expected_result: int = 0)  \
             -> Tuple[Optional[int], Union[Iterator[bytes], Any, Tuple[bytes, bytes]]]:
+        # track restarted units to stop and reset failures on cleanup
+        self.tracked_services.append(unit_name)
+
         return self._do_operation_on_unit(unit_name, "restart", check_result, expected_result)
 
     def reset_failed_for_unit(self, unit_name: str, check_result: bool = True, expected_result: int = 0)  \