From 9acc7cb1f99a880f6c53c210dca6d23afad081cc Mon Sep 17 00:00:00 2001 From: Kamoltat Sirivadhna Date: Wed, 13 Mar 2024 17:45:59 -0400 Subject: [PATCH] src/teuthology_api/suite: Modify how we handle Error and Success runs The changes include: 1. Make the suite route return {"run": run_details, "logs": logs, "job_count": job_count}. 2. Improve how we handle exceptions by using Queue from Python's multiprocessing library. 3. Set the timeout for the process to 180 seconds; if teuthology doesn't respond within that time, we return a process timeout error. Signed-off-by: Kamoltat Sirivadhna --- src/teuthology_api/routes/suite.py | 7 +++- src/teuthology_api/schemas/suite.py | 2 +- src/teuthology_api/services/helpers.py | 36 +++++++++++++++----- src/teuthology_api/services/suite.py | 47 +++++++++++++------------- 4 files changed, 57 insertions(+), 35 deletions(-) diff --git a/src/teuthology_api/routes/suite.py b/src/teuthology_api/routes/suite.py index 13f1905..5737a16 100644 --- a/src/teuthology_api/routes/suite.py +++ b/src/teuthology_api/routes/suite.py @@ -24,4 +24,9 @@ def create_run( ): args = args.model_dump(by_alias=True) args["--user"] = get_username(request) - return run(args, logs, access_token) + try: + created_run = run(args, logs, access_token) + log.debug(created_run) + except Exception as e: + raise HTTPException(status_code=400, detail=str(e)) + return created_run diff --git a/src/teuthology_api/schemas/suite.py b/src/teuthology_api/schemas/suite.py index 409e219..14b4042 100644 --- a/src/teuthology_api/schemas/suite.py +++ b/src/teuthology_api/schemas/suite.py @@ -37,7 +37,7 @@ class SuiteArgs(BaseArgs): default="https://github.com/ceph/ceph-ci.git", alias="--suite_repo" ) teuthology_branch: Union[str, None] = Field( - default="main", alias="--teuthology-branch" + default=None, alias="--teuthology-branch" ) validate_sha1: Union[str, None] = Field(default="true", alias="--validate-sha1") wait: Union[bool, None] = Field(default=False, alias="--wait") 
diff --git a/src/teuthology_api/services/helpers.py b/src/teuthology_api/services/helpers.py index ef7abc3..2bf484f 100644 --- a/src/teuthology_api/services/helpers.py +++ b/src/teuthology_api/services/helpers.py @@ -1,4 +1,4 @@ -from multiprocessing import Process +from multiprocessing import Process, Queue import logging import os import uuid @@ -26,26 +26,44 @@ def logs_run(func, args): _id = str(uuid.uuid4()) archive = Path(ARCHIVE_DIR) log_file = archive / f"{_id}.log" - - teuthology_process = Process(target=_execute_with_logs, args=(func, args, log_file)) - teuthology_process.start() - teuthology_process.join() - + teuth_queue = Queue() + teuth_process = Process( + target=_execute_with_logs, args=(func, args, log_file, teuth_queue) + ) + teuth_process.daemon = True # Set the process as a daemon + teuth_process.start() + teuth_process.join(timeout=180) # Set the timeout value in seconds + if teuth_process.is_alive(): + teuth_process.terminate() # Terminate the process if it exceeds the timeout + teuth_process.join() + raise TimeoutError("Process execution timed out") logs = "" with open(log_file, encoding="utf-8") as file: logs = file.readlines() if os.path.isfile(log_file): os.remove(log_file) - return logs + log.debug(logs) + if teuth_process.exitcode > 0: + e = teuth_queue.get() + log.error(e) + return "fail", e, 0 + else: + job_count = teuth_queue.get() + return "success", logs, job_count -def _execute_with_logs(func, args, log_file): +def _execute_with_logs(func, args, log_file, teuth_queue): """ To store logs, set a new FileHandler for teuthology root logger and then execute the command function. 
""" teuthology.setup_log_file(log_file) - func(args) + try: + job_count = func(args) + teuth_queue.put(job_count) + except Exception as e: + teuth_queue.put(e) + raise def get_run_details(run_name: str): diff --git a/src/teuthology_api/services/suite.py b/src/teuthology_api/services/suite.py index 99d2d6d..43506b1 100644 --- a/src/teuthology_api/services/suite.py +++ b/src/teuthology_api/services/suite.py @@ -20,30 +20,29 @@ def run(args, send_logs: bool, access_token: str): detail="You need to be logged in", headers={"WWW-Authenticate": "Bearer"}, ) - try: - args["--timestamp"] = datetime.now().strftime("%Y-%m-%d_%H:%M:%S") - - logs = logs_run(teuthology.suite.main, args) - - # get run details from paddles - run_name = make_run_name( - { - "machine_type": args["--machine-type"], - "user": args["--user"], - "timestamp": args["--timestamp"], - "suite": args["--suite"], - "ceph_branch": args["--ceph"], - "kernel_branch": args["--kernel"], - "flavor": args["--flavor"], - } - ) - run_details = get_run_details(run_name) - if send_logs or args["--dry-run"]: - return {"run": run_details, "logs": logs} - return {"run": run_details} - except Exception as exc: - log.error("teuthology.suite.main failed with the error: %s", repr(exc)) - raise HTTPException(status_code=500, detail=repr(exc)) from exc + args["--timestamp"] = datetime.now().strftime("%Y-%m-%d_%H:%M:%S") + status, logs, job_count = logs_run(teuthology.suite.main, args) + if status == "fail": + raise logs + if args["--dry-run"] or job_count < 1: + return {"run": {}, "logs": logs, "job_count": job_count} + # get run details from paddles + run_name = make_run_name( + { + "machine_type": args["--machine-type"], + "user": args["--user"], + "timestamp": args["--timestamp"], + "suite": args["--suite"], + "ceph_branch": args["--ceph"], + "kernel_branch": args["--kernel"], + "flavor": args["--flavor"], + } + ) + run_details = get_run_details(run_name) + if send_logs: + return {"run": run_details, "logs": logs, "job_count": 
job_count} + else: + return {"run": run_details, "job_count": job_count} def make_run_name(run_dic):