Skip to content

Commit

Permalink
src/teuthology_api/suite: Modify how we handle Error and Success runs
Browse files Browse the repository at this point in the history
The changes include:

1. Make the suite route return
{"run": run_details, "logs": logs, "job_count": job_count}

2. Improve how we handle exceptions by using Queue from the
Python multiprocessing library.

3. Set the timeout for the process to 180 seconds; if teuthology
doesn't respond within that time, we return a process timeout error.

Signed-off-by: Kamoltat Sirivadhna <[email protected]>
  • Loading branch information
kamoltat committed May 10, 2024
1 parent f0a9d59 commit 9acc7cb
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 35 deletions.
7 changes: 6 additions & 1 deletion src/teuthology_api/routes/suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,9 @@ def create_run(
):
args = args.model_dump(by_alias=True)
args["--user"] = get_username(request)
return run(args, logs, access_token)
try:
created_run = run(args, logs, access_token)
log.debug(created_run)
except Exception as e:
raise HTTPException(status_code=400, detail=str(e))
return created_run
2 changes: 1 addition & 1 deletion src/teuthology_api/schemas/suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ class SuiteArgs(BaseArgs):
default="https://github.com/ceph/ceph-ci.git", alias="--suite_repo"
)
teuthology_branch: Union[str, None] = Field(
default="main", alias="--teuthology-branch"
default=None, alias="--teuthology-branch"
)
validate_sha1: Union[str, None] = Field(default="true", alias="--validate-sha1")
wait: Union[bool, None] = Field(default=False, alias="--wait")
Expand Down
36 changes: 27 additions & 9 deletions src/teuthology_api/services/helpers.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from multiprocessing import Process
from multiprocessing import Process, Queue
import logging
import os
import uuid
Expand Down Expand Up @@ -26,26 +26,44 @@ def logs_run(func, args):
_id = str(uuid.uuid4())
archive = Path(ARCHIVE_DIR)
log_file = archive / f"{_id}.log"

teuthology_process = Process(target=_execute_with_logs, args=(func, args, log_file))
teuthology_process.start()
teuthology_process.join()

teuth_queue = Queue()
teuth_process = Process(
target=_execute_with_logs, args=(func, args, log_file, teuth_queue)
)
teuth_process.daemon = True # Set the process as a daemon
teuth_process.start()
teuth_process.join(timeout=180) # Set the timeout value in seconds
if teuth_process.is_alive():
teuth_process.terminate() # Terminate the process if it exceeds the timeout
teuth_process.join()
raise TimeoutError("Process execution timed out")
logs = ""
with open(log_file, encoding="utf-8") as file:
logs = file.readlines()
if os.path.isfile(log_file):
os.remove(log_file)
return logs
log.debug(logs)
if teuth_process.exitcode > 0:
e = teuth_queue.get()
log.error(e)
return "fail", e, 0
else:
job_count = teuth_queue.get()
return "success", logs, job_count


def _execute_with_logs(func, args, log_file):
def _execute_with_logs(func, args, log_file, teuth_queue):
"""
To store logs, set a new FileHandler for teuthology root logger
and then execute the command function.
"""
teuthology.setup_log_file(log_file)
func(args)
try:
job_count = func(args)
teuth_queue.put(job_count)
except Exception as e:
teuth_queue.put(e)
raise


def get_run_details(run_name: str):
Expand Down
47 changes: 23 additions & 24 deletions src/teuthology_api/services/suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,30 +20,29 @@ def run(args, send_logs: bool, access_token: str):
detail="You need to be logged in",
headers={"WWW-Authenticate": "Bearer"},
)
try:
args["--timestamp"] = datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

logs = logs_run(teuthology.suite.main, args)

# get run details from paddles
run_name = make_run_name(
{
"machine_type": args["--machine-type"],
"user": args["--user"],
"timestamp": args["--timestamp"],
"suite": args["--suite"],
"ceph_branch": args["--ceph"],
"kernel_branch": args["--kernel"],
"flavor": args["--flavor"],
}
)
run_details = get_run_details(run_name)
if send_logs or args["--dry-run"]:
return {"run": run_details, "logs": logs}
return {"run": run_details}
except Exception as exc:
log.error("teuthology.suite.main failed with the error: %s", repr(exc))
raise HTTPException(status_code=500, detail=repr(exc)) from exc
args["--timestamp"] = datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
status, logs, job_count = logs_run(teuthology.suite.main, args)
if status == "fail":
raise logs
if args["--dry-run"] or job_count < 1:
return {"run": {}, "logs": logs, "job_count": job_count}
# get run details from paddles
run_name = make_run_name(
{
"machine_type": args["--machine-type"],
"user": args["--user"],
"timestamp": args["--timestamp"],
"suite": args["--suite"],
"ceph_branch": args["--ceph"],
"kernel_branch": args["--kernel"],
"flavor": args["--flavor"],
}
)
run_details = get_run_details(run_name)
if send_logs:
return {"run": run_details, "logs": logs, "job_count": job_count}
else:
return {"run": run_details, "job_count": job_count}


def make_run_name(run_dic):
Expand Down

0 comments on commit 9acc7cb

Please sign in to comment.