From f761b926d9bb2e6e1119763a5ffcf951aff0425b Mon Sep 17 00:00:00 2001 From: DavidHuber Date: Fri, 9 Aug 2024 20:58:33 +0000 Subject: [PATCH 1/2] Handle UNAVAILABLE rocoto state --- ci/scripts/utils/rocotostat.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ci/scripts/utils/rocotostat.py b/ci/scripts/utils/rocotostat.py index 70c672f0e8..a9384d450d 100755 --- a/ci/scripts/utils/rocotostat.py +++ b/ci/scripts/utils/rocotostat.py @@ -136,7 +136,7 @@ def rocoto_statcount(rocotostat): rocotostat_output = [line.split()[0:4] for line in rocotostat_output] rocotostat_output = [line for line in rocotostat_output if len(line) != 1] - status_cases = ['SUCCEEDED', 'FAIL', 'DEAD', 'RUNNING', 'SUBMITTING', 'QUEUED'] + status_cases = ['SUCCEEDED', 'FAIL', 'DEAD', 'RUNNING', 'SUBMITTING', 'QUEUED', 'UNAVAILABLE'] rocoto_status = {} status_counts = Counter(case for sublist in rocotostat_output for case in sublist) @@ -217,6 +217,11 @@ def is_stalled(rocoto_status): elif 'UNKNOWN' in rocoto_status: error_return = rocoto_status['UNKNOWN'] rocoto_state = 'UNKNOWN' + elif 'UNAVAILABLE' in rocoto_status: + rocoto_status = attempt_multiple_times(lambda: rocoto_statcount(rocotostat), 2, 120, ProcessError) + if 'UNAVAILABLE' in rocoto_status: + error_return = rocoto_status['UNAVAILABLE'] + rocoto_state = 'UNAVAILABLE' elif is_stalled(rocoto_status): rocoto_status = attempt_multiple_times(lambda: rocoto_statcount(rocotostat), 2, 120, ProcessError) if is_stalled(rocoto_status): From 4c529c3ea770b01e9feca47bcbb71f2d1215d228 Mon Sep 17 00:00:00 2001 From: DavidHuber Date: Tue, 13 Aug 2024 14:33:14 +0000 Subject: [PATCH 2/2] Allow UNKNOWN statuses to retry as well; initialize rocoto_state --- ci/scripts/utils/rocotostat.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/ci/scripts/utils/rocotostat.py b/ci/scripts/utils/rocotostat.py index a9384d450d..4afea5c8b5 100755 --- a/ci/scripts/utils/rocotostat.py +++ b/ci/scripts/utils/rocotostat.py @@ -214,14 +214,16 @@ def is_stalled(rocoto_status): elif rocoto_status['DEAD'] > 0: error_return = rocoto_status['FAIL'] + rocoto_status['DEAD'] rocoto_state = 'FAIL' - elif 'UNKNOWN' in rocoto_status: - error_return = rocoto_status['UNKNOWN'] - rocoto_state = 'UNKNOWN' - elif 'UNAVAILABLE' in rocoto_status: + elif 'UNAVAILABLE' in rocoto_status or 'UNKNOWN' in rocoto_status: rocoto_status = attempt_multiple_times(lambda: rocoto_statcount(rocotostat), 2, 120, ProcessError) + error_return = 0 + rocoto_state = 'RUNNING' if 'UNAVAILABLE' in rocoto_status: error_return = rocoto_status['UNAVAILABLE'] rocoto_state = 'UNAVAILABLE' + if 'UNKNOWN' in rocoto_status: + error_return += rocoto_status['UNKNOWN'] + rocoto_state = 'UNKNOWN' elif is_stalled(rocoto_status): rocoto_status = attempt_multiple_times(lambda: rocoto_statcount(rocotostat), 2, 120, ProcessError) if is_stalled(rocoto_status):