From 7c4c1b9a8e00888e47f2e4433f22b6d12da632f6 Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Thu, 16 Nov 2023 11:11:50 +0000 Subject: [PATCH 01/51] wip on simulation broadcastable --- cylc/flow/simulation.py | 15 ++++++++++++++- cylc/flow/task_job_mgr.py | 5 +++++ cylc/flow/task_proxy.py | 3 +++ 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/cylc/flow/simulation.py b/cylc/flow/simulation.py index 15314f8e3e7..5dc4d5325e6 100644 --- a/cylc/flow/simulation.py +++ b/cylc/flow/simulation.py @@ -37,6 +37,19 @@ from cylc.flow.task_proxy import TaskProxy +class ModeSettings: + """A store of state for simulation modes. + + Used instead of modifying the runtime config. + """ + __slots__ = [ + 'simulated_run_length' + ] + + def __init__(self): + self.simulated_run_length: Optional[float] = None + + def configure_sim_modes(taskdefs, sim_mode): """Adjust task defs for simulation and dummy mode. @@ -176,7 +189,7 @@ def sim_time_check( itask.summary['started_time'] = now timeout = ( itask.summary['started_time'] + - itask.tdef.rtconfig['simulation']['simulated run length'] + itask.mode_settings.simulated_run_length ) if now > timeout: job_d = itask.tokens.duplicate(job=str(itask.submit_num)) diff --git a/cylc/flow/task_job_mgr.py b/cylc/flow/task_job_mgr.py index 6651488b4b9..97f8ace6665 100644 --- a/cylc/flow/task_job_mgr.py +++ b/cylc/flow/task_job_mgr.py @@ -63,6 +63,7 @@ get_platform, ) from cylc.flow.remote import construct_ssh_cmd +from cylc.flow.simulation import ModeSettings from cylc.flow.subprocctx import SubProcContext from cylc.flow.subprocpool import SubProcPool from cylc.flow.task_action_timer import ( @@ -997,6 +998,10 @@ def _set_retry_timers( def _simulation_submit_task_jobs(self, itasks, workflow): """Simulation mode task jobs submission.""" for itask in itasks: + if not itask.mode_settings: + itask.mode_settings = ModeSettings() + if not itask.mode_settings.simulated_run_length: + itask.mode_settings.simulated_run_length = itask.tdef.rtconfig['simulation']['simulated run length'] itask.waiting_on_job_prep = False itask.submit_num += 1 self._set_retry_timers(itask) diff --git a/cylc/flow/task_proxy.py b/cylc/flow/task_proxy.py index 0e7fe1d8868..b8859fb6aef 100644 --- a/cylc/flow/task_proxy.py +++ b/cylc/flow/task_proxy.py @@ -186,6 +186,7 @@ class TaskProxy: 'tokens', 'try_timers', 'waiting_on_job_prep', + 'mode_settings', ] def __init__( @@ -266,6 +267,8 @@ def __init__( else: self.graph_children = generate_graph_children(tdef, self.point) + self.mode_settings = None + def __repr__(self) -> str: return f"<{self.__class__.__name__} '{self.tokens}'>" From c5f3a2591cb4916085a4f517492153a2213b8976 Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Fri, 17 Nov 2023 12:47:56 +0000 Subject: [PATCH 02/51] Allow Broadcasting modification of sim mode tasks * Create a mode settings object - contains mode run info. * Test: Sim time check retrieves config on restart. * Make custom outputs happen on failure as well as success. * Check for broadcasts at simulation job submit. Apply suggestions from code review Co-authored-by: Oliver Sanders Response to review Removed get_simulation_time from submit time. response to review save and retreive simulation task start time across reloads make sim mode obey queued/runahead/held status Update cylc/flow/simulation.py Co-authored-by: Ronnie Dutta <61982285+MetRonnie@users.noreply.github.com> Apply suggestions from code review Co-authored-by: Ronnie Dutta <61982285+MetRonnie@users.noreply.github.com> Update tests/integration/test_simulation.py Co-authored-by: Ronnie Dutta <61982285+MetRonnie@users.noreply.github.com> Update tests/integration/test_simulation.py Co-authored-by: Ronnie Dutta <61982285+MetRonnie@users.noreply.github.com> response to review simplified ModeSettings Usage Apply suggestions from code review Co-authored-by: Ronnie Dutta <61982285+MetRonnie@users.noreply.github.com> --- changes.d/5721.feat.md | 1 + cylc/flow/rundb.py | 1 + cylc/flow/scheduler.py | 6 +- cylc/flow/simulation.py | 124 ++++++---- cylc/flow/task_job_mgr.py | 10 +- cylc/flow/task_proxy.py | 4 +- .../functional/modes/04-simulation-runtime.t | 62 +++++ .../modes/04-simulation-runtime/flow.cylc | 20 ++ .../modes/04-simulation-runtime/reference.log | 2 + tests/integration/test_simulation.py | 222 +++++++++++++++--- tests/unit/test_config.py | 28 --- tests/unit/test_simulation.py | 13 +- 12 files changed, 376 insertions(+), 117 deletions(-) create mode 100644 changes.d/5721.feat.md create mode 100644 tests/functional/modes/04-simulation-runtime.t create mode 100644 tests/functional/modes/04-simulation-runtime/flow.cylc create mode 100644 tests/functional/modes/04-simulation-runtime/reference.log diff --git a/changes.d/5721.feat.md b/changes.d/5721.feat.md new file mode 100644 index 00000000000..8562337c063 --- /dev/null +++ b/changes.d/5721.feat.md @@ -0,0 +1 @@ +Allow users to broadcast run_mode to tasks. \ No newline at end of file diff --git a/cylc/flow/rundb.py b/cylc/flow/rundb.py index 256da3b3129..2e281bc7c64 100644 --- a/cylc/flow/rundb.py +++ b/cylc/flow/rundb.py @@ -251,6 +251,7 @@ class CylcWorkflowDAO: ["flow_nums"], ["is_manual_submit", {"datatype": "INTEGER"}], ["try_num", {"datatype": "INTEGER"}], + # This is used to store simulation task start time across restarts. ["time_submit"], ["time_submit_exit"], ["submit_status", {"datatype": "INTEGER"}], diff --git a/cylc/flow/scheduler.py b/cylc/flow/scheduler.py index fb1609ca00a..64c58cd1a01 100644 --- a/cylc/flow/scheduler.py +++ b/cylc/flow/scheduler.py @@ -1747,7 +1747,11 @@ async def main_loop(self) -> None: if ( self.pool.config.run_mode('simulation') and sim_time_check( - self.message_queue, self.pool.get_tasks()) + self.message_queue, + self.pool.get_tasks(), + self.task_events_mgr.broadcast_mgr, + self.workflow_db_mgr, + ) ): # A simulated task state change occurred. self.reset_inactivity_timer() diff --git a/cylc/flow/simulation.py b/cylc/flow/simulation.py index 5dc4d5325e6..42541c431d1 100644 --- a/cylc/flow/simulation.py +++ b/cylc/flow/simulation.py @@ -16,11 +16,16 @@ """Utilities supporting simulation and skip modes """ +from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union from time import time from cylc.flow.cycling.loader import get_point from cylc.flow.network.resolvers import TaskMsg +from cylc.flow.parsec.util import ( + pdeepcopy, + poverride +) from cylc.flow.platforms import FORBIDDEN_WITH_PLATFORM from cylc.flow.task_state import ( TASK_STATUS_RUNNING, @@ -29,25 +34,39 @@ ) from cylc.flow.wallclock import get_current_time_string -from metomi.isodatetime.parsers import DurationParser +from metomi.isodatetime.parsers import DurationParser, TimePointParser if TYPE_CHECKING: from queue import Queue + from cylc.flow.broadcast_mgr import BroadcastMgr from cylc.flow.cycling import PointBase from cylc.flow.task_proxy import TaskProxy + from cylc.flow.workflow_db_mgr import WorkflowDatabaseManager +@dataclass class ModeSettings: """A store of state for simulation modes. Used instead of modifying the runtime config. """ - __slots__ = [ - 'simulated_run_length' - ] - - def __init__(self): - self.simulated_run_length: Optional[float] = None + simulated_run_length: float = 0.0 + sim_task_fails: bool = False + + def __init__(self, itask: 'TaskProxy', broadcast_mgr: 'BroadcastMgr'): + overrides = broadcast_mgr.get_broadcast(itask.tokens) + if overrides: + rtconfig = pdeepcopy(itask.tdef.rtconfig) + poverride(rtconfig, overrides, prepend=True) + else: + rtconfig = itask.tdef.rtconfig + self.simulated_run_length = ( + set_simulated_run_len(rtconfig)) + self.sim_task_fails = sim_task_failed( + rtconfig['simulation'], + itask.point, + itask.submit_num + ) def configure_sim_modes(taskdefs, sim_mode): @@ -59,23 +78,17 @@ def configure_sim_modes(taskdefs, sim_mode): for tdef in taskdefs: # Compute simulated run time by scaling the execution limit. rtc = tdef.rtconfig - sleep_sec = get_simulated_run_len(rtc) - rtc['execution time limit'] = ( - sleep_sec + DurationParser().parse(str( - rtc['simulation']['time limit buffer'])).get_seconds() - ) - - rtc['simulation']['simulated run length'] = sleep_sec rtc['submission retry delays'] = [1] - # Generate dummy scripting. - rtc['init-script'] = "" - rtc['env-script'] = "" - rtc['pre-script'] = "" - rtc['post-script'] = "" - rtc['script'] = build_dummy_script( - rtc, sleep_sec) if dummy_mode else "" + if dummy_mode: + # Generate dummy scripting. + rtc['init-script'] = "" + rtc['env-script'] = "" + rtc['pre-script'] = "" + rtc['post-script'] = "" + rtc['script'] = build_dummy_script( + rtc, set_simulated_run_len(rtc)) disable_platforms(rtc) @@ -89,13 +102,14 @@ def configure_sim_modes(taskdefs, sim_mode): ) -def get_simulated_run_len(rtc: Dict[str, Any]) -> int: - """Get simulated run time. +def set_simulated_run_len(rtc: Dict[str, Any]) -> int: + """Calculate simulation run time from a task's config. rtc = run time config """ limit = rtc['execution time limit'] speedup = rtc['simulation']['speedup factor'] + if limit and speedup: sleep_sec = (DurationParser().parse( str(limit)).get_seconds() / speedup) @@ -170,7 +184,10 @@ def parse_fail_cycle_points( def sim_time_check( - message_queue: 'Queue[TaskMsg]', itasks: 'List[TaskProxy]' + message_queue: 'Queue[TaskMsg]', + itasks: 'List[TaskProxy]', + broadcast_mgr: 'BroadcastMgr', + db_mgr: 'WorkflowDatabaseManager', ) -> bool: """Check if sim tasks have been "running" for as long as required. @@ -179,38 +196,51 @@ def sim_time_check( Returns: True if _any_ simulated task state has changed. """ - sim_task_state_changed = False - now = time() + sim_task_state_changed: bool = False for itask in itasks: - if itask.state.status != TASK_STATUS_RUNNING: + if ( + itask.state.status != TASK_STATUS_RUNNING + or itask.state.is_queued + or itask.state.is_held + or itask.state.is_runahead + ): continue - # Started time is not set on restart - if itask.summary['started_time'] is None: - itask.summary['started_time'] = now - timeout = ( - itask.summary['started_time'] + - itask.mode_settings.simulated_run_length - ) - if now > timeout: + + # Started time and mode_settings are not set on restart: + started_time = itask.summary['started_time'] + if started_time is None: + started_time = int( + TimePointParser() + .parse( + db_mgr.pub_dao.select_task_job( + *itask.tokens.relative_id.split("/") + )["time_submit"] + ) + .seconds_since_unix_epoch + ) + itask.summary['started_time'] = started_time + if itask.mode_settings is None: + itask.mode_settings = ModeSettings(itask, broadcast_mgr) + + timeout = started_time + itask.mode_settings.simulated_run_length + if time() > timeout: job_d = itask.tokens.duplicate(job=str(itask.submit_num)) now_str = get_current_time_string() - if sim_task_failed( - itask.tdef.rtconfig['simulation'], - itask.point, - itask.get_try_num() - ): + + if itask.mode_settings.sim_task_fails: message_queue.put( TaskMsg(job_d, now_str, 'CRITICAL', TASK_STATUS_FAILED) ) else: - # Simulate message outputs. - for msg in itask.tdef.rtconfig['outputs'].values(): - message_queue.put( - TaskMsg(job_d, now_str, 'DEBUG', msg) - ) message_queue.put( TaskMsg(job_d, now_str, 'DEBUG', TASK_STATUS_SUCCEEDED) ) + + # Simulate message outputs. + for msg in itask.tdef.rtconfig['outputs'].values(): + message_queue.put( + TaskMsg(job_d, now_str, 'DEBUG', msg) + ) sim_task_state_changed = True return sim_task_state_changed @@ -218,7 +248,7 @@ def sim_time_check( def sim_task_failed( sim_conf: Dict[str, Any], point: 'PointBase', - try_num: int, + submit_num: int, ) -> bool: """Encapsulate logic for deciding whether a sim task has failed. @@ -228,5 +258,5 @@ def sim_task_failed( sim_conf['fail cycle points'] is None # i.e. "all" or point in sim_conf['fail cycle points'] ) and ( - try_num == 1 or not sim_conf['fail try 1 only'] + submit_num == 0 or not sim_conf['fail try 1 only'] ) diff --git a/cylc/flow/task_job_mgr.py b/cylc/flow/task_job_mgr.py index 97f8ace6665..ea4effa35ff 100644 --- a/cylc/flow/task_job_mgr.py +++ b/cylc/flow/task_job_mgr.py @@ -998,10 +998,8 @@ def _set_retry_timers( def _simulation_submit_task_jobs(self, itasks, workflow): """Simulation mode task jobs submission.""" for itask in itasks: - if not itask.mode_settings: - itask.mode_settings = ModeSettings() - if not itask.mode_settings.simulated_run_length: - itask.mode_settings.simulated_run_length = itask.tdef.rtconfig['simulation']['simulated run length'] + itask.mode_settings = ModeSettings( + itask, self.task_events_mgr.broadcast_mgr) itask.waiting_on_job_prep = False itask.submit_num += 1 self._set_retry_timers(itask) @@ -1009,7 +1007,7 @@ def _simulation_submit_task_jobs(self, itasks, workflow): itask.platform = {'name': 'SIMULATION'} itask.summary['job_runner_name'] = 'SIMULATION' itask.summary[self.KEY_EXECUTE_TIME_LIMIT] = ( - itask.tdef.rtconfig['simulation']['simulated run length'] + itask.mode_settings.simulated_run_length ) itask.jobs.append( self.get_simulation_job_conf(itask, workflow) @@ -1017,6 +1015,8 @@ def _simulation_submit_task_jobs(self, itasks, workflow): self.task_events_mgr.process_message( itask, INFO, TASK_OUTPUT_SUBMITTED, ) + self.workflow_db_mgr.put_insert_task_jobs( + itask, {'time_submit': get_current_time_string()}) return itasks diff --git a/cylc/flow/task_proxy.py b/cylc/flow/task_proxy.py index b8859fb6aef..e413790e6aa 100644 --- a/cylc/flow/task_proxy.py +++ b/cylc/flow/task_proxy.py @@ -48,6 +48,7 @@ if TYPE_CHECKING: from cylc.flow.id import Tokens from cylc.flow.cycling import PointBase + from cylc.flow.simulation import ModeSettings from cylc.flow.task_action_timer import TaskActionTimer from cylc.flow.taskdef import TaskDef @@ -267,7 +268,7 @@ def __init__( else: self.graph_children = generate_graph_children(tdef, self.point) - self.mode_settings = None + self.mode_settings: Optional['ModeSettings'] = None def __repr__(self) -> str: return f"<{self.__class__.__name__} '{self.tokens}'>" @@ -298,6 +299,7 @@ def copy_to_reload_successor(self, reload_successor, check_output): reload_successor.state.is_held = self.state.is_held reload_successor.state.is_runahead = self.state.is_runahead reload_successor.state.is_updated = self.state.is_updated + reload_successor.mode_settings = self.mode_settings # Prerequisites: the graph might have changed before reload, so # we need to use the new prerequisites but update them with the diff --git a/tests/functional/modes/04-simulation-runtime.t b/tests/functional/modes/04-simulation-runtime.t new file mode 100644 index 00000000000..5137e4cbe0d --- /dev/null +++ b/tests/functional/modes/04-simulation-runtime.t @@ -0,0 +1,62 @@ +#!/usr/bin/env bash +# THIS FILE IS PART OF THE CYLC WORKFLOW ENGINE. +# Copyright (C) NIWA & British Crown (Met Office) & Contributors. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +# Test that we can broadcast an alteration to simulation mode. + +. "$(dirname "$0")/test_header" +set_test_number 7 + +install_workflow "${TEST_NAME_BASE}" "${TEST_NAME_BASE}" +run_ok "${TEST_NAME_BASE}-validate" cylc validate "${WORKFLOW_NAME}" +workflow_run_ok "${TEST_NAME_BASE}-start" \ + cylc play "${WORKFLOW_NAME}" --mode=simulation --pause +SCHD_LOG="${WORKFLOW_RUN_DIR}/log/scheduler/log" + +# This broadcast will ensure that second_task will not finish +# in time for the workflow timeout, unless the broadcast +# cancelletion works correctly: +run_ok "second-task-broadcast-too-long" \ + cylc broadcast "${WORKFLOW_NAME}" \ + -n second_task \ + -s 'execution time limit = P1D' + +# Test cancelling broadcast changes the task config back. +run_ok "cancel-second-task-broadcast" \ + cylc broadcast "${WORKFLOW_NAME}" \ + -n second_task\ + --clear + +# If we speed up the simulated first_task task we +# can make it finish before workflow timeout +# (neither change will do this on its own): +run_ok "first-task-speed-up-broadcast" \ + cylc broadcast "${WORKFLOW_NAME}" \ + -n first_task \ + -s '[simulation]speedup factor = 60' \ + -s 'execution time limit = PT60S' + +workflow_run_ok "${TEST_NAME_BASE}-unpause" \ + cylc play "${WORKFLOW_NAME}" + +# Wait for the workflow to finish (it wasn't run in no-detach mode): +poll_workflow_stopped + +# If we hadn't changed the speedup factor using broadcast +# The workflow timeout would have been hit: +grep_fail "WARNING - Orphaned tasks" "${SCHD_LOG}" + +purge diff --git a/tests/functional/modes/04-simulation-runtime/flow.cylc b/tests/functional/modes/04-simulation-runtime/flow.cylc new file mode 100644 index 00000000000..5d2b4e349d5 --- /dev/null +++ b/tests/functional/modes/04-simulation-runtime/flow.cylc @@ -0,0 +1,20 @@ +[scheduler] + [[events]] + workflow timeout = PT30S + +[scheduling] + initial cycle point = 2359 + [[graph]] + R1 = first_task & second_task + +[runtime] + [[first_task]] + execution time limit = P2D + execution retry delays = PT10M + [[[simulation]]] + speedup factor = 1 + + [[second_task]] + execution time limit = PT1S + [[[simulation]]] + speedup factor = 1 diff --git a/tests/functional/modes/04-simulation-runtime/reference.log b/tests/functional/modes/04-simulation-runtime/reference.log new file mode 100644 index 00000000000..2d14bc201fb --- /dev/null +++ b/tests/functional/modes/04-simulation-runtime/reference.log @@ -0,0 +1,2 @@ +23590101T0000Z/get_observations -triggered off [] in flow 1 +23590101T0000Z/get_observations -triggered off [] in flow 1 diff --git a/tests/integration/test_simulation.py b/tests/integration/test_simulation.py index e0ada2c3e49..fe562ba63ce 100644 --- a/tests/integration/test_simulation.py +++ b/tests/integration/test_simulation.py @@ -15,8 +15,8 @@ # along with this program. If not, see . import pytest +from pytest import param from queue import Queue -from types import SimpleNamespace from cylc.flow.cycling.iso8601 import ISO8601Point from cylc.flow.simulation import sim_time_check @@ -30,34 +30,48 @@ def get_msg_queue_item(queue, id_): @pytest.fixture(scope='module') async def sim_time_check_setup( - mod_flow, mod_scheduler, mod_start, mod_one_conf + mod_flow, mod_scheduler, mod_start, mod_one_conf, ): schd = mod_scheduler(mod_flow({ 'scheduler': {'cycle point format': '%Y'}, 'scheduling': { 'initial cycle point': '1066', 'graph': { - 'R1': 'one & fail_all & fast_forward' + 'R1': 'one & fail_all & fast_forward', + 'P1Y': 'fail_once & fail_all_submits' } }, 'runtime': { 'one': {}, 'fail_all': { - 'simulation': {'fail cycle points': 'all'}, + 'simulation': { + 'fail cycle points': 'all', + 'fail try 1 only': False + }, 'outputs': {'foo': 'bar'} }, # This task ought not be finished quickly, but for the speed up 'fast_forward': { 'execution time limit': 'PT1M', 'simulation': {'speedup factor': 2} + }, + 'fail_once': { + 'simulation': { + 'fail cycle points': '1066, 1068', + } + }, + 'fail_all_submits': { + 'simulation': { + 'fail cycle points': '1066', + 'fail try 1 only': False, + } } } })) msg_q = Queue() async with mod_start(schd): itasks = schd.pool.get_tasks() - for i in itasks: - i.try_timers = {'execution-retry': SimpleNamespace(num=0)} + [schd.task_job_mgr._set_retry_timers(i) for i in itasks] yield schd, itasks, msg_q @@ -65,49 +79,109 @@ def test_false_if_not_running(sim_time_check_setup, monkeypatch): schd, itasks, msg_q = sim_time_check_setup # False if task status not running: - assert sim_time_check(msg_q, itasks) is False + assert sim_time_check( + msg_q, itasks, schd.task_events_mgr.broadcast_mgr, '' + ) is False -def test_sim_time_check_sets_started_time(sim_time_check_setup): - """But sim_time_check still returns False""" +@pytest.mark.parametrize( + 'itask, point, results', + ( + # Task fails this CP, first submit. + param( + 'fail_once', '1066', (True, False, False), + id='only-fail-on-submit-1'), + # Task succeeds this CP, all submits. + param( + 'fail_once', '1067', (False, False, False), + id='do-not-fail-this-cp'), + # Task fails this CP, first submit. + param( + 'fail_once', '1068', (True, False, False), + id='and-another-cp'), + # Task fails this CP, all submits. + param( + 'fail_all_submits', '1066', (True, True, True), + id='fail-all-submits'), + # Task succeeds this CP, all submits. + param( + 'fail_all_submits', '1067', (False, False, False), + id='fail-no-submits'), + ) +) +def test_fail_once(sim_time_check_setup, itask, point, results): + """A task with a fail cycle point only fails + at that cycle point, and then only on the first submission. + """ + schd, _, msg_q = sim_time_check_setup + + itask = schd.pool.get_task( + ISO8601Point(point), itask) + + for result in results: + schd.task_job_mgr._simulation_submit_task_jobs( + [itask], schd.workflow) + assert itask.mode_settings.sim_task_fails is result + + +def test_sim_time_check_sets_started_time( + scheduler, sim_time_check_setup +): + """But sim_time_check still returns False + + This only occurs in reality if we've restarted from database and + not retrieved the started time from itask.summary. + """ schd, _, msg_q = sim_time_check_setup one_1066 = schd.pool.get_task(ISO8601Point('1066'), 'one') - one_1066.state.status = 'running' + schd.task_job_mgr._simulation_submit_task_jobs( + [one_1066], schd.workflow) + schd.workflow_db_mgr.process_queued_ops() + one_1066.summary['started_time'] = None + one_1066.state.is_queued = False assert one_1066.summary['started_time'] is None - assert sim_time_check(msg_q, [one_1066]) is False + assert sim_time_check( + msg_q, [one_1066], schd.task_events_mgr.broadcast_mgr, + schd.workflow_db_mgr + ) is False assert one_1066.summary['started_time'] is not None def test_task_finishes(sim_time_check_setup, monkeypatch): """...and an appropriate message sent. - Checks all possible outcomes in sim_time_check where elapsed time is - greater than the simulation time. + Checks that failed and bar are output if a task is set to fail. - Does NOT check every possible cause on an outcome - this is done + Does NOT check every possible cause of an outcome - this is done in unit tests. """ schd, _, msg_q = sim_time_check_setup monkeypatch.setattr('cylc.flow.simulation.time', lambda: 0) - # Setup a task to fail + # Setup a task to fail, submit it. fail_all_1066 = schd.pool.get_task(ISO8601Point('1066'), 'fail_all') fail_all_1066.state.status = 'running' - fail_all_1066.try_timers = {'execution-retry': SimpleNamespace(num=0)} + fail_all_1066.state.is_queued = False + schd.task_job_mgr._simulation_submit_task_jobs( + [fail_all_1066], schd.workflow) + + # For the purpose of the test delete the started time set by + # _simulation_submit_task_jobs. + fail_all_1066.summary['started_time'] = 0 # Before simulation time is up: - assert sim_time_check(msg_q, [fail_all_1066]) is False + assert sim_time_check( + msg_q, [fail_all_1066], schd.task_events_mgr.broadcast_mgr, '' + ) is False - # After simulation time is up: + # Time's up... monkeypatch.setattr('cylc.flow.simulation.time', lambda: 12) - assert sim_time_check(msg_q, [fail_all_1066]) is True - assert get_msg_queue_item(msg_q, '1066/fail_all').message == 'failed' - # Succeeds and records messages for all outputs: - fail_all_1066.try_timers = {'execution-retry': SimpleNamespace(num=1)} - msg_q = Queue() - assert sim_time_check(msg_q, [fail_all_1066]) is True - assert sorted(i.message for i in msg_q.queue) == ['bar', 'succeeded'] + # After simulation time is up it Fails and records custom outputs: + assert sim_time_check( + msg_q, [fail_all_1066], schd.task_events_mgr.broadcast_mgr, '' + ) is True + assert sorted(i.message for i in msg_q.queue) == ['bar', 'failed'] def test_task_sped_up(sim_time_check_setup, monkeypatch): @@ -115,11 +189,101 @@ def test_task_sped_up(sim_time_check_setup, monkeypatch): schd, _, msg_q = sim_time_check_setup fast_forward_1066 = schd.pool.get_task( ISO8601Point('1066'), 'fast_forward') - fast_forward_1066.state.status = 'running' + schd.task_job_mgr._simulation_submit_task_jobs( + [fast_forward_1066], schd.workflow) + + # For the purpose of the test delete the started time set but + # _simulation_submit_task_jobs. + fast_forward_1066.summary['started_time'] = 0 + fast_forward_1066.state.is_queued = False monkeypatch.setattr('cylc.flow.simulation.time', lambda: 0) - assert sim_time_check(msg_q, [fast_forward_1066]) is False + assert sim_time_check( + msg_q, [fast_forward_1066], schd.task_events_mgr.broadcast_mgr, '' + ) is False monkeypatch.setattr('cylc.flow.simulation.time', lambda: 29) - assert sim_time_check(msg_q, [fast_forward_1066]) is False + assert sim_time_check( + msg_q, [fast_forward_1066], schd.task_events_mgr.broadcast_mgr, '' + ) is False monkeypatch.setattr('cylc.flow.simulation.time', lambda: 31) - assert sim_time_check(msg_q, [fast_forward_1066]) is True + assert sim_time_check( + msg_q, [fast_forward_1066], schd.task_events_mgr.broadcast_mgr, '' + ) is True + + +async def test_simulation_mode_settings_restart( + monkeypatch, flow, scheduler, run, start +): + """Check that simulation mode settings are correctly restored + upon restart. + + In the case of start time this is collected from the database + from task_jobs.start_time. + """ + id_ = flow({ + 'scheduler': {'cycle point format': '%Y'}, + 'scheduling': { + 'initial cycle point': '1066', + 'graph': { + 'R1': 'one' + } + }, + 'runtime': { + 'one': { + 'execution time limit': 'PT1M', + 'simulation': { + 'speedup factor': 1 + } + }, + } + }) + schd = scheduler(id_) + msg_q = Queue() + + # Start the workflow: + async with start(schd): + # Pick the task proxy, Mock its start time, set state to running: + itask = schd.pool.get_tasks()[0] + itask.summary['started_time'] = 0 + itask.state.status = 'running' + + # Submit it, then mock the wallclock and assert that it's not finshed. + schd.task_job_mgr._simulation_submit_task_jobs( + [itask], schd.workflow) + monkeypatch.setattr('cylc.flow.simulation.time', lambda: 0) + + assert sim_time_check( + msg_q, [itask], schd.task_events_mgr.broadcast_mgr, + schd.workflow_db_mgr + ) is False + + # Stop and restart the scheduler: + schd = scheduler(id_) + async with start(schd): + # Get our tasks and fix wallclock: + itask = schd.pool.get_tasks()[0] + monkeypatch.setattr('cylc.flow.simulation.time', lambda: 12) + itask.state.status = 'running' + + # Check that we haven't got started time back + assert itask.summary['started_time'] is None + + # Set the start time in the database to 0 to make the + # test simpler: + schd.workflow_db_mgr.put_insert_task_jobs( + itask, {'time_submit': '19700101T0000Z'}) + schd.workflow_db_mgr.process_queued_ops() + + # Set the current time: + monkeypatch.setattr('cylc.flow.simulation.time', lambda: 12) + assert sim_time_check( + msg_q, [itask], schd.task_events_mgr.broadcast_mgr, + schd.workflow_db_mgr + ) is False + + # Set the current time > timeout + monkeypatch.setattr('cylc.flow.simulation.time', lambda: 61) + assert sim_time_check( + msg_q, [itask], schd.task_events_mgr.broadcast_mgr, + schd.workflow_db_mgr + ) is True diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py index 494dd98e62b..98d9fc2f4ce 100644 --- a/tests/unit/test_config.py +++ b/tests/unit/test_config.py @@ -1765,31 +1765,3 @@ def test_cylc_env_at_parsing( assert var in cylc_env else: assert var not in cylc_env - - -def test_configure_sim_mode(caplog): - job_section = {} - sim_section = { - 'speedup factor': '', - 'default run length': 'PT10S', - 'time limit buffer': 'PT0S', - 'fail try 1 only': False, - 'fail cycle points': '', - } - rtconfig_1 = { - 'execution time limit': '', - 'simulation': sim_section, - 'job': job_section, - 'outputs': {}, - } - rtconfig_2 = deepcopy(rtconfig_1) - rtconfig_2['simulation']['default run length'] = 'PT2S' - - taskdefs = [ - SimpleNamespace(rtconfig=rtconfig_1), - SimpleNamespace(rtconfig=rtconfig_2), - ] - configure_sim_modes(taskdefs, 'simulation') - results = [ - i.rtconfig['simulation']['simulated run length'] for i in taskdefs] - assert results == [10.0, 2.0] diff --git a/tests/unit/test_simulation.py b/tests/unit/test_simulation.py index 1c490f35c16..753661a8f0e 100644 --- a/tests/unit/test_simulation.py +++ b/tests/unit/test_simulation.py @@ -24,7 +24,7 @@ parse_fail_cycle_points, build_dummy_script, disable_platforms, - get_simulated_run_len, + set_simulated_run_len, sim_task_failed, ) @@ -38,7 +38,7 @@ param('P1D', 24, 'PT1M', id='speed-up-and-execution-tl'), ) ) -def test_get_simulated_run_len( +def test_set_simulated_run_len( execution_time_limit, speedup_factor, default_run_length ): """Test the logic of the presence or absence of config items. @@ -49,10 +49,11 @@ def test_get_simulated_run_len( 'execution time limit': execution_time_limit, 'simulation': { 'speedup factor': speedup_factor, - 'default run length': default_run_length + 'default run length': default_run_length, + 'time limit buffer': 'PT0S', } } - assert get_simulated_run_len(rtc) == 3600 + assert set_simulated_run_len(rtc) == 3600 @pytest.mark.parametrize( @@ -115,7 +116,7 @@ def test_parse_fail_cycle_points(set_cycling_type): id='defaults' ), param( - {'fail cycle points': None, 'fail try 1 only': True}, + {'fail cycle points': None, 'fail try 1 only': False}, ISO8601Point('1066'), 1, True, @@ -125,7 +126,7 @@ def test_parse_fail_cycle_points(set_cycling_type): { 'fail cycle points': [ ISO8601Point('1066'), ISO8601Point('1067')], - 'fail try 1 only': True + 'fail try 1 only': False }, ISO8601Point('1067'), 1, From 6f5320f8b46760b24aa7ed1f60e6b231286082cc Mon Sep 17 00:00:00 2001 From: WXTIM <26465611+wxtim@users.noreply.github.com> Date: Thu, 18 Jan 2024 10:12:04 +0000 Subject: [PATCH 03/51] return to get_simulation. Cache call to time() --- cylc/flow/simulation.py | 9 +++++---- tests/unit/test_simulation.py | 6 +++--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/cylc/flow/simulation.py b/cylc/flow/simulation.py index 42541c431d1..f34e9d8b260 100644 --- a/cylc/flow/simulation.py +++ b/cylc/flow/simulation.py @@ -61,7 +61,7 @@ def __init__(self, itask: 'TaskProxy', broadcast_mgr: 'BroadcastMgr'): else: rtconfig = itask.tdef.rtconfig self.simulated_run_length = ( - set_simulated_run_len(rtconfig)) + get_simulated_run_len(rtconfig)) self.sim_task_fails = sim_task_failed( rtconfig['simulation'], itask.point, @@ -88,7 +88,7 @@ def configure_sim_modes(taskdefs, sim_mode): rtc['pre-script'] = "" rtc['post-script'] = "" rtc['script'] = build_dummy_script( - rtc, set_simulated_run_len(rtc)) + rtc, get_simulated_run_len(rtc)) disable_platforms(rtc) @@ -102,7 +102,7 @@ def configure_sim_modes(taskdefs, sim_mode): ) -def set_simulated_run_len(rtc: Dict[str, Any]) -> int: +def get_simulated_run_len(rtc: Dict[str, Any]) -> int: """Calculate simulation run time from a task's config. rtc = run time config @@ -196,6 +196,7 @@ def sim_time_check( Returns: True if _any_ simulated task state has changed. """ + now = time() sim_task_state_changed: bool = False for itask in itasks: if ( @@ -223,7 +224,7 @@ def sim_time_check( itask.mode_settings = ModeSettings(itask, broadcast_mgr) timeout = started_time + itask.mode_settings.simulated_run_length - if time() > timeout: + if now > timeout: job_d = itask.tokens.duplicate(job=str(itask.submit_num)) now_str = get_current_time_string() diff --git a/tests/unit/test_simulation.py b/tests/unit/test_simulation.py index 753661a8f0e..920a872503a 100644 --- a/tests/unit/test_simulation.py +++ b/tests/unit/test_simulation.py @@ -24,7 +24,7 @@ parse_fail_cycle_points, build_dummy_script, disable_platforms, - set_simulated_run_len, + get_simulated_run_len, sim_task_failed, ) @@ -38,7 +38,7 @@ param('P1D', 24, 'PT1M', id='speed-up-and-execution-tl'), ) ) -def test_set_simulated_run_len( +def test_get_simulated_run_len( execution_time_limit, speedup_factor, default_run_length ): """Test the logic of the presence or absence of config items. @@ -53,7 +53,7 @@ def test_set_simulated_run_len( 'time limit buffer': 'PT0S', } } - assert set_simulated_run_len(rtc) == 3600 + assert get_simulated_run_len(rtc) == 3600 @pytest.mark.parametrize( From 00c29a5b0118c4d40c3715265a6ba9dcde780679 Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Fri, 19 Jan 2024 09:28:30 +0000 Subject: [PATCH 04/51] Cached timeout time for sim tasks on the sim_modes object. --- cylc/flow/simulation.py | 43 +++++++++++++++++----------- cylc/flow/task_job_mgr.py | 2 ++ tests/integration/test_simulation.py | 42 ++++++++++++++++----------- 3 files changed, 54 insertions(+), 33 deletions(-) diff --git a/cylc/flow/simulation.py b/cylc/flow/simulation.py index f34e9d8b260..9fe3291277f 100644 --- a/cylc/flow/simulation.py +++ b/cylc/flow/simulation.py @@ -53,7 +53,12 @@ class ModeSettings: simulated_run_length: float = 0.0 sim_task_fails: bool = False - def __init__(self, itask: 'TaskProxy', broadcast_mgr: 'BroadcastMgr'): + def __init__( + self, + itask: 'TaskProxy', + broadcast_mgr: 'BroadcastMgr', + db_mgr: 'WorkflowDatabaseManager' = None + ): overrides = broadcast_mgr.get_broadcast(itask.tokens) if overrides: rtconfig = pdeepcopy(itask.tdef.rtconfig) @@ -68,6 +73,22 @@ def __init__(self, itask: 'TaskProxy', broadcast_mgr: 'BroadcastMgr'): itask.submit_num ) + # itask.summary['started_time'] and mode_settings.timeout need + # repopulating from the DB on workflow restart: + started_time = itask.summary['started_time'] + if started_time is None: + started_time = int( + TimePointParser() + .parse( + db_mgr.pub_dao.select_task_job( + *itask.tokens.relative_id.split("/") + )["time_submit"] + ) + .seconds_since_unix_epoch + ) + itask.summary['started_time'] = started_time + + self.timeout = started_time + self.simulated_run_length def configure_sim_modes(taskdefs, sim_mode): """Adjust task defs for simulation and dummy mode. @@ -207,24 +228,12 @@ def sim_time_check( ): continue - # Started time and mode_settings are not set on restart: - started_time = itask.summary['started_time'] - if started_time is None: - started_time = int( - TimePointParser() - .parse( - db_mgr.pub_dao.select_task_job( - *itask.tokens.relative_id.split("/") - )["time_submit"] - ) - .seconds_since_unix_epoch - ) - itask.summary['started_time'] = started_time + + if itask.mode_settings is None: - itask.mode_settings = ModeSettings(itask, broadcast_mgr) + itask.mode_settings = ModeSettings(itask, broadcast_mgr, db_mgr) - timeout = started_time + itask.mode_settings.simulated_run_length - if now > timeout: + if now > itask.mode_settings.timeout: job_d = itask.tokens.duplicate(job=str(itask.submit_num)) now_str = get_current_time_string() diff --git a/cylc/flow/task_job_mgr.py b/cylc/flow/task_job_mgr.py index ea4effa35ff..268b2c635da 100644 --- a/cylc/flow/task_job_mgr.py +++ b/cylc/flow/task_job_mgr.py @@ -997,7 +997,9 @@ def _set_retry_timers( def _simulation_submit_task_jobs(self, itasks, workflow): """Simulation mode task jobs submission.""" + now = time() for itask in itasks: + itask.summary['started_time'] = now itask.mode_settings = ModeSettings( itask, self.task_events_mgr.broadcast_mgr) itask.waiting_on_job_prep = False diff --git a/tests/integration/test_simulation.py b/tests/integration/test_simulation.py index fe562ba63ce..fcdf2435afe 100644 --- a/tests/integration/test_simulation.py +++ b/tests/integration/test_simulation.py @@ -28,6 +28,15 @@ def get_msg_queue_item(queue, id_): return item +@pytest.fixture +def monkeytime(monkeypatch): + """Convenience function monkeypatching time.""" + def _inner(time_: int): + monkeypatch.setattr('cylc.flow.task_job_mgr.time', lambda: time_) + monkeypatch.setattr('cylc.flow.simulation.time', lambda: time_) + return _inner + + @pytest.fixture(scope='module') async def sim_time_check_setup( mod_flow, mod_scheduler, mod_start, mod_one_conf, @@ -134,11 +143,13 @@ def test_sim_time_check_sets_started_time( """ schd, _, msg_q = sim_time_check_setup one_1066 = schd.pool.get_task(ISO8601Point('1066'), 'one') + # Add info to databse as if it's be started before shutdown: schd.task_job_mgr._simulation_submit_task_jobs( [one_1066], schd.workflow) schd.workflow_db_mgr.process_queued_ops() one_1066.summary['started_time'] = None one_1066.state.is_queued = False + one_1066.mode_settings = None assert one_1066.summary['started_time'] is None assert sim_time_check( msg_q, [one_1066], schd.task_events_mgr.broadcast_mgr, @@ -147,7 +158,7 @@ def test_sim_time_check_sets_started_time( assert one_1066.summary['started_time'] is not None -def test_task_finishes(sim_time_check_setup, monkeypatch): +def test_task_finishes(sim_time_check_setup, monkeytime): """...and an appropriate message sent. Checks that failed and bar are output if a task is set to fail. @@ -156,7 +167,7 @@ def test_task_finishes(sim_time_check_setup, monkeypatch): in unit tests. """ schd, _, msg_q = sim_time_check_setup - monkeypatch.setattr('cylc.flow.simulation.time', lambda: 0) + monkeytime(0) # Setup a task to fail, submit it. fail_all_1066 = schd.pool.get_task(ISO8601Point('1066'), 'fail_all') @@ -175,7 +186,7 @@ def test_task_finishes(sim_time_check_setup, monkeypatch): ) is False # Time's up... - monkeypatch.setattr('cylc.flow.simulation.time', lambda: 12) + monkeytime(12) # After simulation time is up it Fails and records custom outputs: assert sim_time_check( @@ -184,35 +195,34 @@ def test_task_finishes(sim_time_check_setup, monkeypatch): assert sorted(i.message for i in msg_q.queue) == ['bar', 'failed'] -def test_task_sped_up(sim_time_check_setup, monkeypatch): +def test_task_sped_up(sim_time_check_setup, monkeytime): """Task will speed up by a factor set in config.""" + schd, _, msg_q = sim_time_check_setup fast_forward_1066 = schd.pool.get_task( ISO8601Point('1066'), 'fast_forward') + + # Run the job submission method: + monkeytime(0) schd.task_job_mgr._simulation_submit_task_jobs( [fast_forward_1066], schd.workflow) - - # For the purpose of the test delete the started time set but - # _simulation_submit_task_jobs. - fast_forward_1066.summary['started_time'] = 0 fast_forward_1066.state.is_queued = False - monkeypatch.setattr('cylc.flow.simulation.time', lambda: 0) assert sim_time_check( msg_q, [fast_forward_1066], schd.task_events_mgr.broadcast_mgr, '' ) is False - monkeypatch.setattr('cylc.flow.simulation.time', lambda: 29) + monkeytime(29) assert sim_time_check( msg_q, [fast_forward_1066], schd.task_events_mgr.broadcast_mgr, '' ) is False - monkeypatch.setattr('cylc.flow.simulation.time', lambda: 31) + monkeytime(31) assert sim_time_check( msg_q, [fast_forward_1066], schd.task_events_mgr.broadcast_mgr, '' ) is True async def test_simulation_mode_settings_restart( - monkeypatch, flow, scheduler, run, start + monkeytime, flow, scheduler, run, start ): """Check that simulation mode settings are correctly restored upon restart. @@ -250,7 +260,7 @@ async def test_simulation_mode_settings_restart( # Submit it, then mock the wallclock and assert that it's not finshed. schd.task_job_mgr._simulation_submit_task_jobs( [itask], schd.workflow) - monkeypatch.setattr('cylc.flow.simulation.time', lambda: 0) + monkeytime(0) assert sim_time_check( msg_q, [itask], schd.task_events_mgr.broadcast_mgr, @@ -262,7 +272,7 @@ async def test_simulation_mode_settings_restart( async with start(schd): # Get our tasks and fix wallclock: itask = schd.pool.get_tasks()[0] - monkeypatch.setattr('cylc.flow.simulation.time', lambda: 12) + monkeytime(12) itask.state.status = 'running' # Check that we haven't got started time back @@ -275,14 +285,14 @@ async def test_simulation_mode_settings_restart( schd.workflow_db_mgr.process_queued_ops() # Set the current time: - monkeypatch.setattr('cylc.flow.simulation.time', lambda: 12) + monkeytime(12) assert sim_time_check( msg_q, [itask], schd.task_events_mgr.broadcast_mgr, schd.workflow_db_mgr ) is False # Set the current time > timeout - monkeypatch.setattr('cylc.flow.simulation.time', lambda: 61) + monkeytime(61) assert sim_time_check( msg_q, [itask], schd.task_events_mgr.broadcast_mgr, schd.workflow_db_mgr From f6b13b18480ba498d2cd0f17b6a2b1aeb3dd32f4 Mon Sep 17 00:00:00 2001 From: WXTIM <26465611+wxtim@users.noreply.github.com> Date: Mon, 22 Jan 2024 10:22:16 +0000 Subject: [PATCH 05/51] de-flake tests --- tests/integration/test_simulation.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_simulation.py b/tests/integration/test_simulation.py index fcdf2435afe..cbb03c81f93 100644 --- a/tests/integration/test_simulation.py +++ b/tests/integration/test_simulation.py @@ -37,6 +37,16 @@ def _inner(time_: int): return _inner +@pytest.fixture +def q_clean(): + """Clear message queue to revent test interference. + """ + def _inner(msg_q): + if not msg_q.empty(): + msg_q.get() + return _inner + + @pytest.fixture(scope='module') async def sim_time_check_setup( mod_flow, mod_scheduler, mod_start, mod_one_conf, @@ -84,9 +94,13 @@ async def sim_time_check_setup( yield schd, itasks, msg_q -def test_false_if_not_running(sim_time_check_setup, monkeypatch): +def test_false_if_not_running( + sim_time_check_setup, monkeypatch, q_clean +): schd, itasks, msg_q = sim_time_check_setup + itasks = [i for i in itasks if i.state.status != 'running'] + # False if task status not running: assert sim_time_check( msg_q, itasks, schd.task_events_mgr.broadcast_mgr, '' @@ -158,7 +172,7 @@ def test_sim_time_check_sets_started_time( assert one_1066.summary['started_time'] is not None -def test_task_finishes(sim_time_check_setup, monkeytime): +def test_task_finishes(sim_time_check_setup, monkeytime, q_clean): """...and an appropriate message sent. Checks that failed and bar are output if a task is set to fail. @@ -167,6 +181,9 @@ def test_task_finishes(sim_time_check_setup, monkeytime): in unit tests. """ schd, _, msg_q = sim_time_check_setup + + q_clean(msg_q) + monkeytime(0) # Setup a task to fail, submit it. From ff6037c3a018ef2035f914857696d1ac435103cf Mon Sep 17 00:00:00 2001 From: WXTIM <26465611+wxtim@users.noreply.github.com> Date: Mon, 29 Jan 2024 10:01:55 +0000 Subject: [PATCH 06/51] Ensure that mode_settings are deleted from the task proxy when a psuedo-job finishes. Add test for workflow reload. --- cylc/flow/simulation.py | 8 ++- tests/integration/test_simulation.py | 102 ++++++++++++++++++++++++++- 2 files changed, 105 insertions(+), 5 deletions(-) diff --git a/cylc/flow/simulation.py b/cylc/flow/simulation.py index 9fe3291277f..02ff53d04d8 100644 --- a/cylc/flow/simulation.py +++ b/cylc/flow/simulation.py @@ -90,6 +90,7 @@ def __init__( self.timeout = started_time + self.simulated_run_length + def configure_sim_modes(taskdefs, sim_mode): """Adjust task defs for simulation and dummy mode. @@ -219,6 +220,7 @@ def sim_time_check( """ now = time() sim_task_state_changed: bool = False + for itask in itasks: if ( itask.state.status != TASK_STATUS_RUNNING @@ -228,8 +230,6 @@ def sim_time_check( ): continue - - if itask.mode_settings is None: itask.mode_settings = ModeSettings(itask, broadcast_mgr, db_mgr) @@ -251,7 +251,11 @@ def sim_time_check( message_queue.put( TaskMsg(job_d, now_str, 'DEBUG', msg) ) + + # We've finished this psuedojob, so delete all the mode settings. + itask.mode_settings = None sim_task_state_changed = True + itask.mode_settings = None return sim_task_state_changed diff --git a/tests/integration/test_simulation.py b/tests/integration/test_simulation.py index cbb03c81f93..89585ba5afb 100644 --- a/tests/integration/test_simulation.py +++ b/tests/integration/test_simulation.py @@ -14,6 +14,7 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . +from pathlib import Path import pytest from pytest import param from queue import Queue @@ -239,7 +240,7 @@ def test_task_sped_up(sim_time_check_setup, monkeytime): async def test_simulation_mode_settings_restart( - monkeytime, flow, scheduler, run, start + monkeytime, flow, scheduler, start ): """Check that simulation mode settings are correctly restored upon restart. @@ -258,8 +259,11 @@ async def test_simulation_mode_settings_restart( 'runtime': { 'one': { 'execution time limit': 'PT1M', + 'execution retry delays': 'P0Y', 'simulation': { - 'speedup factor': 1 + 'speedup factor': 1, + 'fail cycle points': 'all', + 'fail try 1 only': True, } }, } @@ -292,8 +296,9 @@ async def test_simulation_mode_settings_restart( monkeytime(12) itask.state.status = 'running' - # Check that we haven't got started time back + # Check that we haven't got started time & mode settings back: assert itask.summary['started_time'] is None + assert itask.mode_settings is None # Set the start time in the database to 0 to make the # test simpler: @@ -308,9 +313,100 @@ async def test_simulation_mode_settings_restart( schd.workflow_db_mgr ) is False + # Check that the itask.mode_settings is now re-created + assert itask.mode_settings.__dict__ == { + 'simulated_run_length': 60.0, + 'sim_task_fails': False, + 'timeout': 60.0 + } + # Set the current time > timeout monkeytime(61) assert sim_time_check( msg_q, [itask], schd.task_events_mgr.broadcast_mgr, schd.workflow_db_mgr ) is True + + assert itask.mode_settings is None + + schd.task_events_mgr.broadcast_mgr.put_broadcast( + ['1066'], ['one'], [{ + 'execution time limit': 'PT1S'}]) + + assert itask.mode_settings is None + + schd.task_job_mgr._simulation_submit_task_jobs( + [itask], schd.workflow) + + assert itask.submit_num == 2 + assert itask.mode_settings.__dict__ == { + 'simulated_run_length': 1.0, + 'sim_task_fails': False, + 'timeout': 62.0 + } + + +async def test_simulation_mode_settings_reload( + monkeytime, flow, scheduler, start +): + """Check that simulation mode settings are changed for future + pseudo jobs on reload. + + """ + def sim_run(schd): + """Submit and run a psuedo-job. + """ + # Get the only task proxy, submit the psuedo job: + itask = schd.pool.get_tasks()[0] + itask.state.is_queued = False + monkeytime(0) + schd.task_job_mgr._simulation_submit_task_jobs( + [itask], schd.workflow) + monkeytime(61) + + # Run Time Check + assert sim_time_check( + schd.message_queue, [itask], schd.task_events_mgr.broadcast_mgr, + schd.workflow_db_mgr + ) is True + + # Capture result process queue. + out = schd.message_queue.queue[0].message + schd.process_queued_task_messages() + return out + + id_ = flow({ + 'scheduler': {'cycle point format': '%Y'}, + 'scheduling': { + 'initial cycle point': '1066', + 'graph': { + 'R1': 'one' + } + }, + 'runtime': { + 'one': { + 'execution time limit': 'PT1M', + 'execution retry delays': 'P0Y', + 'simulation': { + 'speedup factor': 1, + 'fail cycle points': 'all', + 'fail try 1 only': False, + } + }, + } + }) + schd = scheduler(id_) + async with start(schd): + # Submit first psuedo-job and "run" to failure: + assert sim_run(schd) == 'failed' + + # Modify config as if reinstall had taken place: + conf_file = Path(schd.workflow_run_dir) / 'flow.cylc' + conf_file.write_text( + conf_file.read_text().replace('False', 'True')) + + # Reload Workflow: + await schd.command_reload_workflow() + + # Submit second psuedo-job and "run" to success: + assert sim_run(schd) == 'succeeded' From 0cd47a97f6c7f050ca68beaa280bbff20a13783d Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Tue, 30 Jan 2024 11:53:23 +0000 Subject: [PATCH 07/51] rationalize tests --- cylc/flow/simulation.py | 4 +- tests/integration/test_simulation.py | 96 +++++++++++++--------------- 2 files changed, 47 insertions(+), 53 deletions(-) diff --git a/cylc/flow/simulation.py b/cylc/flow/simulation.py index 02ff53d04d8..da6d7fb4c57 100644 --- a/cylc/flow/simulation.py +++ b/cylc/flow/simulation.py @@ -57,7 +57,7 @@ def __init__( self, itask: 'TaskProxy', broadcast_mgr: 'BroadcastMgr', - db_mgr: 'WorkflowDatabaseManager' = None + db_mgr: 'Optional[WorkflowDatabaseManager]' = None ): overrides = broadcast_mgr.get_broadcast(itask.tokens) if overrides: @@ -76,7 +76,7 @@ def __init__( # itask.summary['started_time'] and mode_settings.timeout need # repopulating from the DB on workflow restart: started_time = itask.summary['started_time'] - if started_time is None: + if started_time is None and db_mgr: started_time = int( TimePointParser() .parse( diff --git a/tests/integration/test_simulation.py b/tests/integration/test_simulation.py index 89585ba5afb..f7aaef79c0d 100644 --- a/tests/integration/test_simulation.py +++ b/tests/integration/test_simulation.py @@ -16,7 +16,7 @@ from pathlib import Path import pytest -from pytest import param +from pytest import UsageError, param from queue import Queue from cylc.flow.cycling.iso8601 import ISO8601Point @@ -48,6 +48,47 @@ def _inner(msg_q): return _inner +@pytest.fixture +def run_simjob(monkeytime): + """Run a simulated job to completion. + + Returns the output status. + """ + def _inner(schd, point=None, task=None): + # Get the only task proxy, submit the psuedo job: + if task and point: + itask = schd.pool.get_task(point, task) + elif task or point: + raise UsageError( + 'run_simjob requires either a task and point, or neither.') + else: + itasks = schd.pool.get_tasks() + if len(itasks) != 1: + raise UsageError( + 'run_simjob cannot needs a task and point if more ' + 'than one task is in the task pool.') + else: + itask, = itasks + + itask.state.is_queued = False + monkeytime(0) + schd.task_job_mgr._simulation_submit_task_jobs( + [itask], schd.workflow) + monkeytime(itask.mode_settings.timeout + 1) + + # Run Time Check + assert sim_time_check( + schd.message_queue, [itask], schd.task_events_mgr.broadcast_mgr, + schd.workflow_db_mgr + ) is True + + # Capture result process queue. + out = schd.message_queue.queue[0].message + schd.process_queued_task_messages() + return out + return _inner + + @pytest.fixture(scope='module') async def sim_time_check_setup( mod_flow, mod_scheduler, mod_start, mod_one_conf, @@ -148,31 +189,6 @@ def test_fail_once(sim_time_check_setup, itask, point, results): assert itask.mode_settings.sim_task_fails is result -def test_sim_time_check_sets_started_time( - scheduler, sim_time_check_setup -): - """But sim_time_check still returns False - - This only occurs in reality if we've restarted from database and - not retrieved the started time from itask.summary. - """ - schd, _, msg_q = sim_time_check_setup - one_1066 = schd.pool.get_task(ISO8601Point('1066'), 'one') - # Add info to databse as if it's be started before shutdown: - schd.task_job_mgr._simulation_submit_task_jobs( - [one_1066], schd.workflow) - schd.workflow_db_mgr.process_queued_ops() - one_1066.summary['started_time'] = None - one_1066.state.is_queued = False - one_1066.mode_settings = None - assert one_1066.summary['started_time'] is None - assert sim_time_check( - msg_q, [one_1066], schd.task_events_mgr.broadcast_mgr, - schd.workflow_db_mgr - ) is False - assert one_1066.summary['started_time'] is not None - - def test_task_finishes(sim_time_check_setup, monkeytime, q_clean): """...and an appropriate message sent. @@ -347,34 +363,12 @@ async def test_simulation_mode_settings_restart( async def test_simulation_mode_settings_reload( - monkeytime, flow, scheduler, start + flow, scheduler, start, run_simjob ): """Check that simulation mode settings are changed for future pseudo jobs on reload. """ - def sim_run(schd): - """Submit and run a psuedo-job. - """ - # Get the only task proxy, submit the psuedo job: - itask = schd.pool.get_tasks()[0] - itask.state.is_queued = False - monkeytime(0) - schd.task_job_mgr._simulation_submit_task_jobs( - [itask], schd.workflow) - monkeytime(61) - - # Run Time Check - assert sim_time_check( - schd.message_queue, [itask], schd.task_events_mgr.broadcast_mgr, - schd.workflow_db_mgr - ) is True - - # Capture result process queue. - out = schd.message_queue.queue[0].message - schd.process_queued_task_messages() - return out - id_ = flow({ 'scheduler': {'cycle point format': '%Y'}, 'scheduling': { @@ -398,7 +392,7 @@ def sim_run(schd): schd = scheduler(id_) async with start(schd): # Submit first psuedo-job and "run" to failure: - assert sim_run(schd) == 'failed' + assert run_simjob(schd) == 'failed' # Modify config as if reinstall had taken place: conf_file = Path(schd.workflow_run_dir) / 'flow.cylc' @@ -409,4 +403,4 @@ def sim_run(schd): await schd.command_reload_workflow() # Submit second psuedo-job and "run" to success: - assert sim_run(schd) == 'succeeded' + assert run_simjob(schd) == 'succeeded' From c356bdfe50f656b9e2036f393e0e08263ba711f4 Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Wed, 31 Jan 2024 15:40:54 +0000 Subject: [PATCH 08/51] r2r --- cylc/flow/simulation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cylc/flow/simulation.py b/cylc/flow/simulation.py index da6d7fb4c57..91649d8868e 100644 --- a/cylc/flow/simulation.py +++ b/cylc/flow/simulation.py @@ -70,7 +70,7 @@ def __init__( self.sim_task_fails = sim_task_failed( rtconfig['simulation'], itask.point, - itask.submit_num + itask.get_try_num() ) # itask.summary['started_time'] and mode_settings.timeout need @@ -234,7 +234,7 @@ def sim_time_check( itask.mode_settings = ModeSettings(itask, broadcast_mgr, db_mgr) if now > itask.mode_settings.timeout: - job_d = itask.tokens.duplicate(job=str(itask.submit_num)) + job_d = itask.tokens.duplicate(job=str(itask.get_try_num())) now_str = get_current_time_string() if itask.mode_settings.sim_task_fails: @@ -262,7 +262,7 @@ def sim_time_check( def sim_task_failed( sim_conf: Dict[str, Any], point: 'PointBase', - submit_num: int, + try_num: int, ) -> bool: """Encapsulate logic for deciding whether a sim task has failed. @@ -272,5 +272,5 @@ def sim_task_failed( sim_conf['fail cycle points'] is None # i.e. "all" or point in sim_conf['fail cycle points'] ) and ( - submit_num == 0 or not sim_conf['fail try 1 only'] + try_num == 0 or not sim_conf['fail try 1 only'] ) From d6e5aef9f3f54f4e65071b424d94aa371bcfae95 Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Thu, 1 Feb 2024 13:10:14 +0000 Subject: [PATCH 09/51] Save correct datetime format to the DB. --- cylc/flow/simulation.py | 19 +++++++------------ cylc/flow/task_job_mgr.py | 4 +++- tests/integration/test_simulation.py | 7 ++++--- 3 files changed, 14 insertions(+), 16 deletions(-) diff --git a/cylc/flow/simulation.py b/cylc/flow/simulation.py index 91649d8868e..ead5148e61d 100644 --- a/cylc/flow/simulation.py +++ b/cylc/flow/simulation.py @@ -32,9 +32,10 @@ TASK_STATUS_FAILED, TASK_STATUS_SUCCEEDED, ) -from cylc.flow.wallclock import get_current_time_string +from cylc.flow.wallclock import ( + get_current_time_string, get_unix_time_from_time_string) -from metomi.isodatetime.parsers import DurationParser, TimePointParser +from metomi.isodatetime.parsers import DurationParser if TYPE_CHECKING: from queue import Queue @@ -77,17 +78,11 @@ def __init__( # repopulating from the DB on workflow restart: started_time = itask.summary['started_time'] if started_time is None and db_mgr: - started_time = int( - TimePointParser() - .parse( - db_mgr.pub_dao.select_task_job( - *itask.tokens.relative_id.split("/") - )["time_submit"] - ) - .seconds_since_unix_epoch - ) + started_time_str = db_mgr.pub_dao.select_task_job( + *itask.tokens.relative_id.split("/"))["time_submit"] + started_time = get_unix_time_from_time_string( + started_time_str) itask.summary['started_time'] = started_time - self.timeout = started_time + self.simulated_run_length diff --git a/cylc/flow/task_job_mgr.py b/cylc/flow/task_job_mgr.py index 268b2c635da..58677b6c1c1 100644 --- a/cylc/flow/task_job_mgr.py +++ b/cylc/flow/task_job_mgr.py @@ -107,6 +107,7 @@ ) from cylc.flow.wallclock import ( get_current_time_string, + get_time_string_from_unix_time, get_utc_mode ) from cylc.flow.cfgspec.globalcfg import SYSPATH @@ -998,6 +999,7 @@ def _set_retry_timers( def _simulation_submit_task_jobs(self, itasks, workflow): """Simulation mode task jobs submission.""" now = time() + now_str = get_time_string_from_unix_time(now) for itask in itasks: itask.summary['started_time'] = now itask.mode_settings = ModeSettings( @@ -1018,7 +1020,7 @@ def _simulation_submit_task_jobs(self, itasks, workflow): itask, INFO, TASK_OUTPUT_SUBMITTED, ) self.workflow_db_mgr.put_insert_task_jobs( - itask, {'time_submit': get_current_time_string()}) + itask, {'time_submit': now_str}) return itasks diff --git a/tests/integration/test_simulation.py b/tests/integration/test_simulation.py index f7aaef79c0d..07d023dec53 100644 --- a/tests/integration/test_simulation.py +++ b/tests/integration/test_simulation.py @@ -174,7 +174,7 @@ def test_false_if_not_running( id='fail-no-submits'), ) ) -def test_fail_once(sim_time_check_setup, itask, point, results): +def test_fail_once(sim_time_check_setup, itask, point, results, monkeypatch): """A task with a fail cycle point only fails at that cycle point, and then only on the first submission. """ @@ -183,7 +183,8 @@ def test_fail_once(sim_time_check_setup, itask, point, results): itask = schd.pool.get_task( ISO8601Point(point), itask) - for result in results: + for i, result in enumerate(results): + itask.try_timers['execution-retry'].num = i - 1 schd.task_job_mgr._simulation_submit_task_jobs( [itask], schd.workflow) assert itask.mode_settings.sim_task_fails is result @@ -319,7 +320,7 @@ async def test_simulation_mode_settings_restart( # Set the start time in the database to 0 to make the # test simpler: schd.workflow_db_mgr.put_insert_task_jobs( - itask, {'time_submit': '19700101T0000Z'}) + itask, {'time_submit': '1970-01-01T00:00:00Z'}) schd.workflow_db_mgr.process_queued_ops() # Set the current time: From 695e5a84953a31683fd6c4389f6f9dfdc9df3aa8 Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Tue, 6 Feb 2024 13:02:57 +0000 Subject: [PATCH 10/51] Response to review - Re-order the ModeSettings.__init__ method to allow db loading before setting the start time. - Add `try_num` to database. - Fix/streamline the tests - add explicit broadcast test. --- cylc/flow/simulation.py | 46 ++++++----- cylc/flow/task_job_mgr.py | 10 ++- tests/integration/test_simulation.py | 118 +++++++++++++++------------ 3 files changed, 101 insertions(+), 73 deletions(-) diff --git a/cylc/flow/simulation.py b/cylc/flow/simulation.py index ead5148e61d..164b4cbeece 100644 --- a/cylc/flow/simulation.py +++ b/cylc/flow/simulation.py @@ -53,6 +53,7 @@ class ModeSettings: """ simulated_run_length: float = 0.0 sim_task_fails: bool = False + timeout: float = 0.0 def __init__( self, @@ -60,29 +61,42 @@ def __init__( broadcast_mgr: 'BroadcastMgr', db_mgr: 'Optional[WorkflowDatabaseManager]' = None ): + + # itask.summary['started_time'] and mode_settings.timeout need + # repopulating from the DB on workflow restart: + started_time = itask.summary['started_time'] + try_num = None + if started_time is None and db_mgr: + # Get DB info + db_info = db_mgr.pub_dao.select_task_job( + *itask.tokens.relative_id.split("/")) + + # Get the started time: + started_time = get_unix_time_from_time_string( + db_info["time_submit"]) + itask.summary['started_time'] = started_time + + # Get the try number: + try_num = db_info["try_num"] + + # Update anything changed by broadcast: overrides = broadcast_mgr.get_broadcast(itask.tokens) if overrides: rtconfig = pdeepcopy(itask.tdef.rtconfig) poverride(rtconfig, overrides, prepend=True) else: rtconfig = itask.tdef.rtconfig + + # Calculate simulation info: self.simulated_run_length = ( get_simulated_run_len(rtconfig)) self.sim_task_fails = sim_task_failed( rtconfig['simulation'], itask.point, - itask.get_try_num() + try_num or itask.get_try_num() ) - - # itask.summary['started_time'] and mode_settings.timeout need - # repopulating from the DB on workflow restart: - started_time = itask.summary['started_time'] - if started_time is None and db_mgr: - started_time_str = db_mgr.pub_dao.select_task_job( - *itask.tokens.relative_id.split("/"))["time_submit"] - started_time = get_unix_time_from_time_string( - started_time_str) - itask.summary['started_time'] = started_time + from cylc.flow import LOG + LOG.critical(try_num or itask.get_try_num()) self.timeout = started_time + self.simulated_run_length @@ -217,12 +231,7 @@ def sim_time_check( sim_task_state_changed: bool = False for itask in itasks: - if ( - itask.state.status != TASK_STATUS_RUNNING - or itask.state.is_queued - or itask.state.is_held - or itask.state.is_runahead - ): + if itask.state.status != TASK_STATUS_RUNNING: continue if itask.mode_settings is None: @@ -250,7 +259,6 @@ def sim_time_check( # We've finished this psuedojob, so delete all the mode settings. itask.mode_settings = None sim_task_state_changed = True - itask.mode_settings = None return sim_task_state_changed @@ -267,5 +275,5 @@ def sim_task_failed( sim_conf['fail cycle points'] is None # i.e. "all" or point in sim_conf['fail cycle points'] ) and ( - try_num == 0 or not sim_conf['fail try 1 only'] + try_num == 1 or not sim_conf['fail try 1 only'] ) diff --git a/cylc/flow/task_job_mgr.py b/cylc/flow/task_job_mgr.py index 58677b6c1c1..47cf419745c 100644 --- a/cylc/flow/task_job_mgr.py +++ b/cylc/flow/task_job_mgr.py @@ -1002,11 +1002,11 @@ def _simulation_submit_task_jobs(self, itasks, workflow): now_str = get_time_string_from_unix_time(now) for itask in itasks: itask.summary['started_time'] = now + self._set_retry_timers(itask, itask.tdef.rtconfig) itask.mode_settings = ModeSettings( itask, self.task_events_mgr.broadcast_mgr) itask.waiting_on_job_prep = False itask.submit_num += 1 - self._set_retry_timers(itask) itask.platform = {'name': 'SIMULATION'} itask.summary['job_runner_name'] = 'SIMULATION' @@ -1020,8 +1020,12 @@ def _simulation_submit_task_jobs(self, itasks, workflow): itask, INFO, TASK_OUTPUT_SUBMITTED, ) self.workflow_db_mgr.put_insert_task_jobs( - itask, {'time_submit': now_str}) - + itask, { + 'time_submit': now_str, + 'try_num': itask.get_try_num(), + } + ) + self.workflow_db_mgr.process_queued_ops() return itasks def _submit_task_jobs_callback(self, ctx, workflow, itasks): diff --git a/tests/integration/test_simulation.py b/tests/integration/test_simulation.py index 07d023dec53..cab7b5ec90b 100644 --- a/tests/integration/test_simulation.py +++ b/tests/integration/test_simulation.py @@ -184,7 +184,7 @@ def test_fail_once(sim_time_check_setup, itask, point, results, monkeypatch): ISO8601Point(point), itask) for i, result in enumerate(results): - itask.try_timers['execution-retry'].num = i - 1 + itask.try_timers['execution-retry'].num = i schd.task_job_mgr._simulation_submit_task_jobs( [itask], schd.workflow) assert itask.mode_settings.sim_task_fails is result @@ -256,7 +256,7 @@ def test_task_sped_up(sim_time_check_setup, monkeytime): ) is True -async def test_simulation_mode_settings_restart( +async def test_settings_restart( monkeytime, flow, scheduler, start ): """Check that simulation mode settings are correctly restored @@ -286,22 +286,19 @@ async def test_simulation_mode_settings_restart( } }) schd = scheduler(id_) - msg_q = Queue() # Start the workflow: async with start(schd): - # Pick the task proxy, Mock its start time, set state to running: itask = schd.pool.get_tasks()[0] - itask.summary['started_time'] = 0 - itask.state.status = 'running' - - # Submit it, then mock the wallclock and assert that it's not finshed. schd.task_job_mgr._simulation_submit_task_jobs( [itask], schd.workflow) - monkeytime(0) + og_timeout = itask.mode_settings.timeout + + # Mock wallclock < sim end timeout + monkeytime(itask.mode_settings.timeout - 1) assert sim_time_check( - msg_q, [itask], schd.task_events_mgr.broadcast_mgr, + schd.message_queue, [itask], schd.task_events_mgr.broadcast_mgr, schd.workflow_db_mgr ) is False @@ -310,60 +307,27 @@ async def test_simulation_mode_settings_restart( async with start(schd): # Get our tasks and fix wallclock: itask = schd.pool.get_tasks()[0] - monkeytime(12) - itask.state.status = 'running' # Check that we haven't got started time & mode settings back: assert itask.summary['started_time'] is None assert itask.mode_settings is None - # Set the start time in the database to 0 to make the - # test simpler: - schd.workflow_db_mgr.put_insert_task_jobs( - itask, {'time_submit': '1970-01-01T00:00:00Z'}) - schd.workflow_db_mgr.process_queued_ops() - # Set the current time: - monkeytime(12) + monkeytime(og_timeout - 1) assert sim_time_check( - msg_q, [itask], schd.task_events_mgr.broadcast_mgr, + schd.message_queue, [itask], schd.task_events_mgr.broadcast_mgr, schd.workflow_db_mgr ) is False # Check that the itask.mode_settings is now re-created assert itask.mode_settings.__dict__ == { 'simulated_run_length': 60.0, - 'sim_task_fails': False, - 'timeout': 60.0 - } - - # Set the current time > timeout - monkeytime(61) - assert sim_time_check( - msg_q, [itask], schd.task_events_mgr.broadcast_mgr, - schd.workflow_db_mgr - ) is True - - assert itask.mode_settings is None - - schd.task_events_mgr.broadcast_mgr.put_broadcast( - ['1066'], ['one'], [{ - 'execution time limit': 'PT1S'}]) - - assert itask.mode_settings is None - - schd.task_job_mgr._simulation_submit_task_jobs( - [itask], schd.workflow) - - assert itask.submit_num == 2 - assert itask.mode_settings.__dict__ == { - 'simulated_run_length': 1.0, - 'sim_task_fails': False, - 'timeout': 62.0 + 'sim_task_fails': True, + 'timeout': float(int(og_timeout)) } -async def test_simulation_mode_settings_reload( +async def test_settings_reload( flow, scheduler, start, run_simjob ): """Check that simulation mode settings are changed for future @@ -374,9 +338,7 @@ async def test_simulation_mode_settings_reload( 'scheduler': {'cycle point format': '%Y'}, 'scheduling': { 'initial cycle point': '1066', - 'graph': { - 'R1': 'one' - } + 'graph': {'R1': 'one'} }, 'runtime': { 'one': { @@ -405,3 +367,57 @@ async def test_simulation_mode_settings_reload( # Submit second psuedo-job and "run" to success: assert run_simjob(schd) == 'succeeded' + + +async def test_settings_broadcast( + flow, scheduler, start, complete, monkeytime +): + """Assert that broadcasting a change in the settings for a task + affects subsequent psuedo-submissions. + """ + id_ = flow({ + 'scheduler': {'cycle point format': '%Y'}, + 'scheduling': { + 'initial cycle point': '1066', + 'graph': {'R1': 'one'} + }, + 'runtime': { + 'one': { + 'execution time limit': 'PT1S', + 'simulation': { + 'speedup factor': 1, + 'fail cycle points': '1066', + } + }, + } + }, defaults=False) + schd = scheduler(id_, paused_start=False, run_mode='simulation') + async with start(schd): + itask = schd.pool.get_tasks()[0] + itask.state.is_queued = False + + # Submit the first - the sim task will fail: + schd.task_job_mgr._simulation_submit_task_jobs( + [itask], schd.workflow) + assert itask.mode_settings.sim_task_fails is True + + # Let task finish. + monkeytime(itask.mode_settings.timeout + 1) + assert sim_time_check( + schd.message_queue, [itask], schd.task_events_mgr.broadcast_mgr, + schd.workflow_db_mgr + ) is True + + # The mode_settings object has been cleared: + assert itask.mode_settings is None + + # Change a setting using broadcast: + schd.task_events_mgr.broadcast_mgr.put_broadcast( + ['1066'], ['one'], [{ + 'simulation': {'fail cycle points': ''} + }]) + + # Submit again - result is different: + schd.task_job_mgr._simulation_submit_task_jobs( + [itask], schd.workflow) + assert itask.mode_settings.sim_task_fails is False From a6f7606dee58bb6d110805a7fb629b6373c7fc78 Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Tue, 6 Feb 2024 14:13:15 +0000 Subject: [PATCH 11/51] small test fix --- cylc/flow/simulation.py | 2 -- tests/integration/tui/test_updater.py | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cylc/flow/simulation.py b/cylc/flow/simulation.py index 164b4cbeece..7a22e8a9a78 100644 --- a/cylc/flow/simulation.py +++ b/cylc/flow/simulation.py @@ -95,8 +95,6 @@ def __init__( itask.point, try_num or itask.get_try_num() ) - from cylc.flow import LOG - LOG.critical(try_num or itask.get_try_num()) self.timeout = started_time + self.simulated_run_length diff --git a/tests/integration/tui/test_updater.py b/tests/integration/tui/test_updater.py index 2d9927ca5eb..3fa310b6966 100644 --- a/tests/integration/tui/test_updater.py +++ b/tests/integration/tui/test_updater.py @@ -19,6 +19,7 @@ from pathlib import Path from queue import Queue import re +from time import time from async_timeout import timeout import pytest @@ -165,6 +166,7 @@ async def test_filters(one_conf, flow, scheduler, run, updater): async with run(one): # mark "1/a" as running and "1/b" as succeeded one_a = one.pool.get_task(IntegerPoint('1'), 'a') + one_a.summary['started_time'] = time() one_a.state_reset('running') one.data_store_mgr.delta_task_state(one_a) one.pool.get_task(IntegerPoint('1'), 'b').state_reset('succeeded') From 69b03c9294b27b852a1e58d30713c4ac779112ad Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Thu, 8 Feb 2024 10:11:42 +0000 Subject: [PATCH 12/51] Update cylc/flow/task_job_mgr.py Co-authored-by: Oliver Sanders --- cylc/flow/task_job_mgr.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cylc/flow/task_job_mgr.py b/cylc/flow/task_job_mgr.py index 47cf419745c..fae94ffc7cb 100644 --- a/cylc/flow/task_job_mgr.py +++ b/cylc/flow/task_job_mgr.py @@ -1025,7 +1025,6 @@ def _simulation_submit_task_jobs(self, itasks, workflow): 'try_num': itask.get_try_num(), } ) - self.workflow_db_mgr.process_queued_ops() return itasks def _submit_task_jobs_callback(self, ctx, workflow, itasks): From 2665d425d9d794963041bd88354f20625015da68 Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Fri, 9 Feb 2024 14:22:10 +0000 Subject: [PATCH 13/51] Apply suggestions from code review Co-authored-by: Ronnie Dutta <61982285+MetRonnie@users.noreply.github.com> --- changes.d/5721.feat.md | 2 +- cylc/flow/task_job_mgr.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/changes.d/5721.feat.md b/changes.d/5721.feat.md index 8562337c063..bae6667eb34 100644 --- a/changes.d/5721.feat.md +++ b/changes.d/5721.feat.md @@ -1 +1 @@ -Allow users to broadcast run_mode to tasks. \ No newline at end of file +Allow users to broadcast simulation mode settings to tasks. \ No newline at end of file diff --git a/cylc/flow/task_job_mgr.py b/cylc/flow/task_job_mgr.py index fae94ffc7cb..e943a106eba 100644 --- a/cylc/flow/task_job_mgr.py +++ b/cylc/flow/task_job_mgr.py @@ -1002,7 +1002,7 @@ def _simulation_submit_task_jobs(self, itasks, workflow): now_str = get_time_string_from_unix_time(now) for itask in itasks: itask.summary['started_time'] = now - self._set_retry_timers(itask, itask.tdef.rtconfig) + self._set_retry_timers(itask) itask.mode_settings = ModeSettings( itask, self.task_events_mgr.broadcast_mgr) itask.waiting_on_job_prep = False From 89e50418aeb9b3abcddb1b669ea4746e7e02cde6 Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Fri, 9 Feb 2024 14:42:07 +0000 Subject: [PATCH 14/51] response to review --- .../functional/modes/04-simulation-runtime.t | 4 +++ tests/integration/test_simulation.py | 26 +++++-------------- 2 files changed, 11 insertions(+), 19 deletions(-) diff --git a/tests/functional/modes/04-simulation-runtime.t b/tests/functional/modes/04-simulation-runtime.t index 5137e4cbe0d..1ce3850f202 100644 --- a/tests/functional/modes/04-simulation-runtime.t +++ b/tests/functional/modes/04-simulation-runtime.t @@ -34,12 +34,16 @@ run_ok "second-task-broadcast-too-long" \ -n second_task \ -s 'execution time limit = P1D' +poll_grep 'Broadcast set:' "${SCHD_LOG}" + # Test cancelling broadcast changes the task config back. run_ok "cancel-second-task-broadcast" \ cylc broadcast "${WORKFLOW_NAME}" \ -n second_task\ --clear +poll_grep 'Broadcast cancelled:' "${SCHD_LOG}" + # If we speed up the simulated first_task task we # can make it finish before workflow timeout # (neither change will do this on its own): diff --git a/tests/integration/test_simulation.py b/tests/integration/test_simulation.py index cab7b5ec90b..798ac397c5d 100644 --- a/tests/integration/test_simulation.py +++ b/tests/integration/test_simulation.py @@ -54,22 +54,8 @@ def run_simjob(monkeytime): Returns the output status. """ - def _inner(schd, point=None, task=None): - # Get the only task proxy, submit the psuedo job: - if task and point: - itask = schd.pool.get_task(point, task) - elif task or point: - raise UsageError( - 'run_simjob requires either a task and point, or neither.') - else: - itasks = schd.pool.get_tasks() - if len(itasks) != 1: - raise UsageError( - 'run_simjob cannot needs a task and point if more ' - 'than one task is in the task pool.') - else: - itask, = itasks - + def _run_simjob(schd, point, task): + itask = schd.pool.get_task(point, task) itask.state.is_queued = False monkeytime(0) schd.task_job_mgr._simulation_submit_task_jobs( @@ -86,7 +72,7 @@ def _inner(schd, point=None, task=None): out = schd.message_queue.queue[0].message schd.process_queued_task_messages() return out - return _inner + return _run_simjob @pytest.fixture(scope='module') @@ -355,7 +341,8 @@ async def test_settings_reload( schd = scheduler(id_) async with start(schd): # Submit first psuedo-job and "run" to failure: - assert run_simjob(schd) == 'failed' + one_1066 = schd.pool.get_tasks()[0] + assert run_simjob(schd, one_1066.point, 'one') == 'failed' # Modify config as if reinstall had taken place: conf_file = Path(schd.workflow_run_dir) / 'flow.cylc' @@ -366,7 +353,8 @@ async def test_settings_reload( await schd.command_reload_workflow() # Submit second psuedo-job and "run" to success: - assert run_simjob(schd) == 'succeeded' + assert run_simjob( + schd, one_1066.point, 'one') == 'succeeded' async def test_settings_broadcast( From b097d33d28084d1686d86ac529723abe85a13a50 Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Mon, 12 Feb 2024 14:41:15 +0000 Subject: [PATCH 15/51] Pass `task_events_manager` wholesale to sim_time_check. Submit task messages for sim mode outcomes using `task_events_manager` rather than resolvers, to reduce the risk of ordering problems. Refactored tests to make them work with the above. --- cylc/flow/scheduler.py | 3 +- cylc/flow/simulation.py | 25 ++++------ tests/integration/test_simulation.py | 75 ++++++++++++---------------- 3 files changed, 42 insertions(+), 61 deletions(-) diff --git a/cylc/flow/scheduler.py b/cylc/flow/scheduler.py index 64c58cd1a01..cd75f4dedac 100644 --- a/cylc/flow/scheduler.py +++ b/cylc/flow/scheduler.py @@ -1747,9 +1747,8 @@ async def main_loop(self) -> None: if ( self.pool.config.run_mode('simulation') and sim_time_check( - self.message_queue, + self.task_events_mgr, self.pool.get_tasks(), - self.task_events_mgr.broadcast_mgr, self.workflow_db_mgr, ) ): diff --git a/cylc/flow/simulation.py b/cylc/flow/simulation.py index 7a22e8a9a78..5496eaa51e2 100644 --- a/cylc/flow/simulation.py +++ b/cylc/flow/simulation.py @@ -21,7 +21,6 @@ from time import time from cylc.flow.cycling.loader import get_point -from cylc.flow.network.resolvers import TaskMsg from cylc.flow.parsec.util import ( pdeepcopy, poverride @@ -38,7 +37,6 @@ from metomi.isodatetime.parsers import DurationParser if TYPE_CHECKING: - from queue import Queue from cylc.flow.broadcast_mgr import BroadcastMgr from cylc.flow.cycling import PointBase from cylc.flow.task_proxy import TaskProxy @@ -213,9 +211,8 @@ def parse_fail_cycle_points( def sim_time_check( - message_queue: 'Queue[TaskMsg]', + task_events_manager, #: 'TaskEventsMgr', itasks: 'List[TaskProxy]', - broadcast_mgr: 'BroadcastMgr', db_mgr: 'WorkflowDatabaseManager', ) -> bool: """Check if sim tasks have been "running" for as long as required. @@ -227,31 +224,27 @@ def sim_time_check( """ now = time() sim_task_state_changed: bool = False - for itask in itasks: if itask.state.status != TASK_STATUS_RUNNING: continue if itask.mode_settings is None: - itask.mode_settings = ModeSettings(itask, broadcast_mgr, db_mgr) + itask.mode_settings = ModeSettings( + itask, task_events_manager.broadcast_mgr, db_mgr) if now > itask.mode_settings.timeout: - job_d = itask.tokens.duplicate(job=str(itask.get_try_num())) - now_str = get_current_time_string() - if itask.mode_settings.sim_task_fails: - message_queue.put( - TaskMsg(job_d, now_str, 'CRITICAL', TASK_STATUS_FAILED) + task_events_manager.process_message( + itask, 'CRITICAL', TASK_STATUS_FAILED ) else: - message_queue.put( - TaskMsg(job_d, now_str, 'DEBUG', TASK_STATUS_SUCCEEDED) + task_events_manager.process_message( + itask, 'DEBUG', TASK_STATUS_SUCCEEDED ) - # Simulate message outputs. for msg in itask.tdef.rtconfig['outputs'].values(): - message_queue.put( - TaskMsg(job_d, now_str, 'DEBUG', msg) + task_events_manager.process_message( + itask, 'DEBUG', msg ) # We've finished this psuedojob, so delete all the mode settings. diff --git a/tests/integration/test_simulation.py b/tests/integration/test_simulation.py index 798ac397c5d..3c11e59603f 100644 --- a/tests/integration/test_simulation.py +++ b/tests/integration/test_simulation.py @@ -16,8 +16,7 @@ from pathlib import Path import pytest -from pytest import UsageError, param -from queue import Queue +from pytest import param from cylc.flow.cycling.iso8601 import ISO8601Point from cylc.flow.simulation import sim_time_check @@ -64,14 +63,12 @@ def _run_simjob(schd, point, task): # Run Time Check assert sim_time_check( - schd.message_queue, [itask], schd.task_events_mgr.broadcast_mgr, + schd.task_events_mgr, [itask], schd.workflow_db_mgr ) is True # Capture result process queue. - out = schd.message_queue.queue[0].message - schd.process_queued_task_messages() - return out + return itask return _run_simjob @@ -115,24 +112,21 @@ async def sim_time_check_setup( } } })) - msg_q = Queue() async with mod_start(schd): itasks = schd.pool.get_tasks() [schd.task_job_mgr._set_retry_timers(i) for i in itasks] - yield schd, itasks, msg_q + yield schd, itasks def test_false_if_not_running( sim_time_check_setup, monkeypatch, q_clean ): - schd, itasks, msg_q = sim_time_check_setup + schd, itasks = sim_time_check_setup itasks = [i for i in itasks if i.state.status != 'running'] # False if task status not running: - assert sim_time_check( - msg_q, itasks, schd.task_events_mgr.broadcast_mgr, '' - ) is False + assert sim_time_check(schd.task_events_mgr, itasks, '') is False @pytest.mark.parametrize( @@ -164,7 +158,7 @@ def test_fail_once(sim_time_check_setup, itask, point, results, monkeypatch): """A task with a fail cycle point only fails at that cycle point, and then only on the first submission. """ - schd, _, msg_q = sim_time_check_setup + schd, _ = sim_time_check_setup itask = schd.pool.get_task( ISO8601Point(point), itask) @@ -176,7 +170,7 @@ def test_fail_once(sim_time_check_setup, itask, point, results, monkeypatch): assert itask.mode_settings.sim_task_fails is result -def test_task_finishes(sim_time_check_setup, monkeytime, q_clean): +def test_task_finishes(sim_time_check_setup, monkeytime, caplog): """...and an appropriate message sent. Checks that failed and bar are output if a task is set to fail. @@ -184,10 +178,7 @@ def test_task_finishes(sim_time_check_setup, monkeytime, q_clean): Does NOT check every possible cause of an outcome - this is done in unit tests. """ - schd, _, msg_q = sim_time_check_setup - - q_clean(msg_q) - + schd, _ = sim_time_check_setup monkeytime(0) # Setup a task to fail, submit it. @@ -202,24 +193,24 @@ def test_task_finishes(sim_time_check_setup, monkeytime, q_clean): fail_all_1066.summary['started_time'] = 0 # Before simulation time is up: - assert sim_time_check( - msg_q, [fail_all_1066], schd.task_events_mgr.broadcast_mgr, '' - ) is False + assert sim_time_check(schd.task_events_mgr, [fail_all_1066], '') is False # Time's up... monkeytime(12) # After simulation time is up it Fails and records custom outputs: - assert sim_time_check( - msg_q, [fail_all_1066], schd.task_events_mgr.broadcast_mgr, '' - ) is True - assert sorted(i.message for i in msg_q.queue) == ['bar', 'failed'] + assert sim_time_check(schd.task_events_mgr, [fail_all_1066], '') is True + outputs = { + o[0]: (o[1], o[2]) for o in fail_all_1066.state.outputs.get_all()} + assert outputs['succeeded'] == ('succeeded', False) + assert outputs['foo'] == ('bar', True) + assert outputs['failed'] == ('failed', True) def test_task_sped_up(sim_time_check_setup, monkeytime): """Task will speed up by a factor set in config.""" - schd, _, msg_q = sim_time_check_setup + schd, _ = sim_time_check_setup fast_forward_1066 = schd.pool.get_task( ISO8601Point('1066'), 'fast_forward') @@ -229,17 +220,14 @@ def test_task_sped_up(sim_time_check_setup, monkeytime): [fast_forward_1066], schd.workflow) fast_forward_1066.state.is_queued = False - assert sim_time_check( - msg_q, [fast_forward_1066], schd.task_events_mgr.broadcast_mgr, '' - ) is False + result = sim_time_check(schd.task_events_mgr, [fast_forward_1066], '') + assert result is False monkeytime(29) - assert sim_time_check( - msg_q, [fast_forward_1066], schd.task_events_mgr.broadcast_mgr, '' - ) is False + result = sim_time_check(schd.task_events_mgr, [fast_forward_1066], '') + assert result is False monkeytime(31) - assert sim_time_check( - msg_q, [fast_forward_1066], schd.task_events_mgr.broadcast_mgr, '' - ) is True + result = sim_time_check(schd.task_events_mgr, [fast_forward_1066], '') + assert result is True async def test_settings_restart( @@ -284,8 +272,7 @@ async def test_settings_restart( # Mock wallclock < sim end timeout monkeytime(itask.mode_settings.timeout - 1) assert sim_time_check( - schd.message_queue, [itask], schd.task_events_mgr.broadcast_mgr, - schd.workflow_db_mgr + schd.task_events_mgr, [itask], schd.workflow_db_mgr ) is False # Stop and restart the scheduler: @@ -301,8 +288,7 @@ async def test_settings_restart( # Set the current time: monkeytime(og_timeout - 1) assert sim_time_check( - schd.message_queue, [itask], schd.task_events_mgr.broadcast_mgr, - schd.workflow_db_mgr + schd.task_events_mgr, [itask], schd.workflow_db_mgr ) is False # Check that the itask.mode_settings is now re-created @@ -342,7 +328,9 @@ async def test_settings_reload( async with start(schd): # Submit first psuedo-job and "run" to failure: one_1066 = schd.pool.get_tasks()[0] - assert run_simjob(schd, one_1066.point, 'one') == 'failed' + + itask = run_simjob(schd, one_1066.point, 'one') + assert ['failed', 'failed', False] in itask.state.outputs.get_all() # Modify config as if reinstall had taken place: conf_file = Path(schd.workflow_run_dir) / 'flow.cylc' @@ -353,8 +341,9 @@ async def test_settings_reload( await schd.command_reload_workflow() # Submit second psuedo-job and "run" to success: - assert run_simjob( - schd, one_1066.point, 'one') == 'succeeded' + itask = run_simjob(schd, one_1066.point, 'one') + assert [ + 'succeeded', 'succeeded', True] in itask.state.outputs.get_all() async def test_settings_broadcast( @@ -392,7 +381,7 @@ async def test_settings_broadcast( # Let task finish. monkeytime(itask.mode_settings.timeout + 1) assert sim_time_check( - schd.message_queue, [itask], schd.task_events_mgr.broadcast_mgr, + schd.task_events_mgr, [itask], schd.workflow_db_mgr ) is True From 7217c0051240b923b5933c2f74cf3459061c8dbd Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Tue, 13 Feb 2024 10:00:31 +0000 Subject: [PATCH 16/51] Rationalize existing tests and add a test for a bug discovered by Ronnie --- tests/functional/modes/05-sim-trigger.t | 44 +++++++++++++++++++ .../functional/modes/05-sim-trigger/flow.cylc | 12 +++++ .../modes/05-sim-trigger/reference.log | 2 + tests/integration/test_simulation.py | 20 +-------- 4 files changed, 60 insertions(+), 18 deletions(-) create mode 100644 tests/functional/modes/05-sim-trigger.t create mode 100644 tests/functional/modes/05-sim-trigger/flow.cylc create mode 100644 tests/functional/modes/05-sim-trigger/reference.log diff --git a/tests/functional/modes/05-sim-trigger.t b/tests/functional/modes/05-sim-trigger.t new file mode 100644 index 00000000000..27c77ee2c7e --- /dev/null +++ b/tests/functional/modes/05-sim-trigger.t @@ -0,0 +1,44 @@ +#!/usr/bin/env bash +# THIS FILE IS PART OF THE CYLC WORKFLOW ENGINE. +# Copyright (C) NIWA & British Crown (Met Office) & Contributors. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +# Test that we can broadcast an alteration to simulation mode. + +. "$(dirname "$0")/test_header" +set_test_number 4 + +install_workflow "${TEST_NAME_BASE}" "${TEST_NAME_BASE}" +run_ok "${TEST_NAME_BASE}-validate" cylc validate "${WORKFLOW_NAME}" +workflow_run_ok "${TEST_NAME_BASE}-start" \ + cylc play "${WORKFLOW_NAME}" --mode=simulation +SCHD_LOG="${WORKFLOW_RUN_DIR}/log/scheduler/log" + +# Wait for the workflow to stall, then check for first task failure: +poll_grep_workflow_log 'stall timer starts' +grep_ok \ + '\[1/fail_fail_fail running job:01 flows:1\] => failed' \ + "${SCHD_LOG}" + +# Trigger task again, wait for it to send a warning and check that it +# too has failed: +cylc trigger "${WORKFLOW_NAME}//1/fail_fail_fail" +poll_grep_workflow_log \ + 'job:02.*did not complete' +grep_ok \ + '\[1/fail_fail_fail running job:02 flows:1\] => failed' \ + "${SCHD_LOG}" + +purge diff --git a/tests/functional/modes/05-sim-trigger/flow.cylc b/tests/functional/modes/05-sim-trigger/flow.cylc new file mode 100644 index 00000000000..2220e958e00 --- /dev/null +++ b/tests/functional/modes/05-sim-trigger/flow.cylc @@ -0,0 +1,12 @@ +[scheduling] + [[graph]] + R1 = fail_fail_fail => bar + +[runtime] + [[root, bar]] + [[[simulation]]] + default run length = PT0S + [[fail_fail_fail]] + [[[simulation]]] + fail cycle points = all + diff --git a/tests/functional/modes/05-sim-trigger/reference.log b/tests/functional/modes/05-sim-trigger/reference.log new file mode 100644 index 00000000000..2d14bc201fb --- /dev/null +++ b/tests/functional/modes/05-sim-trigger/reference.log @@ -0,0 +1,2 @@ +23590101T0000Z/get_observations -triggered off [] in flow 1 +23590101T0000Z/get_observations -triggered off [] in flow 1 diff --git a/tests/integration/test_simulation.py b/tests/integration/test_simulation.py index 3c11e59603f..8191608fe28 100644 --- a/tests/integration/test_simulation.py +++ b/tests/integration/test_simulation.py @@ -22,12 +22,6 @@ from cylc.flow.simulation import sim_time_check -def get_msg_queue_item(queue, id_): - for item in queue.queue: - if id_ in str(item.job_id): - return item - - @pytest.fixture def monkeytime(monkeypatch): """Convenience function monkeypatching time.""" @@ -37,16 +31,6 @@ def _inner(time_: int): return _inner -@pytest.fixture -def q_clean(): - """Clear message queue to revent test interference. - """ - def _inner(msg_q): - if not msg_q.empty(): - msg_q.get() - return _inner - - @pytest.fixture def run_simjob(monkeytime): """Run a simulated job to completion. @@ -119,7 +103,7 @@ async def sim_time_check_setup( def test_false_if_not_running( - sim_time_check_setup, monkeypatch, q_clean + sim_time_check_setup, monkeypatch ): schd, itasks = sim_time_check_setup @@ -347,7 +331,7 @@ async def test_settings_reload( async def test_settings_broadcast( - flow, scheduler, start, complete, monkeytime + flow, scheduler, start, monkeytime ): """Assert that broadcasting a change in the settings for a task affects subsequent psuedo-submissions. From 970fa1da5f4f70ac7d8d5033d0e871d79fb1779f Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Tue, 13 Feb 2024 10:47:36 +0000 Subject: [PATCH 17/51] Update tests/functional/modes/05-sim-trigger.t --- tests/functional/modes/05-sim-trigger.t | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/functional/modes/05-sim-trigger.t b/tests/functional/modes/05-sim-trigger.t index 27c77ee2c7e..272a95c2bc6 100644 --- a/tests/functional/modes/05-sim-trigger.t +++ b/tests/functional/modes/05-sim-trigger.t @@ -15,7 +15,7 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . -# Test that we can broadcast an alteration to simulation mode. +# Test that we can re-trigger a task in sim mode . "$(dirname "$0")/test_header" set_test_number 4 From 312cf28235fa274a7672ef3a5a714dc6754f2ad2 Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Tue, 13 Feb 2024 10:51:54 +0000 Subject: [PATCH 18/51] add a warning that fail try 1 only does not change on resubmit, only retry --- cylc/flow/cfgspec/workflow.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cylc/flow/cfgspec/workflow.py b/cylc/flow/cfgspec/workflow.py index 6a951ae02fb..bcd1479ce76 100644 --- a/cylc/flow/cfgspec/workflow.py +++ b/cylc/flow/cfgspec/workflow.py @@ -1288,6 +1288,12 @@ def get_script_common_text(this: str, example: Optional[str] = None): Task instances must be set to fail by :cylc:conf:`[..]fail cycle points`. + + .. warning:: + + This setting is designed for use with automatic + retries. Subsequent manual submissions will not + change the outcome of the task. ''') Conf('disable task event handlers', VDR.V_BOOLEAN, True, desc=''' From e2392afd597d957344cacb74e6323738f15e728e Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Tue, 13 Feb 2024 11:00:43 +0000 Subject: [PATCH 19/51] Update cylc/flow/simulation.py Co-authored-by: Ronnie Dutta <61982285+MetRonnie@users.noreply.github.com> --- cylc/flow/simulation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cylc/flow/simulation.py b/cylc/flow/simulation.py index 5496eaa51e2..1cafa0174a5 100644 --- a/cylc/flow/simulation.py +++ b/cylc/flow/simulation.py @@ -211,7 +211,7 @@ def parse_fail_cycle_points( def sim_time_check( - task_events_manager, #: 'TaskEventsMgr', + task_events_manager: 'TaskEventsMgr', itasks: 'List[TaskProxy]', db_mgr: 'WorkflowDatabaseManager', ) -> bool: From 08f43450327cddb2255ab472bbaa7f2f9951990e Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Tue, 20 Feb 2024 08:29:46 +0000 Subject: [PATCH 20/51] Update cylc/flow/cfgspec/workflow.py Co-authored-by: Ronnie Dutta <61982285+MetRonnie@users.noreply.github.com> --- cylc/flow/cfgspec/workflow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cylc/flow/cfgspec/workflow.py b/cylc/flow/cfgspec/workflow.py index bcd1479ce76..643efd0ef81 100644 --- a/cylc/flow/cfgspec/workflow.py +++ b/cylc/flow/cfgspec/workflow.py @@ -1289,7 +1289,7 @@ def get_script_common_text(this: str, example: Optional[str] = None): Task instances must be set to fail by :cylc:conf:`[..]fail cycle points`. - .. warning:: + .. note:: This setting is designed for use with automatic retries. Subsequent manual submissions will not From 62d58976cbe8a03a50d5338d074562d6f88168ac Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Tue, 20 Feb 2024 09:15:01 +0000 Subject: [PATCH 21/51] fix flake8 issues --- cylc/flow/simulation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cylc/flow/simulation.py b/cylc/flow/simulation.py index 1cafa0174a5..e5c4aff1b02 100644 --- a/cylc/flow/simulation.py +++ b/cylc/flow/simulation.py @@ -31,14 +31,14 @@ TASK_STATUS_FAILED, TASK_STATUS_SUCCEEDED, ) -from cylc.flow.wallclock import ( - get_current_time_string, get_unix_time_from_time_string) +from cylc.flow.wallclock import get_unix_time_from_time_string from metomi.isodatetime.parsers import DurationParser if TYPE_CHECKING: from cylc.flow.broadcast_mgr import BroadcastMgr from cylc.flow.cycling import PointBase + from cylc.flow.task_events_mgr import TaskEventsManager from cylc.flow.task_proxy import TaskProxy from cylc.flow.workflow_db_mgr import WorkflowDatabaseManager @@ -211,7 +211,7 @@ def parse_fail_cycle_points( def sim_time_check( - task_events_manager: 'TaskEventsMgr', + task_events_manager: 'TaskEventsManager', itasks: 'List[TaskProxy]', db_mgr: 'WorkflowDatabaseManager', ) -> bool: From 3832e7ba47a53b2adaf791145679c1130674e192 Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Tue, 20 Feb 2024 10:28:51 +0000 Subject: [PATCH 22/51] refactor test broken by sim-mode change --- .../flakyfunctional/cylc-get-config/04-dummy-mode-output.t | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/flakyfunctional/cylc-get-config/04-dummy-mode-output.t b/tests/flakyfunctional/cylc-get-config/04-dummy-mode-output.t index 9fa85a4aec9..772ae5c14ba 100755 --- a/tests/flakyfunctional/cylc-get-config/04-dummy-mode-output.t +++ b/tests/flakyfunctional/cylc-get-config/04-dummy-mode-output.t @@ -47,8 +47,8 @@ delete_db workflow_run_ok "${TEST_NAME_BASE}-run-simulation" \ cylc play -m 'simulation' --reference-test --debug --no-detach "${WORKFLOW_NAME}" LOG="$(cylc log -m p "$WORKFLOW_NAME")" -count_ok '(received)meet' "${LOG}" 1 -count_ok '(received)greet' "${LOG}" 1 +count_ok '(internal)meet' "${LOG}" 1 +count_ok '(internal)greet' "${LOG}" 1 -purge +# purge exit From 8c3bf61407cd514d1871fef225303ab2bc837568 Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Tue, 20 Feb 2024 14:56:16 +0000 Subject: [PATCH 23/51] Update tests/flakyfunctional/cylc-get-config/04-dummy-mode-output.t Co-authored-by: Ronnie Dutta <61982285+MetRonnie@users.noreply.github.com> --- tests/flakyfunctional/cylc-get-config/04-dummy-mode-output.t | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/flakyfunctional/cylc-get-config/04-dummy-mode-output.t b/tests/flakyfunctional/cylc-get-config/04-dummy-mode-output.t index 772ae5c14ba..8acf159cc68 100755 --- a/tests/flakyfunctional/cylc-get-config/04-dummy-mode-output.t +++ b/tests/flakyfunctional/cylc-get-config/04-dummy-mode-output.t @@ -50,5 +50,5 @@ LOG="$(cylc log -m p "$WORKFLOW_NAME")" count_ok '(internal)meet' "${LOG}" 1 count_ok '(internal)greet' "${LOG}" 1 -# purge +purge exit From c8d5aea6da1fbd56dd662e5522ca00ce3327bf2d Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Mon, 26 Feb 2024 13:31:47 +0000 Subject: [PATCH 24/51] Apply suggestions from code review Co-authored-by: Oliver Sanders --- changes.d/5721.feat.md | 2 +- cylc/flow/simulation.py | 26 +++++++++++++++++++++++++- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/changes.d/5721.feat.md b/changes.d/5721.feat.md index bae6667eb34..fa4adf9c455 100644 --- a/changes.d/5721.feat.md +++ b/changes.d/5721.feat.md @@ -1 +1 @@ -Allow users to broadcast simulation mode settings to tasks. \ No newline at end of file +Allow task's run mode (i.e. "live", "simulation" or "dummy") to be changed dynamically at runtime using `cylc broadcast`. \ No newline at end of file diff --git a/cylc/flow/simulation.py b/cylc/flow/simulation.py index e5c4aff1b02..f30175ad231 100644 --- a/cylc/flow/simulation.py +++ b/cylc/flow/simulation.py @@ -48,6 +48,27 @@ class ModeSettings: """A store of state for simulation modes. Used instead of modifying the runtime config. + + Args: + itask: + The task proxy this submission relates to. + broadcast_mgr: + The broadcast manager is used to apply any runtime alterations + pre simulated submission. + db_mgr: + The database manager must be provided for simulated jobs + that are being resumed after workflow restart. It is used to extract + the original scheduled finish time for the job. + + Attrs: + simulated_run_length: + The length of time this simulated job will take to run in seconds. + timeout: + The wall-clock time at which this simulated job will finish as + a Unix epoch time. + sim_task_fails: + True, if this job is intended to fail when it finishes, else False. + """ simulated_run_length: float = 0.0 sim_task_fails: bool = False @@ -67,7 +88,10 @@ def __init__( if started_time is None and db_mgr: # Get DB info db_info = db_mgr.pub_dao.select_task_job( - *itask.tokens.relative_id.split("/")) + itask.tokens['cycle'], + itask.tokens['task'], + itask.tokens['job'], + ) # Get the started time: started_time = get_unix_time_from_time_string( From 42aa4b2fdc71e3246a047bbbb5e2f33fcfffab77 Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Mon, 26 Feb 2024 15:56:15 +0000 Subject: [PATCH 25/51] Fallback created for lack of start time in database. This is likely if someone upgrades a sim mode workflow part-way through a run. --- cylc/flow/simulation.py | 18 +++++--- tests/integration/test_simulation.py | 68 ++++++++++++++++++---------- 2 files changed, 56 insertions(+), 30 deletions(-) diff --git a/cylc/flow/simulation.py b/cylc/flow/simulation.py index f30175ad231..ada5847962b 100644 --- a/cylc/flow/simulation.py +++ b/cylc/flow/simulation.py @@ -48,7 +48,7 @@ class ModeSettings: """A store of state for simulation modes. Used instead of modifying the runtime config. - + Args: itask: The task proxy this submission relates to. @@ -59,7 +59,7 @@ class ModeSettings: The database manager must be provided for simulated jobs that are being resumed after workflow restart. It is used to extract the original scheduled finish time for the job. - + Attrs: simulated_run_length: The length of time this simulated job will take to run in seconds. @@ -94,9 +94,12 @@ def __init__( ) # Get the started time: - started_time = get_unix_time_from_time_string( - db_info["time_submit"]) - itask.summary['started_time'] = started_time + if db_info['time_submit']: + started_time = get_unix_time_from_time_string( + db_info["time_submit"]) + itask.summary['started_time'] = started_time + else: + started_time = time() # Get the try number: try_num = db_info["try_num"] @@ -254,7 +257,10 @@ def sim_time_check( if itask.mode_settings is None: itask.mode_settings = ModeSettings( - itask, task_events_manager.broadcast_mgr, db_mgr) + itask, + task_events_manager.broadcast_mgr, + db_mgr, + ) if now > itask.mode_settings.timeout: if itask.mode_settings.sim_task_fails: diff --git a/tests/integration/test_simulation.py b/tests/integration/test_simulation.py index 8191608fe28..f2fda2e5c3e 100644 --- a/tests/integration/test_simulation.py +++ b/tests/integration/test_simulation.py @@ -222,17 +222,22 @@ async def test_settings_restart( In the case of start time this is collected from the database from task_jobs.start_time. + + tasks: + one: Runs straighforwardly. + two: Test case where database is missing started_time + because it was upgraded from an earlier version of Cylc. """ id_ = flow({ 'scheduler': {'cycle point format': '%Y'}, 'scheduling': { 'initial cycle point': '1066', 'graph': { - 'R1': 'one' + 'R1': 'one & two' } }, 'runtime': { - 'one': { + 'root': { 'execution time limit': 'PT1M', 'execution retry delays': 'P0Y', 'simulation': { @@ -247,11 +252,12 @@ async def test_settings_restart( # Start the workflow: async with start(schd): - itask = schd.pool.get_tasks()[0] - schd.task_job_mgr._simulation_submit_task_jobs( - [itask], schd.workflow) + og_timeouts = {} + for itask in schd.pool.get_tasks(): + schd.task_job_mgr._simulation_submit_task_jobs( + [itask], schd.workflow) - og_timeout = itask.mode_settings.timeout + og_timeouts[itask.identity] = itask.mode_settings.timeout # Mock wallclock < sim end timeout monkeytime(itask.mode_settings.timeout - 1) @@ -263,24 +269,38 @@ async def test_settings_restart( schd = scheduler(id_) async with start(schd): # Get our tasks and fix wallclock: - itask = schd.pool.get_tasks()[0] - - # Check that we haven't got started time & mode settings back: - assert itask.summary['started_time'] is None - assert itask.mode_settings is None - - # Set the current time: - monkeytime(og_timeout - 1) - assert sim_time_check( - schd.task_events_mgr, [itask], schd.workflow_db_mgr - ) is False - - # Check that the itask.mode_settings is now re-created - assert itask.mode_settings.__dict__ == { - 'simulated_run_length': 60.0, - 'sim_task_fails': True, - 'timeout': float(int(og_timeout)) - } + itasks = schd.pool.get_tasks() + for itask in itasks: + + # Check that we haven't got started time & mode settings back: + assert itask.summary['started_time'] is None + assert itask.mode_settings is None + + if itask.identity == '1066/two': + # Delete the database entry for `two`: Ensure that + # we don't break sim mode on upgrade to this version of Cylc. + schd.workflow_db_mgr.pub_dao.connect().execute( + 'UPDATE task_jobs' + '\n SET time_submit = NULL' + '\n WHERE (name == \'two\')' + ) + schd.workflow_db_mgr.process_queued_ops() + monkeytime(42) + expected_timeout = 102.0 + else: + monkeytime(og_timeouts[itask.identity] - 1) + expected_timeout = float(int(og_timeouts[itask.identity])) + + assert sim_time_check( + schd.task_events_mgr, [itask], schd.workflow_db_mgr + ) is False + + # Check that the itask.mode_settings is now re-created + assert itask.mode_settings.__dict__ == { + 'simulated_run_length': 60.0, + 'sim_task_fails': True, + 'timeout': expected_timeout + } async def test_settings_reload( From 42ad8cc65ad34edbee93f19396f4a9af74306595 Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Tue, 27 Feb 2024 09:18:52 +0000 Subject: [PATCH 26/51] Ensure that broadcasts to fail cycle points triggers a re-parse of fail cycle points. Check other items in simulation mode settings for similar bugs. --- cylc/flow/simulation.py | 11 ++++++++--- tests/integration/test_simulation.py | 11 +++++++++++ 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/cylc/flow/simulation.py b/cylc/flow/simulation.py index ada5847962b..0d8d732d807 100644 --- a/cylc/flow/simulation.py +++ b/cylc/flow/simulation.py @@ -20,6 +20,7 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union from time import time +from cylc.flow.cycling import PointBase from cylc.flow.cycling.loader import get_point from cylc.flow.parsec.util import ( pdeepcopy, @@ -37,7 +38,6 @@ if TYPE_CHECKING: from cylc.flow.broadcast_mgr import BroadcastMgr - from cylc.flow.cycling import PointBase from cylc.flow.task_events_mgr import TaskEventsManager from cylc.flow.task_proxy import TaskProxy from cylc.flow.workflow_db_mgr import WorkflowDatabaseManager @@ -57,8 +57,8 @@ class ModeSettings: pre simulated submission. db_mgr: The database manager must be provided for simulated jobs - that are being resumed after workflow restart. It is used to extract - the original scheduled finish time for the job. + that are being resumed after workflow restart. It is used to + extract the original scheduled finish time for the job. Attrs: simulated_run_length: @@ -109,6 +109,11 @@ def __init__( if overrides: rtconfig = pdeepcopy(itask.tdef.rtconfig) poverride(rtconfig, overrides, prepend=True) + rtconfig["simulation"][ + "fail cycle points" + ] = parse_fail_cycle_points( + rtconfig["simulation"]["fail cycle points"] + ) else: rtconfig = itask.tdef.rtconfig diff --git a/tests/integration/test_simulation.py b/tests/integration/test_simulation.py index f2fda2e5c3e..4ee9a4225e0 100644 --- a/tests/integration/test_simulation.py +++ b/tests/integration/test_simulation.py @@ -402,3 +402,14 @@ async def test_settings_broadcast( schd.task_job_mgr._simulation_submit_task_jobs( [itask], schd.workflow) assert itask.mode_settings.sim_task_fails is False + + # Broadcast tasks will reparse correctly: + schd.task_events_mgr.broadcast_mgr.put_broadcast( + ['1066'], ['one'], [{ + 'simulation': {'fail cycle points': '1945, 1977, 1066'} + }]) + + # Submit again - result is different: + schd.task_job_mgr._simulation_submit_task_jobs( + [itask], schd.workflow) + assert itask.mode_settings.sim_task_fails is True From 38db5dc36f90f56ea6ae12368fd82d869bcb6285 Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Tue, 27 Feb 2024 09:31:15 +0000 Subject: [PATCH 27/51] Use itask summary started time as sole arbiter of simulation tasks which have been submitted before restart. --- cylc/flow/simulation.py | 7 ++++--- cylc/flow/task_job_mgr.py | 5 ++++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/cylc/flow/simulation.py b/cylc/flow/simulation.py index 0d8d732d807..bbca671aaba 100644 --- a/cylc/flow/simulation.py +++ b/cylc/flow/simulation.py @@ -20,7 +20,6 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union from time import time -from cylc.flow.cycling import PointBase from cylc.flow.cycling.loader import get_point from cylc.flow.parsec.util import ( pdeepcopy, @@ -41,6 +40,7 @@ from cylc.flow.task_events_mgr import TaskEventsManager from cylc.flow.task_proxy import TaskProxy from cylc.flow.workflow_db_mgr import WorkflowDatabaseManager + from cylc.flow.cycling import PointBase @dataclass @@ -78,14 +78,14 @@ def __init__( self, itask: 'TaskProxy', broadcast_mgr: 'BroadcastMgr', - db_mgr: 'Optional[WorkflowDatabaseManager]' = None + db_mgr: 'Optional[WorkflowDatabaseManager]', ): # itask.summary['started_time'] and mode_settings.timeout need # repopulating from the DB on workflow restart: started_time = itask.summary['started_time'] try_num = None - if started_time is None and db_mgr: + if started_time is None: # Get DB info db_info = db_mgr.pub_dao.select_task_job( itask.tokens['cycle'], @@ -260,6 +260,7 @@ def sim_time_check( if itask.state.status != TASK_STATUS_RUNNING: continue + # This occurs if the workflow has been restarted. if itask.mode_settings is None: itask.mode_settings = ModeSettings( itask, diff --git a/cylc/flow/task_job_mgr.py b/cylc/flow/task_job_mgr.py index e943a106eba..936bc38d0fc 100644 --- a/cylc/flow/task_job_mgr.py +++ b/cylc/flow/task_job_mgr.py @@ -1004,7 +1004,10 @@ def _simulation_submit_task_jobs(self, itasks, workflow): itask.summary['started_time'] = now self._set_retry_timers(itask) itask.mode_settings = ModeSettings( - itask, self.task_events_mgr.broadcast_mgr) + itask, + self.task_events_mgr.broadcast_mgr, + self.workflow_db_mgr + ) itask.waiting_on_job_prep = False itask.submit_num += 1 From 95d3dc48139cb409fb26317a12f93a3b0ef3aef2 Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Tue, 27 Feb 2024 15:48:56 +0000 Subject: [PATCH 28/51] ensure that broadcast checks and rejects unparsable fail cycle points --- cylc/flow/broadcast_mgr.py | 35 +++++++++++++++++++++++++--- tests/integration/test_simulation.py | 14 +++++++++++ 2 files changed, 46 insertions(+), 3 deletions(-) diff --git a/cylc/flow/broadcast_mgr.py b/cylc/flow/broadcast_mgr.py index 81504055052..0f9d68977d5 100644 --- a/cylc/flow/broadcast_mgr.py +++ b/cylc/flow/broadcast_mgr.py @@ -265,9 +265,23 @@ def put_broadcast( modified_settings = [] bad_point_strings = [] bad_namespaces = [] - + bad_fail_points = [] with self.lock: for setting in settings: + fail_cycle_points = setting.get( + 'simulation', {'fail cycle points': None}).get( + 'fail cycle points', None) + if fail_cycle_points: + from cylc.flow.simulation import parse_fail_cycle_points + try: + parse_fail_cycle_points( + [i.strip() for i in fail_cycle_points.split(',')]) + except PointParsingError as exc: + exc_msg = ':'.join(exc.args[0].split(':')[:-2]) + bad_fail_points.append( + f'{fail_cycle_points} : {exc_msg}') + continue + for point_string in point_strings: # Standardise the point and check its validity. bad_point = False @@ -303,13 +317,16 @@ def put_broadcast( # Log the broadcast self.workflow_db_mgr.put_broadcast(modified_settings) - LOG.info(get_broadcast_change_report(modified_settings)) + if modified_settings: + LOG.info(get_broadcast_change_report(modified_settings)) bad_options = {} if bad_point_strings: bad_options["point_strings"] = bad_point_strings if bad_namespaces: bad_options["namespaces"] = bad_namespaces + if bad_fail_points: + bad_options['fail_cycle_points'] = bad_fail_points if modified_settings: self.data_store_mgr.delta_broadcast() return modified_settings, bad_options @@ -322,7 +339,13 @@ def _cancel_keys_in_prunes(prunes, cancel_keys): @classmethod def _get_bad_options( - cls, prunes, point_strings, namespaces, cancel_keys_list): + cls, + prunes, + point_strings, + namespaces, + fail_points, + cancel_keys_list + ): """Return unpruned namespaces and/or point_strings options.""" cancel_keys_list = [ tuple(cancel_keys) for cancel_keys in cancel_keys_list] @@ -334,6 +357,7 @@ def _get_bad_options( for opt_name, opt_list, opt_test in [ ("point_strings", point_strings, cls._point_string_in_prunes), ("namespaces", namespaces, cls._namespace_in_prunes), + ('fail_points', fail_points, cls._fail_points_in_prunes), ("cancel", cancel_keys_list, cls._cancel_keys_in_prunes)]: if opt_list: bad_options[opt_name] = set(opt_list) @@ -359,6 +383,11 @@ def _point_string_in_prunes(prunes, point_string): """Is point_string pruned?""" return point_string in [prune[0] for prune in prunes] + @staticmethod + def _fail_points_in_prunes(prunes, fail_points): + """Is point_string pruned?""" + return fail_points in [prune[0] for prune in prunes] + def _prune(self): """Remove empty leaves left by unsetting broadcast values. diff --git a/tests/integration/test_simulation.py b/tests/integration/test_simulation.py index 4ee9a4225e0..852cc3338ab 100644 --- a/tests/integration/test_simulation.py +++ b/tests/integration/test_simulation.py @@ -398,6 +398,19 @@ async def test_settings_broadcast( 'simulation': {'fail cycle points': ''} }]) + # Assert that list of broadcasts doesn't change if we submit + # Invalid fail cycle points to broadcast. + schd.task_events_mgr.broadcast_mgr.put_broadcast( + ['1066'], ['one'], [{ + 'simulation': {'fail cycle points': 'higadfuhasgiurguj'} + }]) + schd.task_events_mgr.broadcast_mgr.put_broadcast( + ['1066'], ['one'], [{ + 'simulation': {'fail cycle points': '1'} + }]) + assert {'fail cycle points': []} == ( + schd.broadcast_mgr.broadcasts['1066']['one']['simulation']) + # Submit again - result is different: schd.task_job_mgr._simulation_submit_task_jobs( [itask], schd.workflow) @@ -413,3 +426,4 @@ async def test_settings_broadcast( schd.task_job_mgr._simulation_submit_task_jobs( [itask], schd.workflow) assert itask.mode_settings.sim_task_fails is True + From 10580237fc52ce043277fa43dacd032d66eca6b4 Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Wed, 28 Feb 2024 09:04:25 +0000 Subject: [PATCH 29/51] Fix a broken test, and some linting and typing issues --- cylc/flow/broadcast_mgr.py | 17 +++++++++++++---- cylc/flow/network/resolvers.py | 4 +++- cylc/flow/simulation.py | 2 +- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/cylc/flow/broadcast_mgr.py b/cylc/flow/broadcast_mgr.py index 0f9d68977d5..6b852c63600 100644 --- a/cylc/flow/broadcast_mgr.py +++ b/cylc/flow/broadcast_mgr.py @@ -83,7 +83,12 @@ def check_ext_triggers(self, itask, ext_trigger_queue): return self._match_ext_trigger(itask) def clear_broadcast( - self, point_strings=None, namespaces=None, cancel_settings=None): + self, + point_strings=None, + namespaces=None, + fail_points=None, + cancel_settings=None + ): """Clear broadcasts globally, or for listed namespaces and/or points. Return a tuple (modified_settings, bad_options), where: @@ -132,7 +137,11 @@ def clear_broadcast( # Prune any empty branches bad_options = self._get_bad_options( - self._prune(), point_strings, namespaces, cancel_keys_list) + self._prune(), + point_strings, + namespaces, + fail_points, + cancel_keys_list) # Log the broadcast self.workflow_db_mgr.put_broadcast(modified_settings, is_cancel=True) @@ -326,7 +335,7 @@ def put_broadcast( if bad_namespaces: bad_options["namespaces"] = bad_namespaces if bad_fail_points: - bad_options['fail_cycle_points'] = bad_fail_points + bad_options['bad_fail_points'] = bad_fail_points if modified_settings: self.data_store_mgr.delta_broadcast() return modified_settings, bad_options @@ -357,7 +366,7 @@ def _get_bad_options( for opt_name, opt_list, opt_test in [ ("point_strings", point_strings, cls._point_string_in_prunes), ("namespaces", namespaces, cls._namespace_in_prunes), - ('fail_points', fail_points, cls._fail_points_in_prunes), + ('bad_fail_points', fail_points, cls._fail_points_in_prunes), ("cancel", cancel_keys_list, cls._cancel_keys_in_prunes)]: if opt_list: bad_options[opt_name] = set(opt_list) diff --git a/cylc/flow/network/resolvers.py b/cylc/flow/network/resolvers.py index 5af50b0a4f8..983ce69023d 100644 --- a/cylc/flow/network/resolvers.py +++ b/cylc/flow/network/resolvers.py @@ -767,7 +767,9 @@ def broadcast( cycle_points, namespaces, settings) if mode == 'clear_broadcast': return self.schd.task_events_mgr.broadcast_mgr.clear_broadcast( - cycle_points, namespaces, settings) + point_strings=cycle_points, + namespaces=namespaces, + cancel_settings=settings) if mode == 'expire_broadcast': return self.schd.task_events_mgr.broadcast_mgr.expire_broadcast( cutoff) diff --git a/cylc/flow/simulation.py b/cylc/flow/simulation.py index bbca671aaba..4d1200aac9d 100644 --- a/cylc/flow/simulation.py +++ b/cylc/flow/simulation.py @@ -78,7 +78,7 @@ def __init__( self, itask: 'TaskProxy', broadcast_mgr: 'BroadcastMgr', - db_mgr: 'Optional[WorkflowDatabaseManager]', + db_mgr: 'WorkflowDatabaseManager', ): # itask.summary['started_time'] and mode_settings.timeout need From 96db8267254a1256953879c72ce165d0bd4af53a Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Wed, 28 Feb 2024 09:49:42 +0000 Subject: [PATCH 30/51] Update cylc/flow/network/resolvers.py --- cylc/flow/network/resolvers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cylc/flow/network/resolvers.py b/cylc/flow/network/resolvers.py index 983ce69023d..afa9bdc18f4 100644 --- a/cylc/flow/network/resolvers.py +++ b/cylc/flow/network/resolvers.py @@ -768,7 +768,7 @@ def broadcast( if mode == 'clear_broadcast': return self.schd.task_events_mgr.broadcast_mgr.clear_broadcast( point_strings=cycle_points, - namespaces=namespaces, + namespaces=namespaces, cancel_settings=settings) if mode == 'expire_broadcast': return self.schd.task_events_mgr.broadcast_mgr.expire_broadcast( From 0b41424303700f10436b2c095069cb001449cb4b Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Mon, 4 Mar 2024 10:15:03 +0000 Subject: [PATCH 31/51] Apply suggestions from code review Co-authored-by: Ronnie Dutta <61982285+MetRonnie@users.noreply.github.com> --- cylc/flow/broadcast_mgr.py | 5 ++--- tests/integration/test_simulation.py | 5 +++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cylc/flow/broadcast_mgr.py b/cylc/flow/broadcast_mgr.py index 6b852c63600..95bd0d79daa 100644 --- a/cylc/flow/broadcast_mgr.py +++ b/cylc/flow/broadcast_mgr.py @@ -278,8 +278,7 @@ def put_broadcast( with self.lock: for setting in settings: fail_cycle_points = setting.get( - 'simulation', {'fail cycle points': None}).get( - 'fail cycle points', None) + 'simulation', {}).get('fail cycle points', None) if fail_cycle_points: from cylc.flow.simulation import parse_fail_cycle_points try: @@ -394,7 +393,7 @@ def _point_string_in_prunes(prunes, point_string): @staticmethod def _fail_points_in_prunes(prunes, fail_points): - """Is point_string pruned?""" + """Is fail cycle points pruned?""" return fail_points in [prune[0] for prune in prunes] def _prune(self): diff --git a/tests/integration/test_simulation.py b/tests/integration/test_simulation.py index 852cc3338ab..3d882b7b32d 100644 --- a/tests/integration/test_simulation.py +++ b/tests/integration/test_simulation.py @@ -408,8 +408,9 @@ async def test_settings_broadcast( ['1066'], ['one'], [{ 'simulation': {'fail cycle points': '1'} }]) - assert {'fail cycle points': []} == ( - schd.broadcast_mgr.broadcasts['1066']['one']['simulation']) + assert schd.broadcast_mgr.broadcasts['1066']['one'][ + 'simulation' + ] == {'fail cycle points': []} # Submit again - result is different: schd.task_job_mgr._simulation_submit_task_jobs( From d93de69f1439bccb721310e71e6312bac3310e52 Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Mon, 4 Mar 2024 10:16:04 +0000 Subject: [PATCH 32/51] Update changes.d/5721.feat.md --- changes.d/5721.feat.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changes.d/5721.feat.md b/changes.d/5721.feat.md index fa4adf9c455..24c5476e2ad 100644 --- a/changes.d/5721.feat.md +++ b/changes.d/5721.feat.md @@ -1 +1 @@ -Allow task's run mode (i.e. "live", "simulation" or "dummy") to be changed dynamically at runtime using `cylc broadcast`. \ No newline at end of file +Allow task's run mode settings to be changed dynamically at runtime using `cylc broadcast`. \ No newline at end of file From 62688095edfb0301d11d7f97c789c8a11ba7d96c Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Mon, 4 Mar 2024 11:59:03 +0000 Subject: [PATCH 33/51] Update cylc/flow/simulation.py --- cylc/flow/simulation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cylc/flow/simulation.py b/cylc/flow/simulation.py index 4d1200aac9d..f7ceafa49a3 100644 --- a/cylc/flow/simulation.py +++ b/cylc/flow/simulation.py @@ -87,7 +87,7 @@ def __init__( try_num = None if started_time is None: # Get DB info - db_info = db_mgr.pub_dao.select_task_job( + db_info = db_mgr.pri_dao.select_task_job( itask.tokens['cycle'], itask.tokens['task'], itask.tokens['job'], From 5447530d8b36349f84cfafe208d41ad36f2843e9 Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Mon, 4 Mar 2024 11:59:32 +0000 Subject: [PATCH 34/51] Update changes.d/5721.feat.md Co-authored-by: Ronnie Dutta <61982285+MetRonnie@users.noreply.github.com> --- changes.d/5721.feat.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changes.d/5721.feat.md b/changes.d/5721.feat.md index 24c5476e2ad..009b9067189 100644 --- a/changes.d/5721.feat.md +++ b/changes.d/5721.feat.md @@ -1 +1 @@ -Allow task's run mode settings to be changed dynamically at runtime using `cylc broadcast`. \ No newline at end of file +Allow tasks' simulation mode settings to be changed dynamically at runtime using `cylc broadcast`. \ No newline at end of file From ab532274cb74a3d14c6f4bf0ba3297a7ad0761f9 Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Mon, 4 Mar 2024 12:47:03 +0000 Subject: [PATCH 35/51] fix broken test --- tests/integration/test_simulation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_simulation.py b/tests/integration/test_simulation.py index 3d882b7b32d..130c979a598 100644 --- a/tests/integration/test_simulation.py +++ b/tests/integration/test_simulation.py @@ -279,7 +279,7 @@ async def test_settings_restart( if itask.identity == '1066/two': # Delete the database entry for `two`: Ensure that # we don't break sim mode on upgrade to this version of Cylc. - schd.workflow_db_mgr.pub_dao.connect().execute( + schd.workflow_db_mgr.pri_dao.connect().execute( 'UPDATE task_jobs' '\n SET time_submit = NULL' '\n WHERE (name == \'two\')' From d2b372e6875934bbc4820ea223554c44e5d9e3a5 Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Tue, 5 Mar 2024 09:29:40 +0000 Subject: [PATCH 36/51] make sim mode messages _look_ external --- cylc/flow/simulation.py | 9 ++++++--- .../cylc-get-config/04-dummy-mode-output.t | 4 ++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/cylc/flow/simulation.py b/cylc/flow/simulation.py index f7ceafa49a3..ca1436f3578 100644 --- a/cylc/flow/simulation.py +++ b/cylc/flow/simulation.py @@ -271,16 +271,19 @@ def sim_time_check( if now > itask.mode_settings.timeout: if itask.mode_settings.sim_task_fails: task_events_manager.process_message( - itask, 'CRITICAL', TASK_STATUS_FAILED + itask, 'CRITICAL', TASK_STATUS_FAILED, + flag=task_events_manager.FLAG_RECEIVED ) else: task_events_manager.process_message( - itask, 'DEBUG', TASK_STATUS_SUCCEEDED + itask, 'DEBUG', TASK_STATUS_SUCCEEDED, + flag=task_events_manager.FLAG_RECEIVED ) # Simulate message outputs. for msg in itask.tdef.rtconfig['outputs'].values(): task_events_manager.process_message( - itask, 'DEBUG', msg + itask, 'DEBUG', msg, + flag=task_events_manager.FLAG_RECEIVED ) # We've finished this psuedojob, so delete all the mode settings. diff --git a/tests/flakyfunctional/cylc-get-config/04-dummy-mode-output.t b/tests/flakyfunctional/cylc-get-config/04-dummy-mode-output.t index 8acf159cc68..9fa85a4aec9 100755 --- a/tests/flakyfunctional/cylc-get-config/04-dummy-mode-output.t +++ b/tests/flakyfunctional/cylc-get-config/04-dummy-mode-output.t @@ -47,8 +47,8 @@ delete_db workflow_run_ok "${TEST_NAME_BASE}-run-simulation" \ cylc play -m 'simulation' --reference-test --debug --no-detach "${WORKFLOW_NAME}" LOG="$(cylc log -m p "$WORKFLOW_NAME")" -count_ok '(internal)meet' "${LOG}" 1 -count_ok '(internal)greet' "${LOG}" 1 +count_ok '(received)meet' "${LOG}" 1 +count_ok '(received)greet' "${LOG}" 1 purge exit From ceb30639296796ee967b3877f80c0cc3436379d9 Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Tue, 5 Mar 2024 10:00:37 +0000 Subject: [PATCH 37/51] removed bc manager changes - one test is failing... --- cylc/flow/broadcast_mgr.py | 47 ++++---------------------------------- cylc/flow/simulation.py | 4 +++- 2 files changed, 8 insertions(+), 43 deletions(-) diff --git a/cylc/flow/broadcast_mgr.py b/cylc/flow/broadcast_mgr.py index 95bd0d79daa..81504055052 100644 --- a/cylc/flow/broadcast_mgr.py +++ b/cylc/flow/broadcast_mgr.py @@ -83,12 +83,7 @@ def check_ext_triggers(self, itask, ext_trigger_queue): return self._match_ext_trigger(itask) def clear_broadcast( - self, - point_strings=None, - namespaces=None, - fail_points=None, - cancel_settings=None - ): + self, point_strings=None, namespaces=None, cancel_settings=None): """Clear broadcasts globally, or for listed namespaces and/or points. Return a tuple (modified_settings, bad_options), where: @@ -137,11 +132,7 @@ def clear_broadcast( # Prune any empty branches bad_options = self._get_bad_options( - self._prune(), - point_strings, - namespaces, - fail_points, - cancel_keys_list) + self._prune(), point_strings, namespaces, cancel_keys_list) # Log the broadcast self.workflow_db_mgr.put_broadcast(modified_settings, is_cancel=True) @@ -274,22 +265,9 @@ def put_broadcast( modified_settings = [] bad_point_strings = [] bad_namespaces = [] - bad_fail_points = [] + with self.lock: for setting in settings: - fail_cycle_points = setting.get( - 'simulation', {}).get('fail cycle points', None) - if fail_cycle_points: - from cylc.flow.simulation import parse_fail_cycle_points - try: - parse_fail_cycle_points( - [i.strip() for i in fail_cycle_points.split(',')]) - except PointParsingError as exc: - exc_msg = ':'.join(exc.args[0].split(':')[:-2]) - bad_fail_points.append( - f'{fail_cycle_points} : {exc_msg}') - continue - for point_string in point_strings: # Standardise the point and check its validity. bad_point = False @@ -325,16 +303,13 @@ def put_broadcast( # Log the broadcast self.workflow_db_mgr.put_broadcast(modified_settings) - if modified_settings: - LOG.info(get_broadcast_change_report(modified_settings)) + LOG.info(get_broadcast_change_report(modified_settings)) bad_options = {} if bad_point_strings: bad_options["point_strings"] = bad_point_strings if bad_namespaces: bad_options["namespaces"] = bad_namespaces - if bad_fail_points: - bad_options['bad_fail_points'] = bad_fail_points if modified_settings: self.data_store_mgr.delta_broadcast() return modified_settings, bad_options @@ -347,13 +322,7 @@ def _cancel_keys_in_prunes(prunes, cancel_keys): @classmethod def _get_bad_options( - cls, - prunes, - point_strings, - namespaces, - fail_points, - cancel_keys_list - ): + cls, prunes, point_strings, namespaces, cancel_keys_list): """Return unpruned namespaces and/or point_strings options.""" cancel_keys_list = [ tuple(cancel_keys) for cancel_keys in cancel_keys_list] @@ -365,7 +334,6 @@ def _get_bad_options( for opt_name, opt_list, opt_test in [ ("point_strings", point_strings, cls._point_string_in_prunes), ("namespaces", namespaces, cls._namespace_in_prunes), - ('bad_fail_points', fail_points, cls._fail_points_in_prunes), ("cancel", cancel_keys_list, cls._cancel_keys_in_prunes)]: if opt_list: bad_options[opt_name] = set(opt_list) @@ -391,11 +359,6 @@ def _point_string_in_prunes(prunes, point_string): """Is point_string pruned?""" return point_string in [prune[0] for prune in prunes] - @staticmethod - def _fail_points_in_prunes(prunes, fail_points): - """Is fail cycle points pruned?""" - return fail_points in [prune[0] for prune in prunes] - def _prune(self): """Remove empty leaves left by unsetting broadcast values. diff --git a/cylc/flow/simulation.py b/cylc/flow/simulation.py index ca1436f3578..724a86e7c0e 100644 --- a/cylc/flow/simulation.py +++ b/cylc/flow/simulation.py @@ -301,9 +301,11 @@ def sim_task_failed( Allows Unit testing. """ - return ( + x = ( sim_conf['fail cycle points'] is None # i.e. "all" or point in sim_conf['fail cycle points'] ) and ( try_num == 1 or not sim_conf['fail try 1 only'] ) + # breakpoint(header=f'{x},{sim_conf}, {point}, {try_num}') + return x From 89db69c390f492e00e87e5361cb57a93ff203c30 Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Tue, 5 Mar 2024 11:10:32 +0000 Subject: [PATCH 38/51] Prevent totally invalid fail cycle points being accepted for simulation. --- cylc/flow/simulation.py | 23 ++++++++++++++++++----- tests/integration/test_simulation.py | 25 +++++++++++++++---------- 2 files changed, 33 insertions(+), 15 deletions(-) diff --git a/cylc/flow/simulation.py b/cylc/flow/simulation.py index 724a86e7c0e..69c5991c1ae 100644 --- a/cylc/flow/simulation.py +++ b/cylc/flow/simulation.py @@ -20,7 +20,9 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union from time import time +from cylc.flow import LOG from cylc.flow.cycling.loader import get_point +from cylc.flow.exceptions import PointParsingError from cylc.flow.parsec.util import ( pdeepcopy, poverride @@ -109,11 +111,22 @@ def __init__( if overrides: rtconfig = pdeepcopy(itask.tdef.rtconfig) poverride(rtconfig, overrides, prepend=True) - rtconfig["simulation"][ - "fail cycle points" - ] = parse_fail_cycle_points( - rtconfig["simulation"]["fail cycle points"] - ) + + try: + rtconfig["simulation"][ + "fail cycle points" + ] = parse_fail_cycle_points( + rtconfig["simulation"]["fail cycle points"] + ) + except PointParsingError as exc: + # Broadcast Fail CP didn't parse + LOG.error( + 'Broadcast fail cycle point was invalid:\n' + f' {exc.args[0]}' + ) + rtconfig['simulation'][ + 'fail cycle points' + ] = itask.tdef.rtconfig['simulation']['fail cycle points'] else: rtconfig = itask.tdef.rtconfig diff --git a/tests/integration/test_simulation.py b/tests/integration/test_simulation.py index 130c979a598..efcbef7d4cd 100644 --- a/tests/integration/test_simulation.py +++ b/tests/integration/test_simulation.py @@ -373,7 +373,7 @@ async def test_settings_broadcast( } }, defaults=False) schd = scheduler(id_, paused_start=False, run_mode='simulation') - async with start(schd): + async with start(schd) as log: itask = schd.pool.get_tasks()[0] itask.state.is_queued = False @@ -391,39 +391,44 @@ async def test_settings_broadcast( # The mode_settings object has been cleared: assert itask.mode_settings is None - # Change a setting using broadcast: schd.task_events_mgr.broadcast_mgr.put_broadcast( ['1066'], ['one'], [{ 'simulation': {'fail cycle points': ''} }]) + # Submit again - result is different: + schd.task_job_mgr._simulation_submit_task_jobs( + [itask], schd.workflow) + assert itask.mode_settings.sim_task_fails is False # Assert that list of broadcasts doesn't change if we submit # Invalid fail cycle points to broadcast. + itask.mode_settings = None schd.task_events_mgr.broadcast_mgr.put_broadcast( ['1066'], ['one'], [{ 'simulation': {'fail cycle points': 'higadfuhasgiurguj'} }]) + schd.task_job_mgr._simulation_submit_task_jobs( + [itask], schd.workflow) + assert ( + 'Invalid ISO 8601 date representation: higadfuhasgiurguj' + in log.messages[-1]) + schd.task_events_mgr.broadcast_mgr.put_broadcast( ['1066'], ['one'], [{ 'simulation': {'fail cycle points': '1'} }]) - assert schd.broadcast_mgr.broadcasts['1066']['one'][ - 'simulation' - ] == {'fail cycle points': []} - - # Submit again - result is different: schd.task_job_mgr._simulation_submit_task_jobs( [itask], schd.workflow) - assert itask.mode_settings.sim_task_fails is False + assert ( + 'Invalid ISO 8601 date representation: 1' + in log.messages[-1]) # Broadcast tasks will reparse correctly: schd.task_events_mgr.broadcast_mgr.put_broadcast( ['1066'], ['one'], [{ 'simulation': {'fail cycle points': '1945, 1977, 1066'} }]) - - # Submit again - result is different: schd.task_job_mgr._simulation_submit_task_jobs( [itask], schd.workflow) assert itask.mode_settings.sim_task_fails is True From cd2fe555fcf9d02afab6c447ae73e3e464142963 Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Wed, 6 Mar 2024 09:17:09 +0000 Subject: [PATCH 39/51] Update cylc/flow/simulation.py Co-authored-by: Ronnie Dutta <61982285+MetRonnie@users.noreply.github.com> --- cylc/flow/simulation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cylc/flow/simulation.py b/cylc/flow/simulation.py index 69c5991c1ae..768bb5c4e5f 100644 --- a/cylc/flow/simulation.py +++ b/cylc/flow/simulation.py @@ -120,7 +120,7 @@ def __init__( ) except PointParsingError as exc: # Broadcast Fail CP didn't parse - LOG.error( + LOG.warning( 'Broadcast fail cycle point was invalid:\n' f' {exc.args[0]}' ) From 677324655e5fe1a4ba7effa575f1950639aacb6f Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Wed, 6 Mar 2024 09:51:52 +0000 Subject: [PATCH 40/51] ensure than changing `fail try 1 only` doesn't cause failure --- cylc/flow/simulation.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/cylc/flow/simulation.py b/cylc/flow/simulation.py index 768bb5c4e5f..2c733f705e2 100644 --- a/cylc/flow/simulation.py +++ b/cylc/flow/simulation.py @@ -244,11 +244,13 @@ def parse_fail_cycle_points( True >>> this([]) [] + >>> this(None) + [] """ - f_pts: 'Optional[List[PointBase]]' - if 'all' in f_pts_orig: + f_pts: 'Optional[List[PointBase]]' = [] + if f_pts_orig and 'all' in f_pts_orig: f_pts = None - else: + elif f_pts_orig: f_pts = [] for point_str in f_pts_orig: f_pts.append(get_point(point_str).standardise()) From d147b0fd7ba3c39b33736f421f7bcb70c84560df Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Wed, 6 Mar 2024 17:14:56 +0000 Subject: [PATCH 41/51] Prevent repeated use of sim_task_failed giving different answers (based on the previous answer). --- cylc/flow/simulation.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/cylc/flow/simulation.py b/cylc/flow/simulation.py index 2c733f705e2..e9b41f92746 100644 --- a/cylc/flow/simulation.py +++ b/cylc/flow/simulation.py @@ -245,10 +245,13 @@ def parse_fail_cycle_points( >>> this([]) [] >>> this(None) - [] + None """ f_pts: 'Optional[List[PointBase]]' = [] - if f_pts_orig and 'all' in f_pts_orig: + if ( + f_pts_orig is None + or f_pts_orig and 'all' in f_pts_orig + ): f_pts = None elif f_pts_orig: f_pts = [] @@ -316,11 +319,9 @@ def sim_task_failed( Allows Unit testing. """ - x = ( + return ( sim_conf['fail cycle points'] is None # i.e. "all" or point in sim_conf['fail cycle points'] ) and ( try_num == 1 or not sim_conf['fail try 1 only'] ) - # breakpoint(header=f'{x},{sim_conf}, {point}, {try_num}') - return x From 269078a3c7b76d33e8259bf6fe6ca35f50b51400 Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Wed, 6 Mar 2024 17:21:05 +0000 Subject: [PATCH 42/51] fix --- cylc/flow/simulation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cylc/flow/simulation.py b/cylc/flow/simulation.py index e9b41f92746..72c4db879d2 100644 --- a/cylc/flow/simulation.py +++ b/cylc/flow/simulation.py @@ -244,8 +244,8 @@ def parse_fail_cycle_points( True >>> this([]) [] - >>> this(None) - None + >>> this(None) is None + True """ f_pts: 'Optional[List[PointBase]]' = [] if ( From b4f4bb10630b1606c4bf4c6f36bb41696c1adc7a Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Mon, 11 Mar 2024 09:54:37 +0000 Subject: [PATCH 43/51] Broadcast changes to simulated tasks in task_job_manager to allow non simulation settings to be modified and to avoid changing the itask.tdef.rtconfig. --- cylc/flow/broadcast_mgr.py | 15 ++++++++++++++- cylc/flow/simulation.py | 19 +++++-------------- cylc/flow/task_job_mgr.py | 9 +++++++-- tests/integration/test_simulation.py | 7 +++++-- 4 files changed, 31 insertions(+), 19 deletions(-) diff --git a/cylc/flow/broadcast_mgr.py b/cylc/flow/broadcast_mgr.py index 81504055052..6cd007ee25a 100644 --- a/cylc/flow/broadcast_mgr.py +++ b/cylc/flow/broadcast_mgr.py @@ -30,11 +30,12 @@ from cylc.flow.cfgspec.workflow import SPEC from cylc.flow.cycling.loader import get_point, standardise_point_string from cylc.flow.exceptions import PointParsingError -from cylc.flow.parsec.util import listjoin +from cylc.flow.parsec.util import listjoin, pdeepcopy, poverride from cylc.flow.parsec.validate import BroadcastConfigValidator if TYPE_CHECKING: from cylc.flow.id import Tokens + from cylc.flow.task_proxy import TaskProxy ALL_CYCLE_POINTS_STRS = ["*", "all-cycle-points", "all-cycles"] @@ -179,6 +180,18 @@ def get_broadcast(self, tokens: 'Optional[Tokens]' = None) -> dict: addict(ret, self.broadcasts[cycle][namespace]) return ret + def get_updated_rtconfig(self, itask: 'TaskProxy') -> dict: + """Retrieve updated rtconfig for a single task proxy""" + overrides = self.get_broadcast( + itask.tokens + ) + if overrides: + rtconfig = pdeepcopy(itask.tdef.rtconfig) + poverride(rtconfig, overrides, prepend=True) + else: + rtconfig = itask.tdef.rtconfig + return rtconfig + def load_db_broadcast_states(self, row_idx, row): """Load broadcast variables from runtime DB broadcast states row.""" if row_idx == 0: diff --git a/cylc/flow/simulation.py b/cylc/flow/simulation.py index 72c4db879d2..edf52d322d3 100644 --- a/cylc/flow/simulation.py +++ b/cylc/flow/simulation.py @@ -23,10 +23,6 @@ from cylc.flow import LOG from cylc.flow.cycling.loader import get_point from cylc.flow.exceptions import PointParsingError -from cylc.flow.parsec.util import ( - pdeepcopy, - poverride -) from cylc.flow.platforms import FORBIDDEN_WITH_PLATFORM from cylc.flow.task_state import ( TASK_STATUS_RUNNING, @@ -38,7 +34,6 @@ from metomi.isodatetime.parsers import DurationParser if TYPE_CHECKING: - from cylc.flow.broadcast_mgr import BroadcastMgr from cylc.flow.task_events_mgr import TaskEventsManager from cylc.flow.task_proxy import TaskProxy from cylc.flow.workflow_db_mgr import WorkflowDatabaseManager @@ -79,8 +74,8 @@ class ModeSettings: def __init__( self, itask: 'TaskProxy', - broadcast_mgr: 'BroadcastMgr', db_mgr: 'WorkflowDatabaseManager', + rtconfig ): # itask.summary['started_time'] and mode_settings.timeout need @@ -107,11 +102,7 @@ def __init__( try_num = db_info["try_num"] # Update anything changed by broadcast: - overrides = broadcast_mgr.get_broadcast(itask.tokens) - if overrides: - rtconfig = pdeepcopy(itask.tdef.rtconfig) - poverride(rtconfig, overrides, prepend=True) - + if rtconfig != itask.tdef.rtconfig: try: rtconfig["simulation"][ "fail cycle points" @@ -127,8 +118,6 @@ def __init__( rtconfig['simulation'][ 'fail cycle points' ] = itask.tdef.rtconfig['simulation']['fail cycle points'] - else: - rtconfig = itask.tdef.rtconfig # Calculate simulation info: self.simulated_run_length = ( @@ -280,10 +269,12 @@ def sim_time_check( # This occurs if the workflow has been restarted. if itask.mode_settings is None: + rtconfig = task_events_manager.broadcast_mgr.get_updated_rtconfig( + itask) itask.mode_settings = ModeSettings( itask, - task_events_manager.broadcast_mgr, db_mgr, + rtconfig ) if now > itask.mode_settings.timeout: diff --git a/cylc/flow/task_job_mgr.py b/cylc/flow/task_job_mgr.py index 936bc38d0fc..f52c88b85e7 100644 --- a/cylc/flow/task_job_mgr.py +++ b/cylc/flow/task_job_mgr.py @@ -1001,13 +1001,18 @@ def _simulation_submit_task_jobs(self, itasks, workflow): now = time() now_str = get_time_string_from_unix_time(now) for itask in itasks: + # Handle broadcasts + rtconfig = self.task_events_mgr.broadcast_mgr.get_updated_rtconfig( + itask) + itask.summary['started_time'] = now self._set_retry_timers(itask) itask.mode_settings = ModeSettings( itask, - self.task_events_mgr.broadcast_mgr, - self.workflow_db_mgr + self.workflow_db_mgr, + rtconfig ) + itask.waiting_on_job_prep = False itask.submit_num += 1 diff --git a/tests/integration/test_simulation.py b/tests/integration/test_simulation.py index efcbef7d4cd..bad9b94835b 100644 --- a/tests/integration/test_simulation.py +++ b/tests/integration/test_simulation.py @@ -365,9 +365,11 @@ async def test_settings_broadcast( 'runtime': { 'one': { 'execution time limit': 'PT1S', + 'execution retry delays': '2*PT5S', 'simulation': { 'speedup factor': 1, 'fail cycle points': '1066', + 'fail try 1 only': False } }, } @@ -427,9 +429,10 @@ async def test_settings_broadcast( # Broadcast tasks will reparse correctly: schd.task_events_mgr.broadcast_mgr.put_broadcast( ['1066'], ['one'], [{ - 'simulation': {'fail cycle points': '1945, 1977, 1066'} + 'simulation': {'fail cycle points': '1945, 1977, 1066'}, + 'execution retry delays': '5*PT2S' }]) schd.task_job_mgr._simulation_submit_task_jobs( [itask], schd.workflow) assert itask.mode_settings.sim_task_fails is True - + assert itask.tdef.rtconfig['execution retry delays'] == [5.0, 5.0] From 370d2b6538c06884e283458f0bb20e508fa82587 Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Mon, 11 Mar 2024 10:04:15 +0000 Subject: [PATCH 44/51] test that clearing broadcasts works for sim tasks --- tests/integration/test_simulation.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tests/integration/test_simulation.py b/tests/integration/test_simulation.py index bad9b94835b..3474976c7ca 100644 --- a/tests/integration/test_simulation.py +++ b/tests/integration/test_simulation.py @@ -394,7 +394,7 @@ async def test_settings_broadcast( # The mode_settings object has been cleared: assert itask.mode_settings is None # Change a setting using broadcast: - schd.task_events_mgr.broadcast_mgr.put_broadcast( + schd.broadcast_mgr.put_broadcast( ['1066'], ['one'], [{ 'simulation': {'fail cycle points': ''} }]) @@ -403,10 +403,16 @@ async def test_settings_broadcast( [itask], schd.workflow) assert itask.mode_settings.sim_task_fails is False + # Assert Clearing the broadcast works + schd.broadcast_mgr.clear_broadcast() + schd.task_job_mgr._simulation_submit_task_jobs( + [itask], schd.workflow) + assert itask.mode_settings.sim_task_fails is True + # Assert that list of broadcasts doesn't change if we submit # Invalid fail cycle points to broadcast. itask.mode_settings = None - schd.task_events_mgr.broadcast_mgr.put_broadcast( + schd.broadcast_mgr.put_broadcast( ['1066'], ['one'], [{ 'simulation': {'fail cycle points': 'higadfuhasgiurguj'} }]) @@ -416,7 +422,7 @@ async def test_settings_broadcast( 'Invalid ISO 8601 date representation: higadfuhasgiurguj' in log.messages[-1]) - schd.task_events_mgr.broadcast_mgr.put_broadcast( + schd.broadcast_mgr.put_broadcast( ['1066'], ['one'], [{ 'simulation': {'fail cycle points': '1'} }]) @@ -427,7 +433,7 @@ async def test_settings_broadcast( in log.messages[-1]) # Broadcast tasks will reparse correctly: - schd.task_events_mgr.broadcast_mgr.put_broadcast( + schd.broadcast_mgr.put_broadcast( ['1066'], ['one'], [{ 'simulation': {'fail cycle points': '1945, 1977, 1066'}, 'execution retry delays': '5*PT2S' From b9e1d15bcf165856317c5b1039df7faa5391ec0e Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Mon, 11 Mar 2024 13:40:20 +0000 Subject: [PATCH 45/51] Update cylc/flow/simulation.py Co-authored-by: Ronnie Dutta <61982285+MetRonnie@users.noreply.github.com> --- cylc/flow/simulation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cylc/flow/simulation.py b/cylc/flow/simulation.py index edf52d322d3..3b737eec642 100644 --- a/cylc/flow/simulation.py +++ b/cylc/flow/simulation.py @@ -101,7 +101,7 @@ def __init__( # Get the try number: try_num = db_info["try_num"] - # Update anything changed by broadcast: + # Parse fail cycle points: if rtconfig != itask.tdef.rtconfig: try: rtconfig["simulation"][ From b355ae272793b0833d799deeb8b563ae831c4f9e Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Mon, 11 Mar 2024 13:40:30 +0000 Subject: [PATCH 46/51] Update cylc/flow/simulation.py Co-authored-by: Ronnie Dutta <61982285+MetRonnie@users.noreply.github.com> --- cylc/flow/simulation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cylc/flow/simulation.py b/cylc/flow/simulation.py index 3b737eec642..c8d91c93356 100644 --- a/cylc/flow/simulation.py +++ b/cylc/flow/simulation.py @@ -75,7 +75,7 @@ def __init__( self, itask: 'TaskProxy', db_mgr: 'WorkflowDatabaseManager', - rtconfig + rtconfig: Dict[str, Any] ): # itask.summary['started_time'] and mode_settings.timeout need From 44abd5acb3aefa26291795540f0f4ad2406d85ec Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Mon, 11 Mar 2024 13:40:37 +0000 Subject: [PATCH 47/51] Update tests/integration/test_simulation.py Co-authored-by: Ronnie Dutta <61982285+MetRonnie@users.noreply.github.com> --- tests/integration/test_simulation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_simulation.py b/tests/integration/test_simulation.py index 3474976c7ca..5d95066c6a8 100644 --- a/tests/integration/test_simulation.py +++ b/tests/integration/test_simulation.py @@ -441,4 +441,4 @@ async def test_settings_broadcast( schd.task_job_mgr._simulation_submit_task_jobs( [itask], schd.workflow) assert itask.mode_settings.sim_task_fails is True - assert itask.tdef.rtconfig['execution retry delays'] == [5.0, 5.0] + assert itask.tdef.rtconfig['execution retry delays'] == [2, 2, 2, 2, 2] From cb064a053c6b713e99500b2ba09974df11b5aa8e Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Mon, 11 Mar 2024 15:00:10 +0000 Subject: [PATCH 48/51] fix test --- cylc/flow/task_job_mgr.py | 2 +- tests/integration/test_simulation.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/cylc/flow/task_job_mgr.py b/cylc/flow/task_job_mgr.py index f52c88b85e7..05795197139 100644 --- a/cylc/flow/task_job_mgr.py +++ b/cylc/flow/task_job_mgr.py @@ -1006,7 +1006,7 @@ def _simulation_submit_task_jobs(self, itasks, workflow): itask) itask.summary['started_time'] = now - self._set_retry_timers(itask) + self._set_retry_timers(itask, rtconfig) itask.mode_settings = ModeSettings( itask, self.workflow_db_mgr, diff --git a/tests/integration/test_simulation.py b/tests/integration/test_simulation.py index 5d95066c6a8..72cf23996a4 100644 --- a/tests/integration/test_simulation.py +++ b/tests/integration/test_simulation.py @@ -436,9 +436,11 @@ async def test_settings_broadcast( schd.broadcast_mgr.put_broadcast( ['1066'], ['one'], [{ 'simulation': {'fail cycle points': '1945, 1977, 1066'}, - 'execution retry delays': '5*PT2S' + 'execution retry delays': '3*PT2S' }]) schd.task_job_mgr._simulation_submit_task_jobs( [itask], schd.workflow) assert itask.mode_settings.sim_task_fails is True - assert itask.tdef.rtconfig['execution retry delays'] == [2, 2, 2, 2, 2] + assert itask.try_timers['execution-retry'].delays == [2.0, 2.0, 2.0] + # n.b. rtconfig should remain unchanged, lest we cancel broadcasts: + assert itask.tdef.rtconfig['execution retry delays'] == [5.0, 5.0] From 6be73724e5104e8e887982d571b923d727dfd893 Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Thu, 14 Mar 2024 15:22:56 +0000 Subject: [PATCH 49/51] Update changes.d/5721.feat.md Co-authored-by: Hilary James Oliver --- changes.d/5721.feat.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changes.d/5721.feat.md b/changes.d/5721.feat.md index 009b9067189..bdeb27c7844 100644 --- a/changes.d/5721.feat.md +++ b/changes.d/5721.feat.md @@ -1 +1 @@ -Allow tasks' simulation mode settings to be changed dynamically at runtime using `cylc broadcast`. \ No newline at end of file +Allow task simulation mode settings to be changed dynamically using `cylc broadcast`. \ No newline at end of file From 389a0beba1891e2ce6b03e3eff39f7a932c813ec Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Thu, 14 Mar 2024 15:23:05 +0000 Subject: [PATCH 50/51] Update cylc/flow/simulation.py Co-authored-by: Hilary James Oliver --- cylc/flow/simulation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cylc/flow/simulation.py b/cylc/flow/simulation.py index c8d91c93356..ba67d4e1b46 100644 --- a/cylc/flow/simulation.py +++ b/cylc/flow/simulation.py @@ -295,7 +295,7 @@ def sim_time_check( flag=task_events_manager.FLAG_RECEIVED ) - # We've finished this psuedojob, so delete all the mode settings. + # We've finished this pseudo job, so delete all the mode settings. itask.mode_settings = None sim_task_state_changed = True return sim_task_state_changed From 9186ad11f6b0816384a6a1675a420e13660f7c33 Mon Sep 17 00:00:00 2001 From: Hilary James Oliver Date: Fri, 15 Mar 2024 12:32:39 +1300 Subject: [PATCH 51/51] Fix func test, after update on master. --- tests/functional/modes/05-sim-trigger.t | 27 ++++++++++++++----------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/tests/functional/modes/05-sim-trigger.t b/tests/functional/modes/05-sim-trigger.t index 272a95c2bc6..661a5843eda 100644 --- a/tests/functional/modes/05-sim-trigger.t +++ b/tests/functional/modes/05-sim-trigger.t @@ -18,27 +18,30 @@ # Test that we can re-trigger a task in sim mode . "$(dirname "$0")/test_header" -set_test_number 4 +set_test_number 5 install_workflow "${TEST_NAME_BASE}" "${TEST_NAME_BASE}" + run_ok "${TEST_NAME_BASE}-validate" cylc validate "${WORKFLOW_NAME}" + workflow_run_ok "${TEST_NAME_BASE}-start" \ cylc play "${WORKFLOW_NAME}" --mode=simulation + SCHD_LOG="${WORKFLOW_RUN_DIR}/log/scheduler/log" -# Wait for the workflow to stall, then check for first task failure: +# Wait for stall, then check for first task failure: poll_grep_workflow_log 'stall timer starts' -grep_ok \ - '\[1/fail_fail_fail running job:01 flows:1\] => failed' \ - "${SCHD_LOG}" -# Trigger task again, wait for it to send a warning and check that it -# too has failed: +grep_ok '\[1/fail_fail_fail/01:running\] => failed' "${SCHD_LOG}" + +# Trigger task again, and check that it too failed: cylc trigger "${WORKFLOW_NAME}//1/fail_fail_fail" -poll_grep_workflow_log \ - 'job:02.*did not complete' -grep_ok \ - '\[1/fail_fail_fail running job:02 flows:1\] => failed' \ - "${SCHD_LOG}" + +poll_grep_workflow_log -E \ + '1/fail_fail_fail/02.* did not complete required outputs' + +grep_ok '\[1/fail_fail_fail/02:running\] => failed' "${SCHD_LOG}" + +run_ok "stop" cylc stop --max-polls=10 --interval=1 "${WORKFLOW_NAME}" purge