diff --git a/example_runtime_config.toml b/example_runtime_config.toml index ff1468d2..4f8922d8 100644 --- a/example_runtime_config.toml +++ b/example_runtime_config.toml @@ -13,7 +13,14 @@ checkpoint_mode = 'task_exit' # e.g. "/gscratch/dirac/kbmod/workflow/staging" staging_directory = "/home/drew/code/kbmod-wf/dev_staging" +[apps.ic_to_wu] +# The path to the KBMOD search config file +# e.g. "/gscratch/dirac/kbmod/workflow/kbmod_search_config.yaml" +search_config_filepath = "/home/drew/code/kbmod-wf/dev_staging/search_config.yaml" + [apps.reproject_wu] +# Number of processors to use for parallelizing the reprojection +n_workers = 32 # The name of the observation site to use for reflex correction observation_site = "ctio" diff --git a/src/kbmod_wf/resource_configs/klone_configuration.py b/src/kbmod_wf/resource_configs/klone_configuration.py index 9621a872..2609d515 100644 --- a/src/kbmod_wf/resource_configs/klone_configuration.py +++ b/src/kbmod_wf/resource_configs/klone_configuration.py @@ -7,8 +7,8 @@ walltimes = { "compute_bigmem": "01:00:00", - "large_mem": "08:00:00", - "gpu_max": "04:00:00", + "large_mem": "04:00:00", + "gpu_max": "08:00:00", } @@ -20,19 +20,21 @@ def klone_resource_config(): os.path.join("/gscratch/dirac/kbmod/workflow/run_logs", datetime.date.today().isoformat()) ), run_dir=os.path.join("/gscratch/dirac/kbmod/workflow/run_logs", datetime.date.today().isoformat()), + retries=1, executors=[ HighThroughputExecutor( label="small_cpu", + max_workers_per_node=1, provider=SlurmProvider( partition="compute-bigmem", account="astro", min_blocks=0, max_blocks=4, - init_blocks=1, + init_blocks=0, parallelism=1, nodes_per_block=1, cores_per_node=1, # perhaps should be 8??? - mem_per_node=64, # In GB + mem_per_node=256, # In GB exclusive=False, walltime=walltimes["compute_bigmem"], # Command to run before starting worker - i.e. conda activate @@ -41,12 +43,13 @@ def klone_resource_config(): ), HighThroughputExecutor( label="large_mem", + max_workers_per_node=1, provider=SlurmProvider( - partition="compute-bigmem", + partition="ckpt-g2", account="astro", min_blocks=0, max_blocks=2, - init_blocks=1, + init_blocks=0, parallelism=1, nodes_per_block=1, cores_per_node=8, @@ -59,27 +62,28 @@ def klone_resource_config(): ), HighThroughputExecutor( label="gpu", - available_accelerators=2, + max_workers_per_node=1, provider=SlurmProvider( - partition="gpu_a40", + partition="ckpt-g2", account="escience", min_blocks=0, max_blocks=2, - init_blocks=1, + init_blocks=0, parallelism=1, nodes_per_block=1, - cores_per_node=4, # perhaps should be 8??? - mem_per_node=128, # In GB + cores_per_node=2, # perhaps should be 8??? + mem_per_node=512, # In GB exclusive=False, walltime=walltimes["gpu_max"], # Command to run before starting worker - i.e. conda activate worker_init="", + scheduler_options="#SBATCH --gpus=1", ), ), HighThroughputExecutor( label="local_thread", provider=LocalProvider( - init_blocks=1, + init_blocks=0, max_blocks=1, ), ), diff --git a/src/kbmod_wf/task_impls/ic_to_wu.py b/src/kbmod_wf/task_impls/ic_to_wu.py index 83096a6c..1607f08a 100644 --- a/src/kbmod_wf/task_impls/ic_to_wu.py +++ b/src/kbmod_wf/task_impls/ic_to_wu.py @@ -62,7 +62,7 @@ def __init__( self.logger = logger self.overwrite = self.runtime_config.get("overwrite", False) - self.search_config = self.runtime_config.get("search_config", None) + self.search_config_filepath = self.runtime_config.get("search_config_filepath", None) def create_work_unit(self): make_wu = True @@ -78,7 +78,7 @@ def create_work_unit(self): self.logger.info(f"ImageCollection read from {self.ic_filepath}, creating work unit next.") last_time = time.time() - orig_wu = ic.toWorkUnit(config=SearchConfiguration.from_file(self.search_config)) + orig_wu = ic.toWorkUnit(config=SearchConfiguration.from_file(self.search_config_filepath)) elapsed = round(time.time() - last_time, 1) self.logger.debug(f"Required {elapsed}[s] to create WorkUnit.") diff --git a/src/kbmod_wf/task_impls/reproject_wu.py b/src/kbmod_wf/task_impls/reproject_wu.py index 9a01b15f..0a7ccf43 100644 --- a/src/kbmod_wf/task_impls/reproject_wu.py +++ b/src/kbmod_wf/task_impls/reproject_wu.py @@ -85,9 +85,9 @@ def __init__( self.search_config = self.runtime_config.get("search_config", None) # Default to 8 workers if not in the config. Value must be 0