From 84dd1263ec408ce3dfe33b0cb01beefe25075c4b Mon Sep 17 00:00:00 2001
From: guipenedo
Date: Fri, 12 Jul 2024 02:13:47 +0200
Subject: [PATCH] do not kill everything when a single task fails

---
 src/datatrove/executor/slurm_nodes.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/datatrove/executor/slurm_nodes.py b/src/datatrove/executor/slurm_nodes.py
index 8cd90f48..611c3f78 100644
--- a/src/datatrove/executor/slurm_nodes.py
+++ b/src/datatrove/executor/slurm_nodes.py
@@ -262,7 +262,7 @@ def launch_job(self):
         srun_args_str = " ".join([f"--{k}={v}" for k, v in self.srun_args.items()]) if self.srun_args else ""
         launch_file_contents = self.get_launch_file_contents(
             self.get_sbatch_args(max_array),
-            f"srun {srun_args_str} --environment=datatrove -l launch_pickled_pipeline {self.logging_dir.resolve_paths('executor.pik')}",
+            f"srun {srun_args_str} --kill-on-bad-exit=0 --environment=datatrove -l launch_pickled_pipeline {self.logging_dir.resolve_paths('executor.pik')}",
         )
         # save it
         with self.logging_dir.open("launch_script.slurm", "w") as launchscript_f:
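
Note (not part of the patch): a minimal sketch of the srun command line this change produces, assuming an empty srun_args and a hypothetical resolved path for the pickled executor. With --kill-on-bad-exit=0, srun does not terminate the whole job step when a single task exits with a non-zero code, so the remaining tasks keep running.

    # Minimal sketch; srun_args_str and the pickle path below are hypothetical stand-ins
    # for the values normally derived from self.srun_args and self.logging_dir.
    srun_args_str = ""                      # e.g. no extra srun arguments configured
    pipeline_pickle = "/logs/executor.pik"  # hypothetical resolved path

    # --kill-on-bad-exit=0: do not kill the whole job step when one task fails.
    srun_cmd = (
        f"srun {srun_args_str} --kill-on-bad-exit=0 --environment=datatrove "
        f"-l launch_pickled_pipeline {pipeline_pickle}"
    )
    print(srun_cmd)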