From 1a57a8691df43bfab6c98e263fbf6b524a3941e5 Mon Sep 17 00:00:00 2001 From: Oleg S <97077423+RobotSail@users.noreply.github.com> Date: Thu, 19 Sep 2024 11:44:56 -0400 Subject: [PATCH] chore: add exit code & tox fix Currently, the training library does not exit when an error is encountered within the training loop (invoked through torchrun). This commit updates that functionality so we correctly return an exit code of 1 on child failure. Additionally, this commit adds the `make fix` command, which automatically fixes all trivial issues reported by ruff. Signed-off-by: Oleg S <97077423+RobotSail@users.noreply.github.com> (cherry picked from commit 9c899dcc0d85c95e267468bd30d006a0bb4f423c) --- src/instructlab/training/main_ds.py | 15 ++++++++++----- tox.ini | 1 - 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py index bd99d80b..6999eeac 100644 --- a/src/instructlab/training/main_ds.py +++ b/src/instructlab/training/main_ds.py @@ -745,6 +745,7 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None: print(f"\033[92mRunning training command as subprocess: {' '.join(command)}\033[0m") process = None interrupt: KeyboardInterrupt | Exception | None = None + failure = False try: process = StreamablePopen( f"{train_args.ckpt_output_dir}/full_logs_global{torch_args.node_rank}.log", @@ -755,19 +756,20 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None: print("Training subprocess interrupted by user.") interrupt = e except Exception as e: - print(f"An error occurred: {str(e)}") + print("Unexpected exception received during distributed training") interrupt = e finally: if "process" not in locals() or process is None: return - if process.poll() == 0: - print("\033[92mTraining subprocess exited successfully! 🎉\033[0m") + + failure = process.poll() != 0 + if not failure: + print("\033[92mOperation completed successfully! 
🎉\033[0m") else: print( "\033[91mTraining subprocess has not exited yet. Sending SIGTERM.\033[0m" ) - print("Sending interrupt signal to Training subprocess.") process.terminate() try: print("Waiting for process to exit, 60s...") @@ -779,8 +781,11 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None: process.kill() if interrupt: - print(f"Error caught from training subprocess.: {interrupt}") raise interrupt + if failure: + raise RuntimeError( + "Suffered a failure during distributed training. Please see the training logs for more context." + ) if __name__ == "__main__": diff --git a/tox.ini b/tox.ini index b6b625cd..5756b665 100644 --- a/tox.ini +++ b/tox.ini @@ -66,7 +66,6 @@ commands = sh -c 'git diff --exit-code || (echo "pyproject.toml formatting is incorrect. Please run \"make toml-fmt\" and commit the changes." && exit 1)' allowlist_externals = make, sh - [testenv:spellcheck] description = spell check (needs 'aspell' command) basepython = {[testenv:py3]basepython}