Skip to content

Commit

Permalink
Merge pull request #282 from instructlab/mergify/bp/release-v0.5/pr-217
Browse files: browse the repository at this point in the history
chore: add exit code & tox fix (backport #217)
  • Loading branch information
mergify[bot] authored Oct 21, 2024
2 parents aabd86b + 1a57a86 commit 204cf61
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 6 deletions.
15 changes: 10 additions & 5 deletions src/instructlab/training/main_ds.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -745,6 +745,7 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
print(f"\033[92mRunning training command as subprocess: {' '.join(command)}\033[0m")
process = None
interrupt: KeyboardInterrupt | Exception | None = None
failure = False
try:
process = StreamablePopen(
f"{train_args.ckpt_output_dir}/full_logs_global{torch_args.node_rank}.log",
Expand All @@ -755,19 +756,20 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
print("Training subprocess interrupted by user.")
interrupt = e
except Exception as e:
print(f"An error occurred: {str(e)}")
print("Unexpected exception received during distributed training")
interrupt = e
finally:
if "process" not in locals() or process is None:
return
if process.poll() == 0:
print("\033[92mTraining subprocess exited successfully! 🎉\033[0m")

failure = process.poll() != 0
if not failure:
print("\033[92mOperation completed successfully! 🎉\033[0m")
else:
print(
"\033[91mTraining subprocess has not exited yet. Sending SIGTERM.\033[0m"
)

print("Sending interrupt signal to Training subprocess.")
process.terminate()
try:
print("Waiting for process to exit, 60s...")
Expand All @@ -779,8 +781,11 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
process.kill()

if interrupt:
print(f"Error caught from training subprocess.: {interrupt}")
raise interrupt
if failure:
raise RuntimeError(
"Suffered a failure during distributed training. Please see the training logs for more context."
)


if __name__ == "__main__":
Expand Down
1 change: 0 additions & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,6 @@ commands =
sh -c 'git diff --exit-code || (echo "pyproject.toml formatting is incorrect. Please run \"make toml-fmt\" and commit the changes." && exit 1)'
allowlist_externals = make, sh


[testenv:spellcheck]
description = spell check (needs 'aspell' command)
basepython = {[testenv:py3]basepython}
Expand Down

0 comments on commit 204cf61

Please sign in to comment.