Skip to content

Commit

Permalink
workaround for FSX quota
Browse files (browse the repository at this point in the history)
  • Loading branch information
haileyschoelkopf committed Sep 15, 2023
1 parent e59c873 commit 801192e
Showing 1 changed file with 6 additions and 6 deletions.
12 changes: 6 additions & 6 deletions megatron/checkpointing.py
Original file line number | Diff line number | Diff line change
Expand Up
@@ -327,19 +327,19 @@ def save_checkpoint(neox_args, iteration, model, optimizer, lr_scheduler):
raise ValueError("Must be using deepspeed to use neox")

torch.distributed.barrier()
upload_to_s3 = torch.distributed.get_rank() == 0 and neox_args.s3_path is not None
if upload_to_s3:
upload_checkpoint(iteration, neox_args)

# Wait so everyone is done (necessary)
torch.distributed.barrier()
if neox_args.keep_last_n_checkpoints is not None:
delete_old_checkpoints(neox_args.save, neox_args.keep_last_n_checkpoints)

# Wait so everyone is done (not necessary)
torch.distributed.barrier()
upload_to_s3 = torch.distributed.get_rank() == 0 and neox_args.s3_path is not None
if upload_to_s3:
upload_checkpoint(iteration, neox_args)


# Wait so everyone is done (necessary)
torch.distributed.barrier()

def load_checkpoint(
neox_args, model, optimizer, lr_scheduler, inference=False, iteration=None
):
Expand Down

0 comments on commit 801192e

Please sign in to comment.