From 2959091705ca4653da7ea7abbe72d7569b66e347 Mon Sep 17 00:00:00 2001 From: AI_WAIFU Date: Tue, 24 Sep 2024 15:52:10 +0000 Subject: [PATCH] additional checks --- configs/README.md | 2 ++ configs/neox_arguments.md | 9 +++++++++ megatron/data/data_utils.py | 3 +++ megatron/neox_arguments/neox_args.py | 1 + 4 files changed, 15 insertions(+) diff --git a/configs/README.md b/configs/README.md index 71a09ebea..ac20ed89b 100644 --- a/configs/README.md +++ b/configs/README.md @@ -124,6 +124,8 @@ These can be set to any integer between `0` and `num_gpus`, and `num_gpus` must # this should provide some speedup but takes a while to build, set to true if desired "scaled_upper_triang_masked_softmax_fusion": false, "train_iters": 320000, + # alternatively, use train_epochs to automatically determine the number of training iterations + #"train_epochs": 1, ``` An example of some basic settings used to configure your model's architecture and number of training steps. diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index 698e28697..639a5daf2 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -1928,6 +1928,15 @@ Training Arguments +- **train_epochs**: int + + Default = None + + Number of epochs to run for training. Do not specify both train_epochs and train_iters. 
+ Not currently compatible with data reweighting, pairwise datasets, and packing other than 'packed' + + + - **eval_iters**: int Default = 100 diff --git a/megatron/data/data_utils.py b/megatron/data/data_utils.py index 67bf610fa..86fbebc85 100644 --- a/megatron/data/data_utils.py +++ b/megatron/data/data_utils.py @@ -494,6 +494,9 @@ def validate_train_epochs(neox_args): if neox_args.train_data_weights and (not all(weight == 1.0 for weight in neox_args.train_data_weights)): raise ValueError("train_data_weights != None is currently unsupported with train_epochs") + + if neox_args.dataset_impl != "gpt2": + raise ValueError("non gpt2 datasets are currently unsupported with train_epochs") def build_train_valid_test_data_loaders(neox_args): diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index c8c3101e5..82aca952c 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -1193,6 +1193,7 @@ class NeoXArgsTraining(NeoXArgsTemplate): train_epochs: int = None """ Number of epochs to run for training. Do not specify both train_epochs and train_iters. + Not currently compatible with data reweighting, pairwise datasets, and packing other than 'packed' """ eval_iters: int = 100