From 2959091705ca4653da7ea7abbe72d7569b66e347 Mon Sep 17 00:00:00 2001 From: AI_WAIFU Date: Tue, 24 Sep 2024 15:52:10 +0000 Subject: [PATCH] additional checks --- configs/README.md | 2 ++ configs/neox_arguments.md | 9 +++++++++ megatron/data/data_utils.py | 3 +++ megatron/neox_arguments/neox_args.py | 1 + 4 files changed, 15 insertions(+) diff --git a/configs/README.md b/configs/README.md index 71a09ebea..ac20ed89b 100644 --- a/configs/README.md +++ b/configs/README.md @@ -124,6 +124,8 @@ These can be set to any integer between `0` and `num_gpus`, and `num_gpus` must # this should provide some speedup but takes a while to build, set to true if desired "scaled_upper_triang_masked_softmax_fusion": false, "train_iters": 320000, + # alternatively, use train_epochs to automatically determine the number of training iterations + #"train_epochs": 1, ``` An example of some basic settings used to configure your model's architecture and number of training steps. diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index 698e28697..639a5daf2 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -1928,6 +1928,15 @@ Training Arguments +- **train_epochs**: int + + Default = None + + Number of epochs to run for training. Do not specify both train_epochs and train_iters. 
+ Not currently compatible with data reweighting, pairwise datasets, and packing other than 'packed' + + + - **eval_iters**: int Default = 100 diff --git a/megatron/data/data_utils.py b/megatron/data/data_utils.py index 67bf610fa..86fbebc85 100644 --- a/megatron/data/data_utils.py +++ b/megatron/data/data_utils.py @@ -494,6 +494,9 @@ def validate_train_epochs(neox_args): if neox_args.train_data_weights and (not all(weight == 1.0 for weight in neox_args.train_data_weights)): raise ValueError("train_data_weights != None is currently unsupported with train_epochs") + + if neox_args.dataset_impl != "gpt2": + raise ValueError("non gpt2 datasets are currently unsupported with train_epochs") def build_train_valid_test_data_loaders(neox_args): diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index c8c3101e5..82aca952c 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -1193,6 +1193,7 @@ class NeoXArgsTraining(NeoXArgsTemplate): train_epochs: int = None """ Number of epochs to run for training. Do not specify both train_epochs and train_iters. + Not currently compatible with data reweighting, pairwise datasets, and packing other than 'packed' """ eval_iters: int = 100