diff --git a/megatron/data/data_utils.py b/megatron/data/data_utils.py
index fca024f9e..d366add8a 100644
--- a/megatron/data/data_utils.py
+++ b/megatron/data/data_utils.py
@@ -265,12 +265,15 @@ def get_normalized_weights_and_num_samples(
     weight_sum = sum(weights)
     assert weight_sum > 0.0
     weights = [weight / weight_sum for weight in weights]
-    # Add 0.5% (the 1.005 factor) so in case the blending dataset does
-    # not uniformly distribute the number of samples, we still have
-    # samples left to feed to the network.
-    weighted_num_samples = []
-    for weight in weights:
-        weighted_num_samples.append(int(math.ceil(num_samples * weight * 1.005)))
+    if num_samples is not None:
+        # Add 0.5% (the 1.005 factor) so in case the blending dataset does
+        # not uniformly distribute the number of samples, we still have
+        # samples left to feed to the network.
+        weighted_num_samples = []
+        for weight in weights:
+            weighted_num_samples.append(int(math.ceil(num_samples * weight * 1.005)))
+    else:
+        weighted_num_samples = [None for _ in weights]
     return weights, weighted_num_samples
@@ -467,7 +470,7 @@ def validate_train_epochs(neox_args):
     if neox_args.weight_by_num_documents:
         raise ValueError("Weighting by number of documents is currently unsupported with train_epochs")

-    if not all(weight == 1.0 for weight in neox_args.train_data_weights):
+    if neox_args.train_data_weights and (not all(weight == 1.0 for weight in neox_args.train_data_weights)):
         raise ValueError("train_data_weights != None is currently unsupported with train_epochs")
@@ -610,7 +613,9 @@ def build_train_valid_test_data_loaders(neox_args):
     # Flags to know if we need to do training/validation/testing.
     if neox_args.train_epochs:
-        do_train,do_valid, do_test = train_dataloader, valid_dataloader, test_dataloader
+        do_train = train_dataloader is not None
+        do_valid = valid_dataloader is not None
+        do_test = test_dataloader is not None
     else:
         do_train = train_dataloader is not None and neox_args.train_iters > 0
         do_valid = valid_dataloader is not None and neox_args.eval_iters > 0
diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py
index f4f66b9c8..6b836acca 100644
--- a/megatron/neox_arguments/arguments.py
+++ b/megatron/neox_arguments/arguments.py
@@ -1041,6 +1041,8 @@ def calculate_derived(self):
         )

         if self.optimizer_type.lower() == "onebitadam":
+            assert self.train_iters is not None, "OneBitAdam requires train_iters to be specified"
+
             # onebitadam needs to instantiated by deepspeed, and so we need to pass deepspeed scheduler args
             # for all other optimizers, the scheduling is handled by megatron
             self.scheduler = {
diff --git a/megatron/training.py b/megatron/training.py
index b50b59820..e70ce4fef 100644
--- a/megatron/training.py
+++ b/megatron/training.py
@@ -175,7 +175,7 @@ def update_iterations(neox_args, data_loaders):
     """
     Compute the number of train iterations if not specified and num_epochs, updates the neox_args object.
     Note that if len(train_dataloader) % train_micro_batch_size_per_gpu != 0, this will configure neox
-    to do as many iterations as possible while ensuring that each example is seen at most train_epochs
+    to do as many iterations as possible while ensuring that each example is seen *at most* train_epochs
     times.
     """
     if neox_args.train_iters is not None:
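
For context, here is a minimal standalone sketch of the behaviour the first hunk gives `get_normalized_weights_and_num_samples`: when `num_samples` is `None` (the `train_epochs` path, where no sample total is derived from `train_iters`), the per-dataset sample counts are left as `None` rather than computed from the weights. This is an illustrative copy run in isolation, not the repo's module, and the example weights and sample total below are made up:

```python
import math


def get_normalized_weights_and_num_samples(weights, num_samples):
    # Normalize the raw per-dataset weights so they sum to 1.0.
    weight_sum = sum(weights)
    assert weight_sum > 0.0
    weights = [weight / weight_sum for weight in weights]

    if num_samples is not None:
        # Over-request by 0.5% so a slightly uneven blend still leaves
        # enough samples for every dataset.
        weighted_num_samples = [
            int(math.ceil(num_samples * weight * 1.005)) for weight in weights
        ]
    else:
        # train_epochs path: dataset length, not train_iters, determines
        # how many samples are drawn, so no counts are requested here.
        weighted_num_samples = [None for _ in weights]
    return weights, weighted_num_samples


# Iteration-based run: counts are derived from the requested sample total.
print(get_normalized_weights_and_num_samples([1.0, 3.0], 1000))
# -> ([0.25, 0.75], [252, 754])

# Epoch-based run (train_epochs set, train_iters unset): counts stay None.
print(get_normalized_weights_and_num_samples([1.0, 3.0], None))
# -> ([0.25, 0.75], [None, None])
```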