diff --git a/megatron/data/data_utils.py b/megatron/data/data_utils.py
index fca024f9e..d366add8a 100644
--- a/megatron/data/data_utils.py
+++ b/megatron/data/data_utils.py
@@ -265,12 +265,15 @@ def get_normalized_weights_and_num_samples(
     weight_sum = sum(weights)
     assert weight_sum > 0.0
     weights = [weight / weight_sum for weight in weights]
-    # Add 0.5% (the 1.005 factor) so in case the blending dataset does
-    # not uniformly distribute the number of samples, we still have
-    # samples left to feed to the network.
-    weighted_num_samples = []
-    for weight in weights:
-        weighted_num_samples.append(int(math.ceil(num_samples * weight * 1.005)))
+    if num_samples is not None:
+        # Add 0.5% (the 1.005 factor) so in case the blending dataset does
+        # not uniformly distribute the number of samples, we still have
+        # samples left to feed to the network.
+        weighted_num_samples = []
+        for weight in weights:
+            weighted_num_samples.append(int(math.ceil(num_samples * weight * 1.005)))
+    else:
+        weighted_num_samples = [None for _ in weights]
     return weights, weighted_num_samples
@@ -467,7 +470,7 @@ def validate_train_epochs(neox_args):
     if neox_args.weight_by_num_documents:
         raise ValueError("Weighting by number of documents is currently unsupported with train_epochs")

-    if not all(weight == 1.0 for weight in neox_args.train_data_weights):
+    if neox_args.train_data_weights and (not all(weight == 1.0 for weight in neox_args.train_data_weights)):
         raise ValueError("train_data_weights != None is currently unsupported with train_epochs")
@@ -610,7 +613,9 @@ def build_train_valid_test_data_loaders(neox_args):
     # Flags to know if we need to do training/validation/testing.
     if neox_args.train_epochs:
-        do_train,do_valid, do_test = train_dataloader, valid_dataloader, test_dataloader
+        do_train = train_dataloader is not None
+        do_valid = valid_dataloader is not None
+        do_test = test_dataloader is not None
     else:
         do_train = train_dataloader is not None and neox_args.train_iters > 0
         do_valid = valid_dataloader is not None and neox_args.eval_iters > 0
diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py
index f4f66b9c8..6b836acca 100644
--- a/megatron/neox_arguments/arguments.py
+++ b/megatron/neox_arguments/arguments.py
@@ -1041,6 +1041,8 @@ def calculate_derived(self):
         )

         if self.optimizer_type.lower() == "onebitadam":
+            assert self.train_iters is not None, "OneBitAdam requires train_iters to be specified"
+
             # onebitadam needs to instantiated by deepspeed, and so we need to pass deepspeed scheduler args
             # for all other optimizers, the scheduling is handled by megatron
             self.scheduler = {
diff --git a/megatron/training.py b/megatron/training.py
index b50b59820..e70ce4fef 100644
--- a/megatron/training.py
+++ b/megatron/training.py
@@ -175,7 +175,7 @@ def update_iterations(neox_args, data_loaders):
     """
     Compute the number of train iterations if not specified and num_epochs, updates the neox_args object.
     Note that if len(train_dataloader) % train_micro_batch_size_per_gpu != 0, this will configure neox
-    to do as many iterations as possible while ensuring that each example is seen at most train_epochs
+    to do as many iterations as possible while ensuring that each example is seen *at most* train_epochs
     times.
     """
     if neox_args.train_iters is not None:
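
For context, here is a minimal standalone sketch of the behaviour the first hunk gives `get_normalized_weights_and_num_samples`: when `num_samples` is `None` (the `train_epochs` path, where no sample total is derived from `train_iters`), the per-dataset sample counts are left as `None` rather than computed from the weights. This is an illustrative copy run in isolation, not the repo's module, and the example weights and sample total below are made up:

```python
import math


def get_normalized_weights_and_num_samples(weights, num_samples):
    # Normalize the raw per-dataset weights so they sum to 1.0.
    weight_sum = sum(weights)
    assert weight_sum > 0.0
    weights = [weight / weight_sum for weight in weights]

    if num_samples is not None:
        # Over-request by 0.5% so a slightly uneven blend still leaves
        # enough samples for every dataset.
        weighted_num_samples = [
            int(math.ceil(num_samples * weight * 1.005)) for weight in weights
        ]
    else:
        # train_epochs path: dataset length, not train_iters, determines
        # how many samples are drawn, so no counts are requested here.
        weighted_num_samples = [None for _ in weights]
    return weights, weighted_num_samples


# Iteration-based run: counts are derived from the requested sample total.
print(get_normalized_weights_and_num_samples([1.0, 3.0], 1000))
# -> ([0.25, 0.75], [252, 754])

# Epoch-based run (train_epochs set, train_iters unset): counts stay None.
print(get_normalized_weights_and_num_samples([1.0, 3.0], None))
# -> ([0.25, 0.75], [None, None])
```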