Commit

working single path case

AI-WAIFU committed Sep 23, 2024
1 parent e15912c commit 1d57ecc
Showing 3 changed files with 16 additions and 9 deletions.
21 changes: 13 additions & 8 deletions megatron/data/data_utils.py
@@ -265,12 +265,15 @@ def get_normalized_weights_and_num_samples(
     weight_sum = sum(weights)
     assert weight_sum > 0.0
     weights = [weight / weight_sum for weight in weights]
-    # Add 0.5% (the 1.005 factor) so in case the blending dataset does
-    # not uniformly distribute the number of samples, we still have
-    # samples left to feed to the network.
-    weighted_num_samples = []
-    for weight in weights:
-        weighted_num_samples.append(int(math.ceil(num_samples * weight * 1.005)))
+    if num_samples is not None:
+        # Add 0.5% (the 1.005 factor) so in case the blending dataset does
+        # not uniformly distribute the number of samples, we still have
+        # samples left to feed to the network.
+        weighted_num_samples = []
+        for weight in weights:
+            weighted_num_samples.append(int(math.ceil(num_samples * weight * 1.005)))
+    else:
+        weighted_num_samples = [None for _ in weights]
     return weights, weighted_num_samples
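
For context, this change lets num_samples=None flow through instead of crashing on math.ceil(None * weight * 1.005) in the epoch-driven path. A minimal standalone sketch of the resulting behavior (a simplified reconstruction, not the full NeoX function):

import math

def get_normalized_weights_and_num_samples(weights, num_samples):
    # Normalize the raw weights so they sum to 1.0.
    weight_sum = sum(weights)
    assert weight_sum > 0.0
    weights = [w / weight_sum for w in weights]
    if num_samples is not None:
        # Over-request by 0.5% so blending never runs short of samples.
        weighted = [int(math.ceil(num_samples * w * 1.005)) for w in weights]
    else:
        # Epoch-based training: per-dataset sample counts are decided later.
        weighted = [None for _ in weights]
    return weights, weighted

print(get_normalized_weights_and_num_samples([1.0, 3.0], 1000))
# ([0.25, 0.75], [252, 754])
print(get_normalized_weights_and_num_samples([1.0, 3.0], None))
# ([0.25, 0.75], [None, None])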


@@ -467,7 +470,7 @@ def validate_train_epochs(neox_args):
     if neox_args.weight_by_num_documents:
         raise ValueError("Weighting by number of documents is currently unsupported with train_epochs")
 
-    if not all(weight == 1.0 for weight in neox_args.train_data_weights):
+    if neox_args.train_data_weights and (not all(weight == 1.0 for weight in neox_args.train_data_weights)):
         raise ValueError("train_data_weights != None is currently unsupported with train_epochs")


@@ -610,7 +613,9 @@ def build_train_valid_test_data_loaders(neox_args):
 
     # Flags to know if we need to do training/validation/testing.
     if neox_args.train_epochs:
-        do_train,do_valid, do_test = train_dataloader, valid_dataloader, test_dataloader
+        do_train = train_dataloader is not None
+        do_valid = valid_dataloader is not None
+        do_test = test_dataloader is not None
     else:
         do_train = train_dataloader is not None and neox_args.train_iters > 0
         do_valid = valid_dataloader is not None and neox_args.eval_iters > 0
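
The removed line used tuple unpacking, so the do_* "flags" ended up holding the dataloader objects themselves (or None) rather than booleans. Any downstream consumer that treats the flags as integers, e.g. when packing them for a broadcast across ranks, would then fail; the pack_flags consumer below is hypothetical but shows the difference:

# Hypothetical downstream consumer: flags packed into ints for a broadcast.
def pack_flags(do_train, do_valid, do_test):
    return [int(do_train), int(do_valid), int(do_test)]

train_dataloader, valid_dataloader, test_dataloader = object(), object(), None

# Old (buggy): flags are dataloader objects / None, not booleans;
# pack_flags(...) would raise TypeError on int(object()) or int(None).
# New: explicit boolean flags.
do_train = train_dataloader is not None
do_valid = valid_dataloader is not None
do_test = test_dataloader is not None
print(pack_flags(do_train, do_valid, do_test))  # [1, 1, 0]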
2 changes: 2 additions & 0 deletions megatron/neox_arguments/arguments.py
@@ -1041,6 +1041,8 @@ def calculate_derived(self):
         )
 
         if self.optimizer_type.lower() == "onebitadam":
+            assert self.train_iters is not None, "OneBitAdam requires train_iters to be specified"
+
             # onebitadam needs to instantiated by deepspeed, and so we need to pass deepspeed scheduler args
             # for all other optimizers, the scheduling is handled by megatron
             self.scheduler = {
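
The new assertion fails fast when epoch-driven training leaves train_iters unset, since the DeepSpeed-side scheduler config assembled just below needs a concrete step count. A minimal sketch of the guard in isolation (NeoXArgsStub is hypothetical):

class NeoXArgsStub:
    optimizer_type = "OneBitAdam"
    train_iters = None  # left unset when training is epoch-driven

args = NeoXArgsStub()
if args.optimizer_type.lower() == "onebitadam":
    # Raises immediately with a clear message instead of passing None
    # into the scheduler configuration later in calculate_derived().
    assert args.train_iters is not None, "OneBitAdam requires train_iters to be specified"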
2 changes: 1 addition & 1 deletion megatron/training.py
@@ -175,7 +175,7 @@ def update_iterations(neox_args, data_loaders):
     """
     Compute the number of train iterations if not specified and num_epochs, updates the neox_args object.
     Note that if len(train_dataloader) % train_micro_batch_size_per_gpu != 0, this will configure neox
-    to do as many iterations as possible while ensuring that each example is seen at most train_epochs
+    to do as many iterations as possible while ensuring that each example is seen *at most* train_epochs
     times.
     """
     if neox_args.train_iters is not None:
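
As orientation for the docstring, the epoch-to-iteration conversion it describes can be sketched as follows; this is an illustrative reconstruction under assumed names, not the actual body of update_iterations:

def iters_from_epochs(batches_per_epoch, train_epochs, gradient_accumulation_steps):
    # Floor division drops any partial final step, so each example is
    # seen *at most* train_epochs times, matching the docstring.
    return (batches_per_epoch * train_epochs) // gradient_accumulation_steps

print(iters_from_epochs(batches_per_epoch=1001, train_epochs=2,
                        gradient_accumulation_steps=4))  # 500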
