Skip to content

Commit

Permalink
another commit
Browse files Browse the repository at this point in the history
  • Loading branch information
haileyschoelkopf committed Nov 29, 2023
1 parent 4422b0d commit c87cf19
Showing 1 changed file with 8 additions and 7 deletions.
15 changes: 8 additions & 7 deletions megatron/data/data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -537,18 +537,19 @@ def build_train_valid_test_data_iterators_streaming(neox_args):
# neox_args.test_data_weights, train_val_test_num_samples[2]
# )

# Build one StreamingDataset per split (train/valid/test).
# TODO: assumes only one data source per split — only data_path[0] is used.
ds = []
for split, data_path in zip(
    ["train", "valid", "test"],
    [neox_args.train_data_paths, neox_args.valid_data_paths, neox_args.test_data_paths],
):
    # Remote directory (S3 or local filesystem) where the dataset is stored.
    # NOTE: must be an f-string — the previous plain string left the
    # '{data_path[0]}' placeholder uninterpolated, producing a literal
    # "s3://{data_path[0]}" path.
    remote_dir = f's3://{data_path[0]}'
    # Local directory where the dataset is cached during operation.
    local_dir = f'/tmp/cache-{data_path[0]}/{split}'
    ds.append(
        StreamingDataset(local=local_dir, remote=remote_dir, split=None, shuffle=True)
    )  # TODO: sampler from megatron handles shuffle, right? check this

# Load mosaic streaming datasets from train_data_paths, valid_data_paths, test_data_paths
train_ds, valid_ds, test_ds = ds

# next, make a blended dataset out of the ones we built (estimate how many docs we need from each?)
# TODO: pull up how you do the sampling-proportional-to-weights from Mosaic dataset
Expand Down

0 comments on commit c87cf19

Please sign in to comment.