diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py
index 77c577054..4e13afd2d 100644
--- a/megatron/neox_arguments/arguments.py
+++ b/megatron/neox_arguments/arguments.py
@@ -1054,6 +1054,12 @@ def calculate_derived(self):
         if self.test_data_paths and (self.test_data_weights is None):
             self.test_data_weights = [1.0] * len(self.test_data_paths)
 
+        if self.label_data_paths:
+            err_str = (
+                "Must use `label_data_paths` with `train_data_paths`, not `data_path`"
+            )
+            assert self.train_data_paths and not self.data_path, err_str
+
         # if a sample input file is provided, default text_gen_type type to input-file
         if self.text_gen_type is None:
             if self.sample_input_file:
diff --git a/tools/datasets/README.md b/tools/datasets/README.md
index 0f4c382e4..9b459e1e3 100644
--- a/tools/datasets/README.md
+++ b/tools/datasets/README.md
@@ -1,6 +1,6 @@
 # Data Scripts
 
-## `preprocess_data.py`
+## `preprocess_data.py`
 Takes a raw dataset, splits it up, tokenizes it, and saves it as numpy files that can be memmapped and used efficiently by the training code.
 
 ```
@@ -42,9 +42,20 @@ runtime:
   --log-interval LOG_INTERVAL
                         Interval between progress updates
 ```
-## `preprocess_data_with_mask.py`
+## `preprocess_data_with_mask.py`
 Does the same but also creates `label` tensors if the dataset has labels.
+N.B. If using this, you **must** specify your data when training/finetuning with the following configs:
+```json
+"train_data_paths": ["train_documents"],
+"test_data_paths": ["test_documents"],
+"valid_data_paths": ["test_documents"],
+"label_data_paths": ["label_documents"]
+```
+
+The `"data_path"` option will not work with `"label_data_paths"`.
+
+
 ```
 usage: preprocess_data_with_mask.py [-h] --input INPUT [--jsonl-keys JSONL_KEYS [JSONL_KEYS ...]]
                                     [--mask-before-token MASK_BEFORE_TOKEN] [--num-docs NUM_DOCS]
                                     --tokenizer-type
@@ -87,7 +98,7 @@ runtime:
   --log-interval LOG_INTERVAL
                         Interval between progress updates
 ```
-## `multinode_prepare_data.sh`
+## `multinode_prepare_data.sh`
 Does the same but distributed over multiple nodes.
 
 ```
@@ -103,5 +114,5 @@ Does the same but distributed over multiple nodes.
 ```
-## `corpora.py`
+## `corpora.py`
 Has information for common datasets. Primarily meant for use in top-level `prepare_data.py` script.
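
For context on the check added to `calculate_derived` above, here is a minimal standalone sketch of the rule the assertion enforces. The helper function and the example config values are hypothetical and exist only to illustrate which combinations pass or trip the assertion:

```python
# Sketch of the validation rule added in calculate_derived(); the helper name
# and example values are hypothetical and not part of the NeoX codebase.
def check_label_data_paths(train_data_paths, data_path, label_data_paths):
    if label_data_paths:
        err_str = (
            "Must use `label_data_paths` with `train_data_paths`, not `data_path`"
        )
        assert train_data_paths and not data_path, err_str


# Passes: labels are paired with explicit per-split data paths.
check_label_data_paths(
    train_data_paths=["train_documents"],
    data_path=None,
    label_data_paths=["label_documents"],
)

# Raises AssertionError: labels cannot be combined with the single `data_path` option.
try:
    check_label_data_paths(
        train_data_paths=None,
        data_path="data/mydataset_text_document",
        label_data_paths=["label_documents"],
    )
except AssertionError as e:
    print(e)
```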