From f6ac04da780ed30af0bff440b897ae596eb2bc51 Mon Sep 17 00:00:00 2001
From: Dashiell Stander
Date: Tue, 3 Oct 2023 20:03:06 -0400
Subject: [PATCH] Add documentation about using labelled datasets (#1056)

* Add documentation and an informative error

Signed-off-by: Dashiell Stander

* Update NeoXArgs docs automatically

---------

Signed-off-by: Dashiell Stander
Co-authored-by: github-actions
---
 configs/neox_arguments.md            |  2 +-
 megatron/neox_arguments/arguments.py |  6 ++++++
 tools/datasets/README.md             | 19 +++++++++++++++----
 3 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md
index 6ba7a58bf..3b4b253f7 100644
--- a/configs/neox_arguments.md
+++ b/configs/neox_arguments.md
@@ -111,7 +111,7 @@ Logging Arguments

 - **git_hash**: str

-  Default = a0cf0e8
+  Default = ec71f71

   current git hash of repository

diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py
index 77c577054..4e13afd2d 100644
--- a/megatron/neox_arguments/arguments.py
+++ b/megatron/neox_arguments/arguments.py
@@ -1054,6 +1054,12 @@ def calculate_derived(self):
         if self.test_data_paths and (self.test_data_weights is None):
             self.test_data_weights = [1.0] * len(self.test_data_paths)

+        if self.label_data_paths:
+            err_str = (
+                "Must use `label_data_paths` with `train_data_paths`, not `data_path`"
+            )
+            assert self.train_data_paths and not self.data_path, err_str
+
         # if a sample input file is provided, default text_gen_type type to input-file
         if self.text_gen_type is None:
             if self.sample_input_file:
diff --git a/tools/datasets/README.md b/tools/datasets/README.md
index 0f4c382e4..9b459e1e3 100644
--- a/tools/datasets/README.md
+++ b/tools/datasets/README.md
@@ -1,6 +1,6 @@
 # Data Scripts

-## `preprocess_data.py` 
+## `preprocess_data.py`
 Takes a raw dataset, splits it up, tokenizes it, and saves it as numpy files that can be memmapped and used efficiently by the training code.

 ```
@@ -42,9 +42,20 @@ runtime:
   --log-interval LOG_INTERVAL
                         Interval between progress updates
 ```

-## `preprocess_data_with_mask.py`
+## `preprocess_data_with_mask.py`
 Does the same but also creates `label` tensors if the dataset has labels.
+N.B. If using this, you **must** specify your data when training/finetuning with the following configs:
+```json
+"train_data_paths": ["train_documents"],
+"test_data_paths": ["test_documents"],
+"valid_data_paths": ["test_documents"],
+"label_data_paths": ["label_documents"]
+```
+
+The `"data_path"` option will not work with `"label_data_paths"`.
+
+
 ```
 usage: preprocess_data_with_mask.py [-h] --input INPUT [--jsonl-keys JSONL_KEYS [JSONL_KEYS ...]]
                                     [--mask-before-token MASK_BEFORE_TOKEN] [--num-docs NUM_DOCS] --tokenizer-type
@@ -87,7 +98,7 @@ runtime:
   --log-interval LOG_INTERVAL
                         Interval between progress updates
 ```

-## `multinode_prepare_data.sh`
+## `multinode_prepare_data.sh`
 Does the same but distributed over multiple nodes.
 ```
@@ -103,5 +114,5 @@ Does the same but distributed over multiple nodes.
 ```

-## `corpora.py`
+## `corpora.py`
 Has information for common datasets. Primarily meant for use in top-level `prepare_data.py` script.

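As a quick illustration of the rule the new assert in `calculate_derived()` enforces, here is a minimal standalone sketch. The helper name `check_label_config` and the plain-dict configs are hypothetical (real NeoX configs are parsed into a NeoXArgs object, not dicts); only the rule itself comes from the patch: `label_data_paths` must be paired with `train_data_paths` and cannot be combined with the single `data_path` option.

```python
# Standalone sketch of the config rule added in this patch.
# `check_label_config` is an assumed name, not part of the NeoX API.

def check_label_config(cfg: dict) -> None:
    # Labelled data requires explicit train/label path lists, not `data_path`.
    if cfg.get("label_data_paths"):
        assert cfg.get("train_data_paths") and not cfg.get("data_path"), (
            "Must use `label_data_paths` with `train_data_paths`, not `data_path`"
        )

# Passes: label paths specified alongside explicit train paths.
check_label_config(
    {
        "train_data_paths": ["train_documents"],
        "label_data_paths": ["label_documents"],
    }
)

# Rejected: `data_path` combined with `label_data_paths`.
try:
    check_label_config(
        {
            "data_path": "all_documents",
            "label_data_paths": ["label_documents"],
        }
    )
except AssertionError as err:
    print(f"rejected: {err}")
```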