From f6ac04da780ed30af0bff440b897ae596eb2bc51 Mon Sep 17 00:00:00 2001
From: Dashiell Stander
Date: Tue, 3 Oct 2023 20:03:06 -0400
Subject: [PATCH] Add documentation about using labelled datasets (#1056)

* Add documentation and an informative error

Signed-off-by: Dashiell Stander

* Update NeoXArgs docs automatically

---------

Signed-off-by: Dashiell Stander
Co-authored-by: github-actions
---
 configs/neox_arguments.md            |  2 +-
 megatron/neox_arguments/arguments.py |  6 ++++++
 tools/datasets/README.md             | 19 +++++++++++++++----
 3 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md
index 6ba7a58bf..3b4b253f7 100644
--- a/configs/neox_arguments.md
+++ b/configs/neox_arguments.md
@@ -111,7 +111,7 @@ Logging Arguments

 - **git_hash**: str

-  Default = a0cf0e8
+  Default = ec71f71

   current git hash of repository

diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py
index 77c577054..4e13afd2d 100644
--- a/megatron/neox_arguments/arguments.py
+++ b/megatron/neox_arguments/arguments.py
@@ -1054,6 +1054,12 @@ def calculate_derived(self):
         if self.test_data_paths and (self.test_data_weights is None):
             self.test_data_weights = [1.0] * len(self.test_data_paths)

+        if self.label_data_paths:
+            err_str = (
+                "Must use `label_data_paths` with `train_data_paths`, not `data_path`"
+            )
+            assert self.train_data_paths and not self.data_path, err_str
+
         # if a sample input file is provided, default text_gen_type type to input-file
         if self.text_gen_type is None:
             if self.sample_input_file:
diff --git a/tools/datasets/README.md b/tools/datasets/README.md
index 0f4c382e4..9b459e1e3 100644
--- a/tools/datasets/README.md
+++ b/tools/datasets/README.md
@@ -1,6 +1,6 @@
 # Data Scripts

-## `preprocess_data.py` 
+## `preprocess_data.py`
 Takes a raw dataset, splits it up, tokenizes it, and saves it as numpy files that can be memmapped and used efficiently by the training code.

 ```
@@ -42,9 +42,20 @@ runtime:
   --log-interval LOG_INTERVAL
                         Interval between progress updates
 ```

-## `preprocess_data_with_mask.py`
+## `preprocess_data_with_mask.py`
 Does the same but also creates `label` tensors if the dataset has labels.
+N.B. If using this, you **must** specify your data when training/finetuning with the following configs:
+```json
+"train_data_paths": ["train_documents"],
+"test_data_paths": ["test_documents"],
+"valid_data_paths": ["test_documents"],
+"label_data_paths": ["label_documents"]
+```
+
+The `"data_path"` option will not work with `"label_data_paths"`.
+
+
 ```
 usage: preprocess_data_with_mask.py [-h] --input INPUT [--jsonl-keys JSONL_KEYS [JSONL_KEYS ...]]
                                     [--mask-before-token MASK_BEFORE_TOKEN] [--num-docs NUM_DOCS] --tokenizer-type
@@ -87,7 +98,7 @@ runtime:
   --log-interval LOG_INTERVAL
                         Interval between progress updates
 ```

-## `multinode_prepare_data.sh`
+## `multinode_prepare_data.sh`
 Does the same but distributed over multiple nodes.
 ```
@@ -103,5 +114,5 @@ Does the same but distributed over multiple nodes.
 ```

-## `corpora.py`
+## `corpora.py`
 Has information for common datasets. Primarily meant for use in top-level `prepare_data.py` script.

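As a quick illustration of the rule the new assert in `calculate_derived()` enforces, here is a minimal standalone sketch. The helper name `check_label_config` and the plain-dict configs are hypothetical (real NeoX configs are parsed into a NeoXArgs object, not dicts); only the rule itself comes from the patch: `label_data_paths` must be paired with `train_data_paths` and cannot be combined with the single `data_path` option.

```python
# Standalone sketch of the config rule added in this patch.
# `check_label_config` is an assumed name, not part of the NeoX API.

def check_label_config(cfg: dict) -> None:
    # Labelled data requires explicit train/label path lists, not `data_path`.
    if cfg.get("label_data_paths"):
        assert cfg.get("train_data_paths") and not cfg.get("data_path"), (
            "Must use `label_data_paths` with `train_data_paths`, not `data_path`"
        )

# Passes: label paths specified alongside explicit train paths.
check_label_config(
    {
        "train_data_paths": ["train_documents"],
        "label_data_paths": ["label_documents"],
    }
)

# Rejected: `data_path` combined with `label_data_paths`.
try:
    check_label_config(
        {
            "data_path": "all_documents",
            "label_data_paths": ["label_documents"],
        }
    )
except AssertionError as err:
    print(f"rejected: {err}")
```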