diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py
index 77c577054..4e13afd2d 100644
--- a/megatron/neox_arguments/arguments.py
+++ b/megatron/neox_arguments/arguments.py
@@ -1054,6 +1054,12 @@ def calculate_derived(self):
         if self.test_data_paths and (self.test_data_weights is None):
             self.test_data_weights = [1.0] * len(self.test_data_paths)
 
+        if self.label_data_paths:
+            err_str = (
+                "Must use `label_data_paths` with `train_data_paths`, not `data_path`"
+            )
+            assert self.train_data_paths and not self.data_path, err_str
+
         # if a sample input file is provided, default text_gen_type type to input-file
         if self.text_gen_type is None:
             if self.sample_input_file:
diff --git a/tools/datasets/README.md b/tools/datasets/README.md
index 0f4c382e4..9b459e1e3 100644
--- a/tools/datasets/README.md
+++ b/tools/datasets/README.md
@@ -1,6 +1,6 @@
 # Data Scripts
 
-## `preprocess_data.py`
+## `preprocess_data.py`
 Takes a raw dataset, splits it up, tokenizes it, and saves it as numpy files that can be memmapped and used efficiently by the training code.
 
 ```
@@ -42,9 +42,20 @@ runtime:
   --log-interval LOG_INTERVAL
                         Interval between progress updates
 ```
-## `preprocess_data_with_mask.py`
+## `preprocess_data_with_mask.py`
 Does the same but also creates `label` tensors if the dataset has labels.
+N.B. If using this, you **must** specify your data when training/finetuning with the following configs:
+```json
+"train_data_paths": ["train_documents"],
+"test_data_paths": ["test_documents"],
+"valid_data_paths": ["test_documents"],
+"label_data_paths": ["label_documents"]
+```
+
+The `"data_path"` option will not work with `"label_data_paths"`.
+
+
 ```
 usage: preprocess_data_with_mask.py [-h] --input INPUT [--jsonl-keys JSONL_KEYS [JSONL_KEYS ...]]
                                     [--mask-before-token MASK_BEFORE_TOKEN] [--num-docs NUM_DOCS]
                                     --tokenizer-type
@@ -87,7 +98,7 @@ runtime:
   --log-interval LOG_INTERVAL
                         Interval between progress updates
 ```
-## `multinode_prepare_data.sh`
+## `multinode_prepare_data.sh`
 Does the same but distributed over multiple nodes.
 
 ```
@@ -103,5 +114,5 @@ Does the same but distributed over multiple nodes.
 ```
-## `corpora.py`
+## `corpora.py`
 Has information for common datasets. Primarily meant for use in top-level `prepare_data.py` script.
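
For context on the check added to `calculate_derived` above, here is a minimal standalone sketch of the rule the assertion enforces. The helper function and the example config values are hypothetical and exist only to illustrate which combinations pass or trip the assertion:

```python
# Sketch of the validation rule added in calculate_derived(); the helper name
# and example values are hypothetical and not part of the NeoX codebase.
def check_label_data_paths(train_data_paths, data_path, label_data_paths):
    if label_data_paths:
        err_str = (
            "Must use `label_data_paths` with `train_data_paths`, not `data_path`"
        )
        assert train_data_paths and not data_path, err_str


# Passes: labels are paired with explicit per-split data paths.
check_label_data_paths(
    train_data_paths=["train_documents"],
    data_path=None,
    label_data_paths=["label_documents"],
)

# Raises AssertionError: labels cannot be combined with the single `data_path` option.
try:
    check_label_data_paths(
        train_data_paths=None,
        data_path="data/mydataset_text_document",
        label_data_paths=["label_documents"],
    )
except AssertionError as e:
    print(e)
```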