
Commit

updated README, file types
vidyasiv committed Jun 13, 2024
1 parent 8577e5a commit 4871d58
Showing 2 changed files with 60 additions and 7 deletions.
40 changes: 40 additions & 0 deletions examples/language-modeling/README.md
@@ -655,6 +655,46 @@ DEEPSPEED_HPU_ZERO3_SYNC_MARK_STEP_REQUIRED=1 LOWER_LIST=ops_bf16.txt python3 ..
```
Default `peft_type` is `lora`; you can enable AdaLoRA or IA3 with `--peft_type adalora` or `--peft_type ia3`.

#### Custom Files

To run on your own training and validation files, use the following command:

```bash
python run_lora_clm.py \
--model_name_or_path bigcode/starcoder \
--train_file path_to_train_file \
--validation_file path_to_validation_file \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 8 \
--do_train \
--do_eval \
--output_dir /tmp/test-lora-clm \
--bf16 \
--use_habana \
--use_lazy_mode \
--use_hpu_graphs_for_inference \
--dataset_concatenation \
--throughput_warmup_steps 3
```

The JSON Lines files (with extension `.json` or `.jsonl`) are expected to have the following format:

```json
{"text": "<text>"}
{"text": "<text>"}
{"text": "<text>"}
{"text": "<text>"}
```
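Each line of such a file must parse independently as a JSON object with a `text` key. A minimal, stdlib-only sketch of writing and validating a file in this format (the file name `train.json` and the sample strings are illustrative only):

```python
import json
import os
import tempfile

# Write a small JSON Lines file in the expected format.
samples = [
    {"text": "def hello():"},
    {"text": "    print('hello world')"},
]
path = os.path.join(tempfile.mkdtemp(), "train.json")
with open(path, "w") as f:
    for sample in samples:
        f.write(json.dumps(sample) + "\n")

# Validate: every line must parse on its own and contain a "text" field.
with open(path) as f:
    records = [json.loads(line) for line in f]
assert all("text" in r for r in records)
print(len(records))  # 2
```

The resulting path can then be passed via `--train_file` or `--validation_file`.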

The text files (with extension `.text` or `.txt`) are expected to have the following format:

```text
"<text>"
"<text>"
"<text>"
"<text>"
```
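Under the hood, the script chooses a Hugging Face `datasets` builder from the data file's extension. A minimal sketch of that mapping (the helper name `builder_for` is ours, not the script's, and `.text` handling is assumed from the note above):

```python
def builder_for(filename: str) -> str:
    """Map a data file's extension to a datasets builder name.

    Mirrors the script's handling: .txt/.text files use the "text"
    builder, and both .json and .jsonl use the "json" builder.
    """
    extension = filename.rsplit(".", 1)[-1].lower()
    if extension in ("txt", "text"):
        return "text"
    if extension in ("json", "jsonl"):
        return "json"
    raise ValueError(f"Unsupported data file extension: {extension}")

print(builder_for("train.jsonl"))  # json
print(builder_for("valid.txt"))   # text
```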

### Prompt/Prefix/P-tuning

To run prompt tuning finetuning, you can use `run_prompt_tuning_clm.py`.
27 changes: 20 additions & 7 deletions examples/language-modeling/run_lora_clm.py
@@ -236,6 +236,9 @@ class DataArguments:
default=False,
metadata={"help": "Whether to keep in memory the loaded dataset. Defaults to False."},
)
keep_linebreaks: bool = field(
default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."}
)
dataset_seed: int = field(
default=42,
metadata={
@@ -552,6 +555,8 @@ def main():
if extension == "txt":
extension = "text"
dataset_args["keep_linebreaks"] = data_args.keep_linebreaks
if extension in ("json", "jsonl"):
extension = "json"
raw_datasets = load_dataset(
extension,
data_files=data_files,
@@ -560,18 +565,26 @@
**dataset_args,
)

if data_args.train_file and training_args.do_train:
print([x for x in raw_datasets])
# For --do_train and --do_eval
if (data_args.train_file and training_args.do_train) or (data_args.validation_file and training_args.do_eval):
raw_datasets = raw_datasets.map(
lambda x: {
"input": "",
"output": x["text"],
}
)
# Remove unused columns.
raw_datasets = raw_datasets.remove_columns(
[col for col in raw_datasets.column_names["train"] if col not in ["input", "output"]]
)

if training_args.do_train:
# Remove unused columns.
raw_datasets = raw_datasets.remove_columns(
[col for col in raw_datasets.column_names["train"] if col not in ["input", "output"]]
)

if training_args.do_eval:
# Remove unused columns.
raw_datasets = raw_datasets.remove_columns(
[col for col in raw_datasets.column_names["validation"] if col not in ["input", "output"]]
)
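The `map` call above rewrites each `{"text": ...}` record into the `{"input": "", "output": ...}` schema before unused columns are dropped. A stdlib-only sketch of the same reshaping (the sample records are illustrative, not from the script):

```python
# Reshape {"text": ...} records into {"input": "", "output": ...} and
# drop any other columns, mirroring what the script does with
# datasets.map and remove_columns.
raw = [
    {"text": "first sample", "id": 0},
    {"text": "second sample", "id": 1},
]

mapped = [{"input": "", "output": row["text"]} for row in raw]
print(mapped[0])  # {'input': '', 'output': 'first sample'}
```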

# If no validation data is there, validation_split_percentage will be used to divide the dataset.
if "validation" not in raw_datasets.keys() and training_args.do_eval:
@@ -867,7 +880,7 @@ def compute_metrics(eval_preds):
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)

# Evaluation
if training_args.do_eval:
logger.info("*** Evaluate ***")
metrics = trainer.evaluate()
