Commit

Merge branch 'main' into cohere

XinyuYe-Intel authored Sep 26, 2024
2 parents 532c2f9 + a4dbcff commit e6915ed
Showing 32 changed files with 1,131 additions and 327 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -199,6 +199,7 @@ The following model architectures, tasks and device distributions have been validated
| BLOOM(Z) | | <div style="text-align:left"><li>DeepSpeed</li></div> | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
| StarCoder / StarCoder2 | :heavy_check_mark: | <div style="text-align:left"><li>Single card</li></div> | <li>[language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)</li><li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
| GPT-J | <div style="text-align:left"><li>DeepSpeed</li></div> | <div style="text-align:left"><li>Single card</li><li>DeepSpeed</li></div> | <li>[language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)</li><li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
| GPT-Neo | | <div style="text-align:left"><li>Single card</li></div> | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
| GPT-NeoX | <div style="text-align:left"><li>DeepSpeed</li></div> | <div style="text-align:left"><li>DeepSpeed</li></div> | <li>[language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)</li><li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
| OPT | | <div style="text-align:left"><li>DeepSpeed</li></div> | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
| Llama 2 / CodeLlama / Llama 3 / Llama Guard / Granite | :heavy_check_mark: | :heavy_check_mark: | <li>[language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)</li><li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li><li>[question answering](https://github.com/huggingface/optimum-habana/tree/main/examples/question-answering)</li><li>[text classification](https://github.com/huggingface/optimum-habana/tree/main/examples/text-classification) (Llama Guard)</li> |
1 change: 1 addition & 0 deletions docs/source/index.mdx
@@ -45,6 +45,7 @@ In the tables below, ✅ means single-card, multi-card and DeepSpeed have all been validated
| BLOOM(Z) | | <div style="text-align:left"><li>DeepSpeed</li></div> | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
| StarCoder / StarCoder2 | ✅ | <div style="text-align:left"><li>Single card</li></div> | <li>[language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)</li><li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
| GPT-J | <div style="text-align:left"><li>DeepSpeed</li></div> | <div style="text-align:left"><li>Single card</li><li>DeepSpeed</li></div> | <li>[language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)</li><li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
| GPT-Neo | | <div style="text-align:left"><li>Single card</li></div> | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
| GPT-NeoX | <div style="text-align:left"><li>DeepSpeed</li></div> | <div style="text-align:left"><li>DeepSpeed</li></div> | <li>[language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)</li><li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
| OPT | | <div style="text-align:left"><li>DeepSpeed</li></div> | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
| Llama 2 / CodeLlama / Llama 3 / Llama Guard / Granite | ✅ | ✅ | <li>[language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)</li><li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li><li>[question answering](https://github.com/huggingface/optimum-habana/tree/main/examples/question-answering)</li><li>[text classification](https://github.com/huggingface/optimum-habana/tree/main/examples/text-classification) (Llama Guard)</li> |
2 changes: 1 addition & 1 deletion examples/image-to-text/run_pipeline.py
@@ -161,7 +161,7 @@ def main():
args.image_path = [
"https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true"
]
if args.prompt is None:
if args.prompt is None and model_type in ("llava", "llava_next"):
if model_type == "llava":
processor = LlavaProcessor.from_pretrained(args.model_name_or_path)
elif model_type == "llava_next":
7 changes: 5 additions & 2 deletions examples/language-modeling/README.md
@@ -137,6 +137,9 @@ The following command triggers the fine-tuning of [GPT-NeoX-20B](https://hugging
Fine-tuning on 16 HPU cards (2 Gaudi2 nodes) takes around 9 minutes with a batch size of 32 (2 per device).
It reaches a perplexity of 10.469.

> [!NOTE]
> For the GPT-NeoX-20B model, switch to jemalloc if you run into host OOM issues: `export LD_PRELOAD=<path>/libjemalloc.so.2`
> Please refer to [this page](https://github.com/huggingface/optimum-habana/tree/main/examples/multi-node-training) for instructions on running multi-node training properly.
```bash
@@ -362,7 +365,7 @@ python run_clm.py \

## PEFT

### LORA/ADALORA/IA3/LLAMA_ADAPTER
### LORA/ADALORA/IA3/LLAMA_ADAPTER/VERA/LN_TUNING

To run LoRA finetuning, you can use `run_lora_clm.py`.
Here are single-/multi-device command examples for Llama1-7B, Falcon-40B, Llama2-70B, Llama3-8B and Llama3-70B.
@@ -720,7 +723,7 @@ DEEPSPEED_HPU_ZERO3_SYNC_MARK_STEP_REQUIRED=1 LOWER_LIST=ops_bf16.txt python3 ..
--validation_split_percentage 5 \
--deepspeed ds_falcon_180b_z3.json
```
Default `peft_type` is `lora`, you could enable adalora or ia3 using `--peft_type adalora` or `--peft_type ia3`, or enable llama-adapter for llama model using `--peft_type llama-adapter`.
The default `peft_type` is `lora`; you can enable adalora or ia3 with `--peft_type adalora` or `--peft_type ia3`, llama-adapter for Llama models with `--peft_type llama-adapter`, ln-tuning with `--peft_type ln_tuning`, or vera with `--peft_type vera`.
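
For reference, here is a minimal sketch of how these flag values could map onto PEFT configuration objects. It only covers the default (`lora`) and the two newly added types; `adalora`, `ia3` and `llama-adapter` follow the same pattern with their respective config classes. The `build_peft_config` helper is illustrative and not part of the script.

```python
# Illustrative sketch: mapping --peft_type values to PEFT configs.
from peft import LNTuningConfig, LoraConfig, TaskType, VeraConfig


def build_peft_config(peft_type: str, target_modules=None):
    if peft_type == "lora":
        return LoraConfig(task_type=TaskType.CAUSAL_LM, target_modules=target_modules)
    if peft_type == "vera":
        # init_weights=False mirrors the VeraConfig branch added in run_lora_clm.py below.
        return VeraConfig(task_type=TaskType.CAUSAL_LM, target_modules=target_modules, init_weights=False)
    if peft_type == "ln_tuning":
        return LNTuningConfig(task_type=TaskType.CAUSAL_LM, target_modules=target_modules)
    raise ValueError(f"Unsupported peft_type: {peft_type}")
```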

#### Custom Files

31 changes: 29 additions & 2 deletions examples/language-modeling/run_lora_clm.py
@@ -30,7 +30,17 @@
import torch
import transformers
from datasets import load_dataset
from peft import AdaLoraConfig, AdaptionPromptConfig, IA3Config, LoraConfig, TaskType, get_peft_model, tuners
from peft import (
AdaLoraConfig,
AdaptionPromptConfig,
IA3Config,
LNTuningConfig,
LoraConfig,
TaskType,
VeraConfig,
get_peft_model,
tuners,
)
from peft.utils.other import fsdp_auto_wrap_policy
from transformers import (
AutoConfig,
@@ -349,7 +359,7 @@ class FinetuneArguments:
default="lora",
metadata={
"help": ("The PEFT type to use."),
"choices": ["lora", "ia3", "adalora", "llama-adapter"],
"choices": ["lora", "ia3", "adalora", "llama-adapter", "vera", "ln_tuning"],
},
)
ia3_target_modules: List[str] = field(
@@ -368,6 +378,14 @@ class FinetuneArguments:
default=10,
metadata={"help": "Number of adapter tokens to insert in llama-adapter"},
)
vera_target_modules: List[str] = field(
default_factory=lambda: None,
metadata={"help": "Target modules for the vera method."},
)
ln_target_modules: List[str] = field(
default_factory=lambda: None,
metadata={"help": "Target modules for the ln method."},
)


PROMPT_DICT = {
@@ -884,6 +902,15 @@ def compute_metrics(eval_preds):

tuners.adaption_prompt.layer.AdaptedAttention.pre_attn_forward = GaudiAdaptedAttentionPreAttnForward
tuners.adaption_prompt.layer.AdaptedAttention.__getattr__ = GaudiAdaptedAttention_getattr
elif finetune_args.peft_type == "vera":
peft_config = VeraConfig(
target_modules=finetune_args.vera_target_modules, task_type=TaskType.CAUSAL_LM, init_weights=False
)
elif finetune_args.peft_type == "ln_tuning":
peft_config = LNTuningConfig(
target_modules=finetune_args.ln_target_modules,
task_type=TaskType.CAUSAL_LM,
)
if training_args.gradient_checkpointing:
model.enable_input_require_grads()
lora_model = get_peft_model(model, peft_config)
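As a rough usage illustration for the new branches above, wrapping a causal LM with the LN-tuning config might look like the following. The model name and LayerNorm module names are example values for GPT-2, not taken from this commit.

```python
from peft import LNTuningConfig, TaskType, get_peft_model
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("gpt2")  # example model only
peft_config = LNTuningConfig(
    target_modules=["ln_1", "ln_2", "ln_f"],  # GPT-2's LayerNorm modules
    task_type=TaskType.CAUSAL_LM,
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()  # only the LayerNorm parameters remain trainable
```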
35 changes: 33 additions & 2 deletions examples/text-generation/run_generation.py
@@ -316,6 +316,11 @@ def setup_parser(parser):
default="none",
help="Run multi card with the specified parallel strategy. Choices are 'tp' for Tensor Parallel Strategy or 'none'.",
)
parser.add_argument(
"--input_embeds",
action="store_true",
help="Whether to enable inputs_embeds or not.",
)

args = parser.parse_args()

Expand All @@ -336,6 +341,18 @@ def setup_parser(parser):
return args


def prepare_generation_embedding(model, model_name, input_tokens):
batch_size = input_tokens["input_ids"].size(0)

inputs_embeds = model.get_input_embeddings()(input_tokens["input_ids"])

if inputs_embeds.size(0) != batch_size:
inputs_embeds = inputs_embeds.expand(batch_size, -1, -1)

attention_mask = input_tokens["attention_mask"]
return {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask}


def main():
parser = argparse.ArgumentParser()
args = setup_parser(parser)
@@ -433,9 +450,22 @@ def generate(size=None, reduce_recompile=False):
for t in input_tokens:
if torch.is_tensor(input_tokens[t]):
input_tokens[t] = input_tokens[t].to(args.device)

input_data = {}
if args.input_embeds:
inputs_embeds = prepare_generation_embedding(model, args.model_name_or_path, input_tokens)
if inputs_embeds is not None:
input_data.update(inputs_embeds)
input_data.update(input_tokens)
else:
args.input_embeds = False
input_data.update(input_tokens)
else:
input_data.update(input_tokens)

iteration_times = []
outputs = model.generate(
**input_tokens,
**input_data,
generation_config=generation_config,
assistant_model=assistant_model,
lazy_mode=use_lazy_mode,
@@ -524,7 +554,8 @@ def rounder(x):
with (output_dir / "results.json").open("w", encoding="utf-8") as f:
json.dump(results, f, ensure_ascii=False, indent=4)

stats = f"Throughput (including tokenization) = {throughput} tokens/second"
stats = "Input embeds" if args.input_embeds else "Input tokens"
stats = stats + f"\nThroughput (including tokenization) = {throughput} tokens/second"
if args.show_graphs_count:
stats = stats + f"\nNumber of HPU graphs = {count_hpu_graphs()}"
separator = "-" * len(stats)
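For context on the `--input_embeds` path added above, here is a hedged, standalone sketch of calling `generate()` with precomputed embeddings instead of token ids. The model name is a placeholder; the flow mirrors `prepare_generation_embedding()`.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder model
model = AutoModelForCausalLM.from_pretrained("gpt2")

tokens = tokenizer("Hello world", return_tensors="pt")
# Look up the embedding table directly, as prepare_generation_embedding() does.
inputs_embeds = model.get_input_embeddings()(tokens["input_ids"])

with torch.no_grad():
    output_ids = model.generate(
        inputs_embeds=inputs_embeds,
        attention_mask=tokens["attention_mask"],
        max_new_tokens=16,
    )
# When generate() receives only inputs_embeds, the returned ids contain just the new tokens.
print(tokenizer.batch_decode(output_ids, skip_special_tokens=True))
```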
58 changes: 33 additions & 25 deletions optimum/habana/transformers/generation/utils.py
@@ -82,6 +82,7 @@
"gpt2",
"opt",
"gptj",
"gpt_neo",
"gpt_neox",
"llama",
"falcon",
@@ -423,6 +424,10 @@ def update_model_kwargs_for_bucketing(
# Pad inputs to have static shapes during generation, this gives better performance than dynamic shapes on HPUs
pad_amount = params["allocated_space"] - input_ids.shape[-1]
input_ids = torch.nn.functional.pad(input_ids, (0, pad_amount), value=pad_token_id)
if model_kwargs.get("inputs_embeds") is not None:
model_kwargs["inputs_embeds"] = torch.nn.functional.pad(
model_kwargs["inputs_embeds"], (0, 0, 0, pad_amount), value=pad_token_id
)
if model_kwargs["attention_mask"] is not None:
model_kwargs["attention_mask"] = torch.nn.functional.pad(
model_kwargs["attention_mask"], (0, pad_amount), value=0
@@ -663,7 +668,7 @@ def _prepare_generation_config(
self.generation_config.static_shapes = generation_config.static_shapes
if generation_config.ignore_eos is None:
generation_config.ignore_eos = kwargs.get("ignore_eos", kwargs.get("lazy_mode", None))
self.generation_config.ignore_eos = generation_config.ignore_eos
self.generation_config.ignore_eos = generation_config.ignore_eos
model_kwargs = generation_config.update(**kwargs) # All unused kwargs must be model kwargs
if self.config.model_type == "falcon" and "token_type_ids" in kwargs.keys():
for key in ["token_type_ids"]:
@@ -923,34 +928,24 @@ def generate(
token_idx = inputs_tensor.shape[1]
if generation_config.max_new_tokens is None:
generation_config.max_new_tokens = generation_config.max_length - token_idx
if "inputs_embeds" in model_kwargs:
if "input_ids" in model_kwargs:
inputs_embeds_offset = (
model_kwargs["input_ids"].shape[1] - model_kwargs["inputs_embeds"].shape[1]
)
else:
inputs_embeds_offset = -model_kwargs["inputs_embeds"].shape[1]
if model_input_name == "inputs_embeds" and model_kwargs["input_ids"].numel() == 0:
inputs_embeds_offset = -model_kwargs["inputs_embeds"].shape[1]

model_kwargs["inputs_embeds_offset"] = torch.tensor(
inputs_embeds_offset, device=inputs_tensor.device
)

if (
model_input_name == "inputs_embeds"
and model_kwargs.get("inputs_embeds") is not None
and not model_kwargs["bucket_internal"]
and not generation_config.reuse_cache
):
model_kwargs["inputs_embeds"] = torch.nn.functional.pad(
model_kwargs["inputs_embeds"],
(0, 0, 0, generation_config.max_new_tokens),
value=generation_config.pad_token_id,
)

if model_input_name == "inputs_embeds":
inputs_tensor = torch.nn.functional.pad(
inputs_tensor,
(0, 0, 0, generation_config.max_new_tokens),
value=generation_config.pad_token_id,
)
model_kwargs["input_ids"] = torch.nn.functional.pad(
model_kwargs["input_ids"],
(0, generation_config.max_new_tokens),
value=generation_config.pad_token_id,
)
else:
inputs_tensor = torch.nn.functional.pad(
inputs_tensor, (0, generation_config.max_new_tokens), value=generation_config.pad_token_id
@@ -961,7 +956,9 @@
for other_inputs in ["attention_mask", "token_type_ids"]:
if model_kwargs.get(other_inputs) is not None:
model_kwargs[other_inputs] = torch.nn.functional.pad(
model_kwargs[other_inputs], (0, generation_config.max_new_tokens), value=0
model_kwargs[other_inputs],
(0, generation_config.max_new_tokens),
value=generation_config.pad_token_id,
)
else:
assert generation_config.bucket_size <= 0, "Untested path for bucket>0"
Expand Down Expand Up @@ -999,6 +996,11 @@ def generate(
)
else:
input_ids = inputs_tensor if model_input_name == "input_ids" else model_kwargs.pop("input_ids")
if model_input_name == "inputs_embeds" and generation_config.static_shapes:
if not is_greedy_or_beam_and_bucket:
input_ids = torch.nn.functional.pad(
input_ids, (0, generation_config.max_new_tokens), value=generation_config.pad_token_id
)

if generation_config.token_healing:
input_ids = self.heal_tokens(input_ids, tokenizer)
@@ -1007,7 +1009,7 @@
streamer.put(input_ids.cpu())

# 6. Prepare `max_length` depending on other stopping criteria.
input_ids_length = input_ids.shape[-1]
input_ids_length = input_ids.shape[1]
has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None
has_default_min_length = kwargs.get("min_length") is None and generation_config.min_length is not None
generation_config = self._prepare_generated_length(
@@ -1120,9 +1122,9 @@
model_kwargs["num_virtual_tokens"] = num_virtual_tokens

if not self.config.is_encoder_decoder:
calculated_max_length = input_ids.shape[-1] + num_virtual_tokens
calculated_max_length = input_ids.shape[1] + num_virtual_tokens
if not generation_config.static_shapes and generation_config.max_new_tokens is not None:
calculated_max_length = input_ids.shape[-1] + generation_config.max_new_tokens + num_virtual_tokens
calculated_max_length = input_ids.shape[1] + generation_config.max_new_tokens + num_virtual_tokens
if generation_config.use_cache and generation_config.reuse_cache:
bs, _ = input_ids.shape
if not is_greedy_or_beam_and_bucket:
@@ -2414,7 +2416,13 @@
):
# Pad the returned past key values tensors from prefill phase forward run to maximum length
# before starting the decode phase.
if outputs.past_key_values[0][0].shape[2] == model_inputs["input_ids"].shape[1]:
if (
"input_ids" in model_inputs
and outputs.past_key_values[0][0].shape[2] == model_inputs["input_ids"].shape[1]
) or (
"inputs_embeds" in model_inputs
and outputs.past_key_values[0][0].shape[2] == model_inputs["inputs_embeds"].shape[1]
):
self._pad_past_key_values(model_kwargs)
model_kwargs["pad_done"] = True

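For context, a toy sketch of the static-shape padding performed in the hunks above on a `[batch, seq_len, hidden_size]` embeddings tensor. The shapes and `max_new_tokens` value are purely illustrative.

```python
import torch
import torch.nn.functional as F

max_new_tokens = 4
inputs_embeds = torch.randn(2, 5, 8)               # [batch, seq_len, hidden_size]
attention_mask = torch.ones(2, 5, dtype=torch.long)

# F.pad consumes pad widths from the last dimension backwards:
# (0, 0) leaves hidden_size untouched, (0, max_new_tokens) grows seq_len on the right.
padded_embeds = F.pad(inputs_embeds, (0, 0, 0, max_new_tokens), value=0.0)
padded_mask = F.pad(attention_mask, (0, max_new_tokens), value=0)

print(padded_embeds.shape)  # torch.Size([2, 9, 8])
print(padded_mask.shape)    # torch.Size([2, 9])
```

Keeping every tensor at a fixed maximum length like this is what lets the shapes stay static across decode steps, which is the behavior the generation utilities target on HPUs.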