huggingface · regisss · Aug 6, 2024 · Jun 26, 2024 · Jun 26, 2024 · Jun 26, 2024
@@ -75,13 +75,13 @@ def __init__(self, tokenizer, model, args, options):
         self.options = options
         self._device = args.device
         self.model_inputs = {"use_cache": self.options.use_cache}
-        if self.model.config.model_type in ["llama", "mistral", "falcon", "phi", "mixtral", "qwen2", "gptj"]:
+        if self.model.config.model_type in ["llama", "mistral", "falcon", "phi", "mixtral", "qwen2", "gptj", "starcoder2"]:
             self.model_inputs.update(
                 {
                     "reuse_cache": self.options.reuse_cache,
                 }
             )
-        if self.model.config.model_type in ["llama", "mistral", "qwen2", "falcon"]:
+        if self.model.config.model_type in ["llama", "mistral", "qwen2", "falcon", "starcoder2"]:
             if self.model.config.model_type != "falcon":
                 self.model_inputs.update(
                     {

@@ -381,7 +381,7 @@ def setup_distributed_model(args, model_dtype, model_kwargs, logger):
 
     model = deepspeed.init_inference(model, **ds_inference_kwargs)
     model = model.module
-    if model.config.model_type in ["llama", "falcon", "qwen2"]:
+    if model.config.model_type in ["llama", "falcon", "qwen2", "starcoder2"]:
         patch_scoped_linear_all_reduce(model)
 
     if args.quant_config:

@@ -435,7 +435,7 @@ def create_pad_arg(pad_amount, i, j):
                         else:
                             assert False
                     elif model_kwargs["past_key_values"][0][0].dim() == 4:
-                        return (0, 0, 0, pad_amount)  # llama, falcon, qwen2
+                        return (0, 0, 0, pad_amount)  # llama, falcon, qwen2, starcoder2
                     else:
                         assert False, "Unknown case, please handle, or dont use bucketing"
 
@@ -860,7 +860,8 @@ def generate(
                 "phi",
                 "qwen2",
                 "gptj",
-            ], "reuse_cache only supported by llama, mistral, falcon, mixtral, phi, qwen2 and gptj at the moment"
+                "starcoder2",
+            ], "reuse_cache only supported by llama, mistral, falcon, mixtral, phi, qwen2, gptj and starcoder2 at the moment"
             if not generation_config.bucket_internal:
                 assert (
                     generation_config.bucket_size <= 0
@@ -1016,7 +1017,7 @@ def generate(
                 model_kwargs["kv_cache_len"] = calculated_max_length
                 model_kwargs["kv_cache_pad_len"] = generation_config.max_new_tokens
 
-            if self.config.model_type in ["llama", "falcon", "mistral", "qwen2", "gptj"]:
+            if self.config.model_type in ["llama", "falcon", "mistral", "qwen2", "gptj", "starcoder2"]:
                 if self.config.max_position_embeddings < calculated_max_length:
                     unwrap_deepspeed_model(self).update_sincos_cache(seq_len=calculated_max_length)
 

@@ -90,6 +90,7 @@
     GaudiStableLmForCausalLM,
     GaudiStarcoder2DecoderLayer,
     GaudiStarcoder2ForCausalLM,
+    GaudiStarcoder2Model,
     LlamaConfig,
     MistralConfig,
     MixtralConfig,
@@ -175,8 +176,7 @@
     gaudi_SpeechT5DecoderLayer_forward,
     gaudi_stablelm_attention_forward,
     gaudi_stablelm_model_forward,
-    gaudi_starcoder2_attention_forward,
-    gaudi_starcoder2_model_forward,
+    GaudiStarcoder2Attention,
     gaudi_swin_get_attn_mask,
     gaudi_t5_layernorm_forward,
     gaudi_T5Attention_forward,
@@ -517,8 +517,8 @@ def adapt_transformers_to_gaudi():
 
     # Optimization for starcoder2 on Gaudi
     transformers.models.starcoder2.modeling_starcoder2.Starcoder2ForCausalLM = GaudiStarcoder2ForCausalLM
-    transformers.models.starcoder2.modeling_starcoder2.Starcoder2Model.forward = gaudi_starcoder2_model_forward
-    transformers.models.starcoder2.modeling_starcoder2.Starcoder2Attention.forward = gaudi_starcoder2_attention_forward
+    transformers.models.starcoder2.modeling_starcoder2.Starcoder2Model = GaudiStarcoder2Model
+    transformers.models.starcoder2.modeling_starcoder2.Starcoder2Attention = GaudiStarcoder2Attention
     transformers.models.starcoder2.modeling_starcoder2.Starcoder2DecoderLayer = GaudiStarcoder2DecoderLayer
 
     # Optimization for qwen2 on Gaudi

@@ -191,10 +191,10 @@
     gaudi_stablelm_model_forward,
 )
 from .starcoder2 import (
+    GaudiStarcoder2Attention,
     GaudiStarcoder2DecoderLayer,
     GaudiStarcoder2ForCausalLM,
-    gaudi_starcoder2_attention_forward,
-    gaudi_starcoder2_model_forward,
+    GaudiStarcoder2Model,
 )
 from .swin import gaudi_swin_get_attn_mask
 from .t5 import (

@@ -1,6 +1,6 @@
 from .modeling_starcoder2 import (
+    GaudiStarcoder2Attention,
     GaudiStarcoder2DecoderLayer,
     GaudiStarcoder2ForCausalLM,
-    gaudi_starcoder2_attention_forward,
-    gaudi_starcoder2_model_forward,
+    GaudiStarcoder2Model,
 )