fix kv_cache_on_host if statement and add non_blocking copy
Signed-off-by: Yu Zhentao <[email protected]>
zhentaoyu committed Sep 12, 2024
1 parent cd58c34 commit 4b0fa1a
Showing 1 changed file with 4 additions and 4 deletions.
8 changes: 4 additions & 4 deletions optimum/habana/transformers/models/llama/modeling_llama.py
@@ -653,7 +653,7 @@ def pre_attn_forward(
         else:
             past_key_value = None
 
-        kv_cache_on_host = (key_states.device == "cpu" and value_states.device == "cpu")
+        kv_cache_on_host = (key_states.device == torch.device("cpu") and value_states.device == torch.device("cpu"))
         # CPU SDPA for next token
         if kv_cache_on_host and q_len == 1 and not self.training:
             query_states, key_states, value_states, attention_mask = gaudi_llama_repeat_kv_cpu(
@@ -668,12 +668,12 @@ def pre_attn_forward(
                 dropout_p=0.0,
                 is_causal=False,
                 scale=self.norm_factor)
-            attn_output = attn_output.to("hpu")
+            attn_output = attn_output.to("hpu", non_blocking=True)
 
         else:
             if kv_cache_on_host:
-                key_states = key_states.to("hpu")
-                value_states = value_states.to("hpu")
+                key_states = key_states.to("hpu", non_blocking=True)
+                value_states = value_states.to("hpu", non_blocking=True)
             if use_flash_attention and FusedSDPA is not None:
                 import habana_frameworks.torch.hpu as ht

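The second hunk adds non_blocking=True to every host-to-HPU copy. A non-blocking Tensor.to() queues the transfer asynchronously instead of stalling the host thread, which matters when the copied KV cache is large; on CUDA-like backends the copy only truly overlaps with other work when the source tensor sits in pinned host memory. The snippet below is a hedged, generic sketch of the same pattern: it substitutes CUDA (or plain CPU) for "hpu" so it runs without habana_frameworks, and the shapes are illustrative only.

import torch

# Stand-in device; the real code targets "hpu" (Habana Gaudi).
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
pin = torch.cuda.is_available()  # pinned host memory lets non_blocking copies overlap with compute

# Illustrative KV-cache-shaped tensors allocated on the host.
key_states = torch.randn(1, 8, 1024, 64, pin_memory=pin)
value_states = torch.randn(1, 8, 1024, 64, pin_memory=pin)

# Same pattern as the diff: queue the host-to-device copies without blocking the host thread.
key_states = key_states.to(device, non_blocking=True)
value_states = value_states.to(device, non_blocking=True)

print(key_states.device, value_states.device)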
0 comments on commit 4b0fa1a
