From b9587caddc44fa5adbd32cf986833ff700017b00 Mon Sep 17 00:00:00 2001
From: Akihiro Takahashi
Date: Fri, 2 Aug 2024 10:38:42 -0700
Subject: [PATCH 01/24] Enable MPT fp8 support

Add Softmax and FusedSDPA
Update GaudiMptAttention forward to r4.44.1 base

Co-authored-by: Thanaji Rao Thakkalapelli
---
 optimum/habana/transformers/modeling_utils.py |   8 +-
 .../habana/transformers/models/__init__.py    |   4 +-
 .../transformers/models/mpt/__init__.py       |   4 +-
 .../transformers/models/mpt/modeling_mpt.py   | 347 ++++++++++--------
 4 files changed, 195 insertions(+), 168 deletions(-)

diff --git a/optimum/habana/transformers/modeling_utils.py b/optimum/habana/transformers/modeling_utils.py
index 621e391bfb..97e290157c 100644
--- a/optimum/habana/transformers/modeling_utils.py
+++ b/optimum/habana/transformers/modeling_utils.py
@@ -74,6 +74,8 @@
     GaudiMixtralDecoderLayer,
     GaudiMixtralForCausalLM,
     GaudiMixtralModel,
+    GaudiMptAttention,
+    GaudiMptBlock,
     GaudiMptForCausalLM,
     GaudiMptModel,
     GaudiOPTForCausalLM,
@@ -150,8 +152,6 @@
     gaudi_mistral_rmsnorm_forward,
     gaudi_mixtral_block_sparse_moe_forward,
     gaudi_mixtral_rmsnorm_forward,
-    gaudi_mpt_attention_forward,
-    gaudi_mpt_block_forward,
     gaudi_opt_attention_forward,
     gaudi_opt_decoder_forward,
     gaudi_opt_decoder_layer_forward,
@@ -415,8 +415,8 @@ def adapt_transformers_to_gaudi():
     # Optimization for mpt on Gaudi
     transformers.models.mpt.modeling_mpt.MptForCausalLM = GaudiMptForCausalLM
     transformers.models.mpt.modeling_mpt.MptModel = GaudiMptModel
-    transformers.models.mpt.modeling_mpt.MptAttention.forward = gaudi_mpt_attention_forward
-    transformers.models.mpt.modeling_mpt.MptBlock.forward = gaudi_mpt_block_forward
+    transformers.models.mpt.modeling_mpt.MptAttention = GaudiMptAttention
+    transformers.models.mpt.modeling_mpt.MptBlock = GaudiMptBlock
 
     # Optimization for mistral on Gaudi
     transformers.models.mistral.modeling_mistral.MistralForCausalLM = GaudiMistralForCausalLM
diff --git a/optimum/habana/transformers/models/__init__.py b/optimum/habana/transformers/models/__init__.py
index 99ef65c4e4..5a4861fbdf 100644
--- a/optimum/habana/transformers/models/__init__.py
+++ b/optimum/habana/transformers/models/__init__.py
@@ -138,10 +138,10 @@
     gaudi_invert_attention_mask,
 )
 from .mpt import (
+    GaudiMptAttention,
+    GaudiMptBlock,
     GaudiMptForCausalLM,
     GaudiMptModel,
-    gaudi_mpt_attention_forward,
-    gaudi_mpt_block_forward,
 )
 from .opt import (
     GaudiOPTForCausalLM,
diff --git a/optimum/habana/transformers/models/mpt/__init__.py b/optimum/habana/transformers/models/mpt/__init__.py
index 1ab41c1a80..351152c026 100644
--- a/optimum/habana/transformers/models/mpt/__init__.py
+++ b/optimum/habana/transformers/models/mpt/__init__.py
@@ -1,6 +1,6 @@
 from .modeling_mpt import (
+    GaudiMptAttention,
+    GaudiMptBlock,
     GaudiMptForCausalLM,
     GaudiMptModel,
-    gaudi_mpt_attention_forward,
-    gaudi_mpt_block_forward,
 )
diff --git a/optimum/habana/transformers/models/mpt/modeling_mpt.py b/optimum/habana/transformers/models/mpt/modeling_mpt.py
index 7cefc4e37f..2c632619b7 100755
--- a/optimum/habana/transformers/models/mpt/modeling_mpt.py
+++ b/optimum/habana/transformers/models/mpt/modeling_mpt.py
@@ -15,20 +15,25 @@
 ###############################################################################
 # Copyright (C) 2022-2023 Habana Labs, Ltd.
an Intel Company ############################################################################### -from typing import Optional, Tuple, Union - +import os import torch + +from typing import Optional, Tuple, Union from torch import nn from torch.nn import CrossEntropyLoss from transformers.modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions -from transformers.models.mpt.modeling_mpt import MptForCausalLM, MptModel +from transformers.models.mpt.modeling_mpt import ( + MptAttention, + MptBlock, + MptConfig, + MptForCausalLM, + MptModel, +) from transformers.utils import logging from ...modeling_attn_mask_utils import _gaudi_prepare_4d_causal_attention_mask -logger = logging.get_logger(__name__) - try: from habana_frameworks.torch.hpex.kernels import FusedSDPA except ImportError: @@ -36,159 +41,181 @@ FusedSDPA = None -def gaudi_mpt_attention_forward( - self, - hidden_states: torch.Tensor, - position_bias: torch.Tensor, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - attention_mask: Optional[torch.Tensor] = None, - token_idx: Optional[torch.Tensor] = None, - use_flash_attention: Optional[bool] = False, - flash_attention_recompute: Optional[bool] = False, -): - """ - Copied from MptAttention.forward: https://github.com/huggingface/transformers/blob/v4.32.0/src/transformers/models/mpt/modeling_mpt.py - The only differences are: - - add new args token_idx - - optimize KV cache - - add new args use_flash_attention - - add new arg flash_attention_recompute - """ - - batch_size, seq_length = hidden_states.shape[:2] - - mixed_qkv = self.Wqkv(hidden_states) - if self.clip_qkv: - mixed_qkv = mixed_qkv.clamp(min=-self.clip_qkv, max=self.clip_qkv) - - bs, seq_len, three_times_hidden_size = mixed_qkv.shape - mixed_qkv = mixed_qkv.view(bs, seq_len, self.n_heads * 3, self.head_dim) - mixed_qkv = mixed_qkv.transpose(1, 2) - query_states, key_states, value_states = ( - mixed_qkv[:, : self.n_heads, ...], - mixed_qkv[:, self.n_heads : 2 * self.n_heads, ...], - mixed_qkv[:, 2 * self.n_heads :, ...], - ) - - if past_key_value is not None: - if len(past_key_value) != 0: - if token_idx is not None: - past_key_value[0].index_copy_(2, token_idx - 1, key_states) - past_key_value[1].index_copy_(2, token_idx - 1, value_states) - key_states = past_key_value[0] - value_states = past_key_value[1] - else: - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - past_key_value = [key_states, value_states] - else: - past_key_value = [ - torch.empty(key_states.shape, dtype=key_states.dtype, device=key_states.device), - torch.empty(key_states.shape, dtype=key_states.dtype, device=key_states.device), - ] - past_key_value[0][:] = key_states[:] - past_key_value[1][:] = value_states[:] - - query_length = seq_length if past_key_value is None else seq_length + past_key_value[0].shape[2] - - if position_bias is not None: - if len(position_bias.shape) != 3: - raise ValueError(f"Expecting position_bias shape to be 3 dimensions, got {len(position_bias.shape)}") - key_length = key_states.shape[-2] - - position_bias_query_index = max(0, position_bias.size(1) - query_length) - position_bias_key_index = max(0, position_bias.size(2) - key_length) - - position_bias = position_bias[:, position_bias_query_index:, position_bias_key_index:] - - if use_flash_attention and FusedSDPA: - import habana_frameworks.torch.hpu as ht - - with ht.sdp_kernel(enable_recompute=flash_attention_recompute): - attn_output = 
FusedSDPA.apply( - query_states, - key_states, - value_states, - attention_mask * torch.finfo(query_states.dtype).min + position_bias.to(query_states.dtype), - 0.0, - False, - None, - ) - attn_weights = None - else: - attention_scores = torch.matmul(query_states, key_states.transpose(-1, -2)) * self.softmax_scale +logger = logging.get_logger(__name__) + + +class Softmax(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, dim=None, invAttnHead=None): + return torch.ops.hpu.softmax_fp8(x, dim, None, None, invAttnHead) + + +class GaudiMptAttention(MptAttention): + def __init__(self, config: MptConfig): + super().__init__(config) + + self.is_fp8 = os.getenv("QUANT_CONFIG", "") != "" + self.softmax = Softmax() + + def forward( + self, + hidden_states: torch.Tensor, + position_bias: torch.Tensor, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + token_idx: Optional[torch.Tensor] = None, + use_flash_attention: Optional[bool] = False, + flash_attention_recompute: Optional[bool] = False, + ): + """ + Copied from MptAttention.forward: https://github.com/huggingface/transformers/blob/v4.44.1/src/transformers/models/mpt/modeling_mpt.py + The only differences are: + - add new args token_idx + - optimize KV cache + - add new args use_flash_attention + - add new arg flash_attention_recompute + """ + + batch_size, seq_length = hidden_states.shape[:2] + + mixed_qkv = self.Wqkv(hidden_states) + if self.clip_qkv: + mixed_qkv = mixed_qkv.clamp(min=-self.clip_qkv, max=self.clip_qkv) + + query_states, key_states, value_states = mixed_qkv.chunk(3, dim=2) + query_states = query_states.reshape(batch_size, seq_length, self.n_heads, self.head_dim).transpose(1, 2) + key_states = key_states.reshape(batch_size, seq_length, self.n_heads, self.head_dim).transpose(1, 2) + value_states = value_states.reshape(batch_size, seq_length, self.n_heads, self.head_dim).transpose(1, 2) + + if past_key_value is not None: + if len(past_key_value) != 0: + if token_idx is not None: + past_key_value[0].index_copy_(2, token_idx - 1, key_states) + past_key_value[1].index_copy_(2, token_idx - 1, value_states) + key_states = past_key_value[0] + value_states = past_key_value[1] + else: + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + past_key_value = [key_states, value_states] + else: + past_key_value = [ + torch.empty(key_states.shape, dtype=key_states.dtype, device=key_states.device), + torch.empty(key_states.shape, dtype=key_states.dtype, device=key_states.device), + ] + past_key_value[0][:] = key_states[:] + past_key_value[1][:] = value_states[:] + + query_length = seq_length if past_key_value is None else seq_length + past_key_value[0].shape[2] if position_bias is not None: - attention_scores = attention_scores + position_bias - if attention_mask is not None: - attention_scores = attention_scores.masked_fill(attention_mask, torch.finfo(query_states.dtype).min) - - # (batch_size, n_heads, seq_length, key_length) - attn_weights = nn.functional.softmax(attention_scores.float(), dim=-1).to(value_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, p=self.attn_dropout_p, training=self.training) - - attn_output = torch.matmul(attn_weights, value_states) - - attn_output = attn_output.permute(0, 2, 1, 3).contiguous().view(batch_size, seq_length, -1) - attn_output = self.out_proj(attn_output) - - return attn_output, attn_weights, past_key_value - - -def 
gaudi_mpt_block_forward( - self, - hidden_states: torch.Tensor, - position_bias: torch.Tensor, - attention_mask: torch.Tensor, - layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, - use_cache: bool = False, - output_attentions: bool = False, - token_idx: Optional[torch.Tensor] = None, - use_flash_attention: Optional[bool] = False, - flash_attention_recompute: Optional[bool] = False, -): - """ - Copied from MptBlock.forward: https://github.com/huggingface/transformers/blob/v4.32.0/src/transformers/models/mpt/modeling_mpt.py - The only differences are: - - add new args token_idx - - add new args use_flash_attention - - add new arg flash_attention_recompute - """ - # hidden_states: [batch_size, seq_length, hidden_size] - # Layer norm at the beginning of the transformer layer. - layernorm_output = self.norm_1(hidden_states) - - residual = hidden_states - - # Self attention. - attn_outputs, attn_weights, past_key_value = self.attn( - layernorm_output, - position_bias=position_bias, - attention_mask=attention_mask, - past_key_value=layer_past, - token_idx=token_idx, - use_flash_attention=use_flash_attention, - flash_attention_recompute=flash_attention_recompute, - ) - - hidden_states = self.resid_attn_dropout(attn_outputs) + residual - - layernorm_output = self.norm_2(hidden_states) - - # Get residual - residual = hidden_states - - # MLP. - output = self.ffn(layernorm_output, residual) - outputs = (output,) - - if use_cache: - outputs += (past_key_value,) - - if output_attentions: - outputs += (attn_weights,) - - return outputs # hidden_states, present, attentions + if len(position_bias.shape) != 3: + raise ValueError(f"Expecting position_bias shape to be 3 dimensions, got {len(position_bias.shape)}") + key_length = key_states.shape[-2] + + position_bias_query_index = max(0, position_bias.size(1) - query_length) + position_bias_key_index = max(0, position_bias.size(2) - key_length) + + position_bias = position_bias[:, position_bias_query_index:, position_bias_key_index:] + + if use_flash_attention and FusedSDPA: + import habana_frameworks.torch.hpu as ht + + with ht.sdp_kernel(enable_recompute=flash_attention_recompute): + attn_output = FusedSDPA.apply( + query_states, + key_states, + value_states, + attention_mask * torch.finfo(query_states.dtype).min + position_bias.to(query_states.dtype), + 0.0, + False, + None, + ) + attn_weights = None + else: + attention_scores = torch.matmul(query_states, key_states.transpose(-1, -2)) * self.softmax_scale + + if position_bias is not None: + attention_scores = attention_scores + position_bias + if attention_mask is not None: + attention_scores = attention_scores.masked_fill(attention_mask, torch.finfo(query_states.dtype).min) + + # (batch_size, n_heads, seq_length, key_length) + if self.is_fp8: + attn_weights = self.softmax(attention_scores.bfloat16(), dim=-1) + else: + attn_weights = nn.functional.softmax(attention_scores.float(), dim=-1).to(value_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attn_dropout_p, training=self.training) + + attn_output = torch.matmul(attn_weights, value_states) + + attn_output = attn_output.permute(0, 2, 1, 3).contiguous().view(batch_size, seq_length, -1) + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights, past_key_value + + +class GaudiMptBlock(MptBlock): + def __init__(self, config: MptConfig): + super().__init__(config) + self.attn = GaudiMptAttention(config) + + def forward( + self, + hidden_states: torch.Tensor, + position_bias: torch.Tensor, + 
attention_mask: torch.Tensor, + layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + use_cache: bool = False, + output_attentions: bool = False, + token_idx: Optional[torch.Tensor] = None, + use_flash_attention: Optional[bool] = False, + flash_attention_recompute: Optional[bool] = False, + ): + """ + Copied from MptBlock.forward: https://github.com/huggingface/transformers/blob/v4.32.0/src/transformers/models/mpt/modeling_mpt.py + The only differences are: + - add new args token_idx + - add new args use_flash_attention + - add new arg flash_attention_recompute + """ + # hidden_states: [batch_size, seq_length, hidden_size] + # Layer norm at the beginning of the transformer layer. + layernorm_output = self.norm_1(hidden_states) + + residual = hidden_states + + # Self attention. + attn_outputs, attn_weights, past_key_value = self.attn( + layernorm_output, + position_bias=position_bias, + attention_mask=attention_mask, + past_key_value=layer_past, + token_idx=token_idx, + use_flash_attention=use_flash_attention, + flash_attention_recompute=flash_attention_recompute, + ) + + hidden_states = self.resid_attn_dropout(attn_outputs) + residual + + layernorm_output = self.norm_2(hidden_states) + + # Get residual + residual = hidden_states + + # MLP. + output = self.ffn(layernorm_output, residual) + outputs = (output,) + + if use_cache: + outputs += (past_key_value,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs # hidden_states, present, attentions class GaudiMptModel(MptModel): @@ -280,8 +307,6 @@ def forward( use_cache, output_attentions, None, - use_flash_attention, - flash_attention_recompute, ) else: outputs = block( @@ -340,6 +365,8 @@ def prepare_inputs_for_generation( - support for internal bucketing """ bucket_internal = kwargs.get("bucket_internal") + use_flash_attention = kwargs.get("use_flash_attention", False) + flash_attention_recompute = kwargs.get("flash_attention_recompute", False) # only last tokens for input_ids if past is not None if past_key_values is not None: if token_idx is None: @@ -375,8 +402,8 @@ def prepare_inputs_for_generation( "use_cache": use_cache, "attention_mask": attention_mask, "token_idx": token_idx, - "use_flash_attention": kwargs.get("use_flash_attention"), - "flash_attention_recompute": kwargs.get("flash_attention_recompute"), + "use_flash_attention": use_flash_attention, + "flash_attention_recompute": flash_attention_recompute, } ) return model_inputs From 51ddf87fa06112d7ca7f18af8dca48ccccb96611 Mon Sep 17 00:00:00 2001 From: Thanaji Rao Thakkalapelli Date: Tue, 20 Aug 2024 01:04:51 -0700 Subject: [PATCH 02/24] Fix cache position issue in mixtral (#1272) --- .../transformers/models/mixtral/modeling_mixtral.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/optimum/habana/transformers/models/mixtral/modeling_mixtral.py b/optimum/habana/transformers/models/mixtral/modeling_mixtral.py index 299160be8a..fc414e6d76 100644 --- a/optimum/habana/transformers/models/mixtral/modeling_mixtral.py +++ b/optimum/habana/transformers/models/mixtral/modeling_mixtral.py @@ -610,7 +610,13 @@ def forward( inputs_embeds = self.embed_tokens(input_ids) if cache_position is None: - past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + past_seen_tokens = 0 + if past_key_values is not None: + if isinstance(past_key_values, Cache): + past_seen_tokens = past_key_values.get_seq_length() + else: + past_seen_tokens = past_key_values[0][0].shape[2] + cache_position = torch.arange( 
past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device ) From 60ae80b276d05942e31cbee8923ded8fdace60fb Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Tue, 20 Aug 2024 13:43:15 +0000 Subject: [PATCH 03/24] Add temporary directories to test_trainer.py --- Makefile | 2 +- tests/test_trainer.py | 1648 +++++++++++++++++++++-------------------- 2 files changed, 866 insertions(+), 784 deletions(-) diff --git a/Makefile b/Makefile index 69bb265648..ac8ff3f108 100644 --- a/Makefile +++ b/Makefile @@ -165,13 +165,13 @@ clean: find . -name .graph_dumps -type d -exec rm -r {} + find . -name save-hpu.pdb -type f -delete find . -name checkpoints.json -type f -delete + find . -name hpu_profile -type d -exec rm -r {} + rm -rf regression/ rm -rf tmp_trainer/ rm -rf test/ rm -rf build/ rm -rf dist/ rm -rf optimum_habana.egg-info/ - rm -rf hpu_profile/ test_installs: python -m pip install .[tests] diff --git a/tests/test_trainer.py b/tests/test_trainer.py index bcbb9521eb..ba78bbd2cc 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -551,16 +551,17 @@ class GaudiTrainerIntegrationPrerunTest(TestCasePlus, GaudiTrainerIntegrationCom def setUp(self): super().setUp() - args = GaudiTrainingArguments("..", use_habana=True, use_lazy_mode=True) - self.n_epochs = args.num_train_epochs - self.batch_size = args.train_batch_size - trainer = get_regression_trainer(learning_rate=0.1) - trainer.train() - self.default_trained_model = (trainer.model.a, trainer.model.b) + with tempfile.TemporaryDirectory() as tmpdir: + args = GaudiTrainingArguments(tmpdir, use_habana=True, use_lazy_mode=True) + self.n_epochs = args.num_train_epochs + self.batch_size = args.train_batch_size + trainer = get_regression_trainer(output_dir=tmpdir, learning_rate=0.1) + trainer.train() + self.default_trained_model = (trainer.model.a, trainer.model.b) - trainer = get_regression_trainer(learning_rate=0.1, seed=314) - trainer.train() - self.alternate_trained_model = (trainer.model.a, trainer.model.b) + trainer = get_regression_trainer(output_dir=tmpdir, learning_rate=0.1, seed=314) + trainer.train() + self.alternate_trained_model = (trainer.model.a, trainer.model.b) def check_trained_model(self, model, alternate_seed=False, bf16=False): # Checks a training seeded with learning_rate = 0.1 @@ -573,15 +574,16 @@ def check_trained_model(self, model, alternate_seed=False, bf16=False): self.assertTrue(torch.allclose(model.b, b, atol=1e-03, rtol=0)) def test_reproducible_training(self): - # Checks that training worked, model trained and seed made a reproducible training. - trainer = get_regression_trainer(learning_rate=0.1) - trainer.train() - self.check_trained_model(trainer.model) + with tempfile.TemporaryDirectory() as tmpdir: + # Checks that training worked, model trained and seed made a reproducible training. + trainer = get_regression_trainer(output_dir=tmpdir, learning_rate=0.1) + trainer.train() + self.check_trained_model(trainer.model) - # Checks that a different seed gets different (reproducible) results. - trainer = get_regression_trainer(learning_rate=0.1, seed=314) - trainer.train() - self.check_trained_model(trainer.model, alternate_seed=True) + # Checks that a different seed gets different (reproducible) results. 
+ trainer = get_regression_trainer(output_dir=tmpdir, learning_rate=0.1, seed=314) + trainer.train() + self.check_trained_model(trainer.model, alternate_seed=True) def test_trainer_with_datasets(self): import datasets @@ -595,56 +597,59 @@ def test_trainer_with_datasets(self): # Base training. Should have the same results as test_reproducible_training model = RegressionModel() - args = GaudiTrainingArguments( - "./regression", learning_rate=0.1, use_habana=True, use_lazy_mode=True, report_to="none" - ) - trainer = GaudiTrainer(model, gaudi_config, args, train_dataset=train_dataset) - trainer.train() - self.check_trained_model(trainer.model) + with tempfile.TemporaryDirectory() as tmpdir: + args = GaudiTrainingArguments( + tmpdir, learning_rate=0.1, use_habana=True, use_lazy_mode=True, report_to="none" + ) + trainer = GaudiTrainer(model, gaudi_config, args, train_dataset=train_dataset) + trainer.train() + self.check_trained_model(trainer.model) - # Can return tensors. - train_dataset.set_format(type="torch", dtype=torch.float32) - model = RegressionModel() - trainer = GaudiTrainer(model, gaudi_config, args, train_dataset=train_dataset) - trainer.train() - self.check_trained_model(trainer.model) + # Can return tensors. + train_dataset.set_format(type="torch", dtype=torch.float32) + model = RegressionModel() + trainer = GaudiTrainer(model, gaudi_config, args, train_dataset=train_dataset) + trainer.train() + self.check_trained_model(trainer.model) - # Adding one column not used by the model should have no impact - z = np.random.normal(size=(64,)).astype(np.float32) - train_dataset = datasets.Dataset.from_dict({"input_x": x, "label": y, "extra": z}) - model = RegressionModel() - trainer = GaudiTrainer(model, gaudi_config, args, train_dataset=train_dataset) - trainer.train() - self.check_trained_model(trainer.model) + # Adding one column not used by the model should have no impact + z = np.random.normal(size=(64,)).astype(np.float32) + train_dataset = datasets.Dataset.from_dict({"input_x": x, "label": y, "extra": z}) + model = RegressionModel() + trainer = GaudiTrainer(model, gaudi_config, args, train_dataset=train_dataset) + trainer.train() + self.check_trained_model(trainer.model) def test_model_init(self): train_dataset = RegressionDataset() gaudi_config = get_gaudi_config() - args = GaudiTrainingArguments( - "./regression", learning_rate=0.1, use_habana=True, use_lazy_mode=True, report_to="none" - ) - trainer = GaudiTrainer( - gaudi_config=gaudi_config, args=args, train_dataset=train_dataset, model_init=lambda: RegressionModel() - ) - trainer.train() - self.check_trained_model(trainer.model) + with tempfile.TemporaryDirectory() as tmpdir: + args = GaudiTrainingArguments( + tmpdir, learning_rate=0.1, use_habana=True, use_lazy_mode=True, report_to="none" + ) + trainer = GaudiTrainer( + gaudi_config=gaudi_config, args=args, train_dataset=train_dataset, model_init=lambda: RegressionModel() + ) + trainer.train() + self.check_trained_model(trainer.model) - # Re-training should restart from scratch, thus lead the same results. - trainer.train() - self.check_trained_model(trainer.model) + # Re-training should restart from scratch, thus lead the same results. + trainer.train() + self.check_trained_model(trainer.model) - # Re-training should restart from scratch, thus lead the same results and new seed should be used. 
- trainer.args.seed = 314 - trainer.train() - self.check_trained_model(trainer.model, alternate_seed=True) + # Re-training should restart from scratch, thus lead the same results and new seed should be used. + trainer.args.seed = 314 + trainer.train() + self.check_trained_model(trainer.model, alternate_seed=True) def test_gradient_accumulation(self): - # Training with half the batch size but accumulation steps as 2 should give the same results. - trainer = get_regression_trainer( - gradient_accumulation_steps=2, per_device_train_batch_size=4, learning_rate=0.1 - ) - trainer.train() - self.check_trained_model(trainer.model) + with tempfile.TemporaryDirectory() as tmpdir: + # Training with half the batch size but accumulation steps as 2 should give the same results. + trainer = get_regression_trainer( + output_dir=tmpdir, gradient_accumulation_steps=2, per_device_train_batch_size=4, learning_rate=0.1 + ) + trainer.train() + self.check_trained_model(trainer.model) # The test below is commented because it leads to a core dumped error # when it is run with all other tests. It passes when run alone. @@ -671,41 +676,43 @@ def test_gradient_accumulation(self): def test_training_loss(self): n_gpus = max(1, get_gpu_count()) - # With even logs - trainer = get_regression_trainer(logging_steps=64 / (8 * n_gpus)) - trainer.train() - log_history = trainer.state.log_history + with tempfile.TemporaryDirectory() as tmpdir: + # With even logs + trainer = get_regression_trainer(output_dir=tmpdir, logging_steps=64 / (8 * n_gpus)) + trainer.train() + log_history = trainer.state.log_history - losses = [log["loss"] for log in log_history if "loss" in log] - train_loss = log_history[-1]["train_loss"] - self.assertAlmostEqual(sum(losses) / len(losses), train_loss, places=4) + losses = [log["loss"] for log in log_history if "loss" in log] + train_loss = log_history[-1]["train_loss"] + self.assertAlmostEqual(sum(losses) / len(losses), train_loss, places=4) - # With uneven logs - trainer = get_regression_trainer(logging_steps=5) - trainer.train() - log_history = trainer.state.log_history + # With uneven logs + trainer = get_regression_trainer(output_dir=tmpdir, logging_steps=5) + trainer.train() + log_history = trainer.state.log_history - # Training loss should be the same as before - new_train_loss = log_history[-1]["train_loss"] - self.assertAlmostEqual(train_loss, new_train_loss, places=4) + # Training loss should be the same as before + new_train_loss = log_history[-1]["train_loss"] + self.assertAlmostEqual(train_loss, new_train_loss, places=4) def test_custom_optimizer(self): train_dataset = RegressionDataset() gaudi_config = get_gaudi_config() gaudi_config.use_fused_adam = False - args = GaudiTrainingArguments("./regression", use_habana=True, use_lazy_mode=True, report_to="none") - model = RegressionModel() - optimizer = torch.optim.SGD(model.parameters(), lr=1.0) - lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda x: 1.0) - trainer = GaudiTrainer( - model, gaudi_config, args, train_dataset=train_dataset, optimizers=(optimizer, lr_scheduler) - ) - trainer.train() + with tempfile.TemporaryDirectory() as tmpdir: + args = GaudiTrainingArguments(tmpdir, use_habana=True, use_lazy_mode=True, report_to="none") + model = RegressionModel() + optimizer = torch.optim.SGD(model.parameters(), lr=1.0) + lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda x: 1.0) + trainer = GaudiTrainer( + model, gaudi_config, args, train_dataset=train_dataset, optimizers=(optimizer, 
lr_scheduler) + ) + trainer.train() - (a, b) = self.default_trained_model - self.assertFalse(torch.allclose(trainer.model.a, a)) - self.assertFalse(torch.allclose(trainer.model.b, b)) - self.assertEqual(trainer.optimizer.state_dict()["param_groups"][0]["lr"], 1.0) + (a, b) = self.default_trained_model + self.assertFalse(torch.allclose(trainer.model.a, a)) + self.assertFalse(torch.allclose(trainer.model.b, b)) + self.assertEqual(trainer.optimizer.state_dict()["param_groups"][0]["lr"], 1.0) def test_lr_scheduler_kwargs(self): # test scheduler kwargs passed via TrainingArguments @@ -713,56 +720,58 @@ def test_lr_scheduler_kwargs(self): model = RegressionModel() num_steps, num_warmup_steps = 10, 2 extra_kwargs = {"power": 5.0, "lr_end": 1e-5} # Non-default arguments - args = GaudiTrainingArguments( - "./regression", - lr_scheduler_type="polynomial", - lr_scheduler_kwargs=extra_kwargs, - learning_rate=0.2, - warmup_steps=num_warmup_steps, - use_habana=True, - use_lazy_mode=True, - report_to="none", - ) - gaudi_config = get_gaudi_config() - trainer = GaudiTrainer(model, gaudi_config, args, train_dataset=train_dataset) - trainer.create_optimizer_and_scheduler(num_training_steps=num_steps) + with tempfile.TemporaryDirectory() as tmpdir: + args = GaudiTrainingArguments( + tmpdir, + lr_scheduler_type="polynomial", + lr_scheduler_kwargs=extra_kwargs, + learning_rate=0.2, + warmup_steps=num_warmup_steps, + use_habana=True, + use_lazy_mode=True, + report_to="none", + ) + gaudi_config = get_gaudi_config() + trainer = GaudiTrainer(model, gaudi_config, args, train_dataset=train_dataset) + trainer.create_optimizer_and_scheduler(num_training_steps=num_steps) - # Checking that the scheduler was created - self.assertIsNotNone(trainer.lr_scheduler) + # Checking that the scheduler was created + self.assertIsNotNone(trainer.lr_scheduler) - # Checking that the correct args were passed - sched1 = trainer.lr_scheduler - sched2 = get_polynomial_decay_schedule_with_warmup( - trainer.optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_steps, **extra_kwargs - ) - self.assertEqual(sched1.lr_lambdas[0].args, sched2.lr_lambdas[0].args) - self.assertEqual(sched1.lr_lambdas[0].keywords, sched2.lr_lambdas[0].keywords) + # Checking that the correct args were passed + sched1 = trainer.lr_scheduler + sched2 = get_polynomial_decay_schedule_with_warmup( + trainer.optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_steps, **extra_kwargs + ) + self.assertEqual(sched1.lr_lambdas[0].args, sched2.lr_lambdas[0].args) + self.assertEqual(sched1.lr_lambdas[0].keywords, sched2.lr_lambdas[0].keywords) def test_cosine_with_min_lr_scheduler(self): train_dataset = RegressionDataset() model = RegressionModel() num_steps, num_warmup_steps = 10, 2 extra_kwargs = {"min_lr": 1e-5} # Non-default arguments - args = GaudiTrainingArguments( - "./regression", - lr_scheduler_type="cosine_with_min_lr", - lr_scheduler_kwargs=extra_kwargs, - learning_rate=0.2, - warmup_steps=num_warmup_steps, - use_habana=True, - use_lazy_mode=True, - report_to="none", - ) - trainer = GaudiTrainer(model, gaudi_config=get_gaudi_config(), args=args, train_dataset=train_dataset) - trainer.create_optimizer_and_scheduler(num_training_steps=num_steps) + with tempfile.TemporaryDirectory() as tmpdir: + args = GaudiTrainingArguments( + tmpdir, + lr_scheduler_type="cosine_with_min_lr", + lr_scheduler_kwargs=extra_kwargs, + learning_rate=0.2, + warmup_steps=num_warmup_steps, + use_habana=True, + use_lazy_mode=True, + report_to="none", + ) + trainer = 
GaudiTrainer(model, gaudi_config=get_gaudi_config(), args=args, train_dataset=train_dataset) + trainer.create_optimizer_and_scheduler(num_training_steps=num_steps) - # Checking that the scheduler was created - self.assertIsNotNone(trainer.lr_scheduler) + # Checking that the scheduler was created + self.assertIsNotNone(trainer.lr_scheduler) - # Check the last learning rate - for _ in range(num_steps): - trainer.lr_scheduler.step() - self.assertEqual(trainer.lr_scheduler.get_last_lr()[0], 1e-5) + # Check the last learning rate + for _ in range(num_steps): + trainer.lr_scheduler.step() + self.assertEqual(trainer.lr_scheduler.get_last_lr()[0], 1e-5) def test_reduce_lr_on_plateau_args(self): # test passed arguments for a custom ReduceLROnPlateau scheduler @@ -770,31 +779,32 @@ def test_reduce_lr_on_plateau_args(self): eval_dataset = RegressionDataset(length=64) gaudi_config = get_gaudi_config() gaudi_config.use_fused_adam = False - args = GaudiTrainingArguments( - "./regression", - eval_strategy="epoch", - metric_for_best_model="eval_loss", - use_habana=True, - use_lazy_mode=True, - report_to="none", - ) - model = RegressionModel() - optimizer = torch.optim.SGD(model.parameters(), lr=1.0) - lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.2, patience=5, cooldown=2) - trainer = GaudiTrainer( - model, - gaudi_config, - args, - train_dataset=train_dataset, - eval_dataset=eval_dataset, - optimizers=(optimizer, lr_scheduler), - ) - trainer.train() + with tempfile.TemporaryDirectory() as tmpdir: + args = GaudiTrainingArguments( + tmpdir, + eval_strategy="epoch", + metric_for_best_model="eval_loss", + use_habana=True, + use_lazy_mode=True, + report_to="none", + ) + model = RegressionModel() + optimizer = torch.optim.SGD(model.parameters(), lr=1.0) + lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.2, patience=5, cooldown=2) + trainer = GaudiTrainer( + model, + gaudi_config, + args, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + optimizers=(optimizer, lr_scheduler), + ) + trainer.train() - self.assertIsInstance(trainer.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau) - self.assertEqual(trainer.lr_scheduler.factor, 0.2) - self.assertEqual(trainer.lr_scheduler.patience, 5) - self.assertEqual(trainer.lr_scheduler.cooldown, 2) + self.assertIsInstance(trainer.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau) + self.assertEqual(trainer.lr_scheduler.factor, 0.2) + self.assertEqual(trainer.lr_scheduler.patience, 5) + self.assertEqual(trainer.lr_scheduler.cooldown, 2) def test_reduce_lr_on_plateau(self): # test the ReduceLROnPlateau scheduler @@ -811,41 +821,44 @@ def log(self, logs): gaudi_config = get_gaudi_config() gaudi_config.use_fused_adam = False - args = GaudiTrainingArguments( - "./regression", - lr_scheduler_type="reduce_lr_on_plateau", - eval_strategy="epoch", - metric_for_best_model="eval_loss", - num_train_epochs=10, - learning_rate=0.2, - report_to="none", - use_habana=True, - use_lazy_mode=True, - ) - model = RegressionModel() - trainer = TrainerWithLRLogs(model, gaudi_config, args, train_dataset=train_dataset, eval_dataset=eval_dataset) - trainer.train() + with tempfile.TemporaryDirectory() as tmpdir: + args = GaudiTrainingArguments( + tmpdir, + lr_scheduler_type="reduce_lr_on_plateau", + eval_strategy="epoch", + metric_for_best_model="eval_loss", + num_train_epochs=10, + learning_rate=0.2, + report_to="none", + use_habana=True, + use_lazy_mode=True, + ) + model = RegressionModel() + trainer = 
TrainerWithLRLogs( + model, gaudi_config, args, train_dataset=train_dataset, eval_dataset=eval_dataset + ) + trainer.train() - self.assertIsInstance(trainer.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau) - patience = trainer.lr_scheduler.patience - - logs = trainer.state.log_history[1:] - best_loss = logs[0]["eval_loss"] - bad_epochs = 0 - for i, log in enumerate(logs[:-1]): # Compare learning rate to next epoch's - loss = log["eval_loss"] - just_decreased = False - if loss > best_loss: - bad_epochs += 1 - if bad_epochs > patience: - self.assertLess(logs[i + 1]["learning_rate"], log["learning_rate"]) - just_decreased = True + self.assertIsInstance(trainer.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau) + patience = trainer.lr_scheduler.patience + + logs = trainer.state.log_history[1:] + best_loss = logs[0]["eval_loss"] + bad_epochs = 0 + for i, log in enumerate(logs[:-1]): # Compare learning rate to next epoch's + loss = log["eval_loss"] + just_decreased = False + if loss > best_loss: + bad_epochs += 1 + if bad_epochs > patience: + self.assertLess(logs[i + 1]["learning_rate"], log["learning_rate"]) + just_decreased = True + bad_epochs = 0 + else: + best_loss = loss bad_epochs = 0 - else: - best_loss = loss - bad_epochs = 0 - if not just_decreased: - self.assertEqual(logs[i + 1]["learning_rate"], log["learning_rate"]) + if not just_decreased: + self.assertEqual(logs[i + 1]["learning_rate"], log["learning_rate"]) def test_adafactor_lr_none(self): # test the special case where lr=None, since Trainer can't not have lr_scheduler @@ -853,28 +866,32 @@ def test_adafactor_lr_none(self): from transformers.optimization import Adafactor, AdafactorSchedule train_dataset = RegressionDataset() - args = GaudiTrainingArguments("./regression", use_habana=True, use_lazy_mode=True, report_to="none") - gaudi_config = get_gaudi_config() - gaudi_config.use_fused_adam = False - model = RegressionModel().to("hpu") - optimizer = Adafactor(model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None) - lr_scheduler = AdafactorSchedule(optimizer) - trainer = GaudiTrainer( - model, gaudi_config, args, train_dataset=train_dataset, optimizers=(optimizer, lr_scheduler) - ) - trainer.train() + with tempfile.TemporaryDirectory() as tmpdir: + args = GaudiTrainingArguments(tmpdir, use_habana=True, use_lazy_mode=True, report_to="none") + gaudi_config = get_gaudi_config() + gaudi_config.use_fused_adam = False + model = RegressionModel().to("hpu") + optimizer = Adafactor( + model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None + ) + lr_scheduler = AdafactorSchedule(optimizer) + trainer = GaudiTrainer( + model, gaudi_config, args, train_dataset=train_dataset, optimizers=(optimizer, lr_scheduler) + ) + trainer.train() - (a, b) = self.default_trained_model - self.assertFalse(torch.allclose(trainer.model.a, a)) - self.assertFalse(torch.allclose(trainer.model.b, b)) - self.assertGreater(trainer.optimizer.state_dict()["param_groups"][0]["lr"], 0) + (a, b) = self.default_trained_model + self.assertFalse(torch.allclose(trainer.model.a, a)) + self.assertFalse(torch.allclose(trainer.model.b, b)) + self.assertGreater(trainer.optimizer.state_dict()["param_groups"][0]["lr"], 0) def test_mixed_bf16(self): - # very basic test - trainer = get_regression_trainer(learning_rate=0.1, bf16=True) - self.assertTrue(trainer.use_hpu_amp) - trainer.train() - self.check_trained_model(trainer.model, bf16=True) + with tempfile.TemporaryDirectory() as tmpdir: + # very basic 
test + trainer = get_regression_trainer(output_dir=tmpdir, learning_rate=0.1, bf16=True) + self.assertTrue(trainer.use_hpu_amp) + trainer.train() + self.check_trained_model(trainer.model, bf16=True) @require_torch @@ -893,83 +910,89 @@ def test_eager_mode(self): eval_dataset = RegressionDataset() model = RegressionModel() gaudi_config = get_gaudi_config() - args = GaudiTrainingArguments("./regression", use_habana=True, use_lazy_mode=False) - trainer = GaudiTrainer(model, gaudi_config, args, train_dataset=train_dataset, eval_dataset=eval_dataset) - trainer.train() - _ = trainer.evaluate() - _ = trainer.predict(eval_dataset) + with tempfile.TemporaryDirectory() as tmpdir: + args = GaudiTrainingArguments(tmpdir, use_habana=True, use_lazy_mode=False) + trainer = GaudiTrainer(model, gaudi_config, args, train_dataset=train_dataset, eval_dataset=eval_dataset) + trainer.train() + _ = trainer.evaluate() + _ = trainer.predict(eval_dataset) def test_hpu_graphs(self): train_dataset = RegressionDataset() eval_dataset = RegressionDataset() model = RegressionModel() gaudi_config = get_gaudi_config() - args = GaudiTrainingArguments( - "./regression", - use_habana=True, - use_lazy_mode=True, - use_hpu_graphs_for_training=True, - use_hpu_graphs_for_inference=True, - disable_tensor_cache_hpu_graphs=True, - max_hpu_graphs=1, - ) - trainer = GaudiTrainer(model, gaudi_config, args, train_dataset=train_dataset, eval_dataset=eval_dataset) - trainer.train() - _ = trainer.evaluate() - _ = trainer.predict(eval_dataset) + with tempfile.TemporaryDirectory() as tmpdir: + args = GaudiTrainingArguments( + tmpdir, + use_habana=True, + use_lazy_mode=True, + use_hpu_graphs_for_training=True, + use_hpu_graphs_for_inference=True, + disable_tensor_cache_hpu_graphs=True, + max_hpu_graphs=1, + ) + trainer = GaudiTrainer(model, gaudi_config, args, train_dataset=train_dataset, eval_dataset=eval_dataset) + trainer.train() + _ = trainer.evaluate() + _ = trainer.predict(eval_dataset) def test_trainer_works_with_dict(self): train_dataset = RegressionDataset() eval_dataset = RegressionDataset() model = RegressionDictModel() gaudi_config = get_gaudi_config() - args = GaudiTrainingArguments("./regression", use_habana=True, use_lazy_mode=True, report_to="none") - trainer = GaudiTrainer(model, gaudi_config, args, train_dataset=train_dataset, eval_dataset=eval_dataset) - trainer.train() - _ = trainer.evaluate() - _ = trainer.predict(eval_dataset) + with tempfile.TemporaryDirectory() as tmpdir: + args = GaudiTrainingArguments(tmpdir, use_habana=True, use_lazy_mode=True, report_to="none") + trainer = GaudiTrainer(model, gaudi_config, args, train_dataset=train_dataset, eval_dataset=eval_dataset) + trainer.train() + _ = trainer.evaluate() + _ = trainer.predict(eval_dataset) def test_evaluation_with_keys_to_drop(self): - config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4) - tiny_gpt2 = GaudiGPT2LMHeadModel(config) - x = torch.randint(0, 100, (128,)) - eval_dataset = RepeatDataset(x) - args = GaudiTrainingArguments("./test", use_habana=True, use_lazy_mode=True, report_to="none") - gaudi_config = get_gaudi_config() - trainer = GaudiTrainer(tiny_gpt2, gaudi_config, args, eval_dataset=eval_dataset) - # By default the past_key_values are removed - result = trainer.predict(eval_dataset) - self.assertTrue(isinstance(result.predictions, np.ndarray)) - # We can still get them by setting ignore_keys to [] - result = trainer.predict(eval_dataset, ignore_keys=[]) - self.assertTrue(isinstance(result.predictions, tuple)) - 
self.assertEqual(len(result.predictions), 2) + with tempfile.TemporaryDirectory() as tmpdir: + config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4) + tiny_gpt2 = GaudiGPT2LMHeadModel(config) + x = torch.randint(0, 100, (128,)) + eval_dataset = RepeatDataset(x) + args = GaudiTrainingArguments(tmpdir, use_habana=True, use_lazy_mode=True, report_to="none") + gaudi_config = get_gaudi_config() + trainer = GaudiTrainer(tiny_gpt2, gaudi_config, args, eval_dataset=eval_dataset) + # By default the past_key_values are removed + result = trainer.predict(eval_dataset) + self.assertTrue(isinstance(result.predictions, np.ndarray)) + # We can still get them by setting ignore_keys to [] + result = trainer.predict(eval_dataset, ignore_keys=[]) + self.assertTrue(isinstance(result.predictions, tuple)) + self.assertEqual(len(result.predictions), 2) def test_training_arguments_are_left_untouched(self): - trainer = get_regression_trainer() - trainer.train() - args = GaudiTrainingArguments("./regression", use_habana=True, use_lazy_mode=True, report_to=[]) - dict1, dict2 = args.to_dict(), trainer.args.to_dict() - for key in dict1.keys(): - # Logging dir can be slightly different as they default to something with the time. - if key != "logging_dir": - self.assertEqual(dict1[key], dict2[key]) + with tempfile.TemporaryDirectory() as tmpdir: + trainer = get_regression_trainer(output_dir=tmpdir) + trainer.train() + args = GaudiTrainingArguments(tmpdir, use_habana=True, use_lazy_mode=True, report_to=[]) + dict1, dict2 = args.to_dict(), trainer.args.to_dict() + for key in dict1.keys(): + # Logging dir can be slightly different as they default to something with the time. + if key != "logging_dir": + self.assertEqual(dict1[key], dict2[key]) def test_number_of_steps_in_training(self): - # Regular training has n_epochs * len(train_dl) steps - trainer = get_regression_trainer(learning_rate=0.1) - train_output = trainer.train() - self.assertEqual(train_output.global_step, self.n_epochs * 64 / self.batch_size) + with tempfile.TemporaryDirectory() as tmpdir: + # Regular training has n_epochs * len(train_dl) steps + trainer = get_regression_trainer(output_dir=tmpdir, learning_rate=0.1) + train_output = trainer.train() + self.assertEqual(train_output.global_step, self.n_epochs * 64 / self.batch_size) - # Check passing num_train_epochs works (and a float version too): - trainer = get_regression_trainer(learning_rate=0.1, num_train_epochs=1.5) - train_output = trainer.train() - self.assertEqual(train_output.global_step, int(1.5 * 64 / self.batch_size)) + # Check passing num_train_epochs works (and a float version too): + trainer = get_regression_trainer(output_dir=tmpdir, learning_rate=0.1, num_train_epochs=1.5) + train_output = trainer.train() + self.assertEqual(train_output.global_step, int(1.5 * 64 / self.batch_size)) - # If we pass a max_steps, num_train_epochs is ignored - trainer = get_regression_trainer(learning_rate=0.1, max_steps=10) - train_output = trainer.train() - self.assertEqual(train_output.global_step, 10) + # If we pass a max_steps, num_train_epochs is ignored + trainer = get_regression_trainer(output_dir=tmpdir, learning_rate=0.1, max_steps=10) + train_output = trainer.train() + self.assertEqual(train_output.global_step, 10) @require_peft def test_multiple_peft_adapters(self): @@ -1082,69 +1105,77 @@ def test_logging_inf_nan_filter(self): x = torch.randint(0, 100, (128,)) train_dataset = RepeatDataset(x) - # GaudiTrainer without inf/nan filter - gaudi_config = get_gaudi_config() - 
args = GaudiTrainingArguments( - "./test", - learning_rate=1e9, - logging_steps=5, - logging_nan_inf_filter=False, - use_habana=True, - use_lazy_mode=True, - report_to="none", - ) - trainer = GaudiTrainer(tiny_gpt2, gaudi_config, args, train_dataset=train_dataset) - trainer.train() - log_history_no_filter = trainer.state.log_history - - # GaudiTrainer with inf/nan filter - args = GaudiTrainingArguments( - "./test", - learning_rate=1e9, - logging_steps=5, - logging_nan_inf_filter=True, - use_habana=True, - use_lazy_mode=True, - report_to="none", - ) - trainer = GaudiTrainer(tiny_gpt2, gaudi_config, args, train_dataset=train_dataset) - trainer.train() - log_history_filter = trainer.state.log_history + with tempfile.TemporaryDirectory() as tmpdir: + # GaudiTrainer without inf/nan filter + gaudi_config = get_gaudi_config() + args = GaudiTrainingArguments( + tmpdir, + learning_rate=1e9, + logging_steps=5, + logging_nan_inf_filter=False, + use_habana=True, + use_lazy_mode=True, + report_to="none", + ) + trainer = GaudiTrainer(tiny_gpt2, gaudi_config, args, train_dataset=train_dataset) + trainer.train() + log_history_no_filter = trainer.state.log_history + + # GaudiTrainer with inf/nan filter + args = GaudiTrainingArguments( + tmpdir, + learning_rate=1e9, + logging_steps=5, + logging_nan_inf_filter=True, + use_habana=True, + use_lazy_mode=True, + report_to="none", + ) + trainer = GaudiTrainer(tiny_gpt2, gaudi_config, args, train_dataset=train_dataset) + trainer.train() + log_history_filter = trainer.state.log_history - def is_any_loss_nan_or_inf(log_history): - losses = [l["loss"] for l in log_history[:-1]] - return any(math.isnan(x) for x in losses) or any(math.isinf(x) for x in losses) + def is_any_loss_nan_or_inf(log_history): + losses = [l["loss"] for l in log_history[:-1]] + return any(math.isnan(x) for x in losses) or any(math.isinf(x) for x in losses) - self.assertTrue(is_any_loss_nan_or_inf(log_history_no_filter)) - self.assertFalse(is_any_loss_nan_or_inf(log_history_filter)) + self.assertTrue(is_any_loss_nan_or_inf(log_history_no_filter)) + self.assertFalse(is_any_loss_nan_or_inf(log_history_filter)) def test_train_and_eval_dataloaders(self): - trainer = get_regression_trainer(learning_rate=0.1, per_device_train_batch_size=16) - self.assertEqual(trainer.get_train_dataloader().total_batch_size, 16) - trainer = get_regression_trainer(learning_rate=0.1, per_device_eval_batch_size=16) - self.assertEqual(trainer.get_eval_dataloader().total_batch_size, 16) - - # Check drop_last works - trainer = get_regression_trainer( - train_len=66, eval_len=74, learning_rate=0.1, per_device_train_batch_size=16, per_device_eval_batch_size=32 - ) - self.assertEqual(len(trainer.get_train_dataloader()), 66 // (16) + 1) - self.assertEqual(len(trainer.get_eval_dataloader()), 74 // (32) + 1) + with tempfile.TemporaryDirectory() as tmpdir: + trainer = get_regression_trainer(output_dir=tmpdir, learning_rate=0.1, per_device_train_batch_size=16) + self.assertEqual(trainer.get_train_dataloader().total_batch_size, 16) + trainer = get_regression_trainer(output_dir=tmpdir, learning_rate=0.1, per_device_eval_batch_size=16) + self.assertEqual(trainer.get_eval_dataloader().total_batch_size, 16) - trainer = get_regression_trainer( - train_len=66, - eval_len=74, - learning_rate=0.1, - per_device_train_batch_size=16, - per_device_eval_batch_size=32, - dataloader_drop_last=True, - ) - self.assertEqual(len(trainer.get_train_dataloader()), 66 // (16)) - self.assertEqual(len(trainer.get_eval_dataloader()), 74 // (32)) + # Check 
drop_last works + trainer = get_regression_trainer( + output_dir=tmpdir, + train_len=66, + eval_len=74, + learning_rate=0.1, + per_device_train_batch_size=16, + per_device_eval_batch_size=32, + ) + self.assertEqual(len(trainer.get_train_dataloader()), 66 // (16) + 1) + self.assertEqual(len(trainer.get_eval_dataloader()), 74 // (32) + 1) + + trainer = get_regression_trainer( + output_dir=tmpdir, + train_len=66, + eval_len=74, + learning_rate=0.1, + per_device_train_batch_size=16, + per_device_eval_batch_size=32, + dataloader_drop_last=True, + ) + self.assertEqual(len(trainer.get_train_dataloader()), 66 // (16)) + self.assertEqual(len(trainer.get_eval_dataloader()), 74 // (32)) - # Check passing a new dataset for evaluation works - new_eval_dataset = RegressionDataset(length=128) - self.assertEqual(len(trainer.get_eval_dataloader(new_eval_dataset)), 128 // (32)) + # Check passing a new dataset for evaluation works + new_eval_dataset = RegressionDataset(length=128) + self.assertEqual(len(trainer.get_eval_dataloader(new_eval_dataset)), 128 // (32)) # tests that we do not require dataloader to have a .dataset attribute def test_dataloader_without_dataset(self): @@ -1162,416 +1193,452 @@ def test_dataloader_without_dataset(self): trainer.evaluate() def test_get_eval_dataloader_without_persistent_workers(self): - train_dataset = RegressionDataset() - config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4) - tiny_gpt2 = GPT2LMHeadModel(config) - args = GaudiTrainingArguments( - "./test", - report_to="none", - dataloader_persistent_workers=False, - use_habana=True, - use_lazy_mode=True, - ) - - # Single evaluation dataset - eval_dataset = RegressionDataset() - gaudi_config = get_gaudi_config() - trainer = GaudiTrainer(tiny_gpt2, gaudi_config, args, train_dataset=train_dataset, eval_dataset=eval_dataset) - # Mocking the prepare method to avoid the dataloader changing with each call to get_eval_dataloader - trainer.accelerator.prepare = lambda x: x - - default_dataloader = trainer.get_eval_dataloader() - dataloader_with_dataset = trainer.get_eval_dataloader(eval_dataset) - - self.assertEqual(default_dataloader.dataset, eval_dataset) - self.assertEqual(dataloader_with_dataset.dataset, eval_dataset) - self.assertNotEqual(default_dataloader, dataloader_with_dataset) + with tempfile.TemporaryDirectory() as tmpdir: + train_dataset = RegressionDataset() + config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4) + tiny_gpt2 = GPT2LMHeadModel(config) + args = GaudiTrainingArguments( + tmpdir, + report_to="none", + dataloader_persistent_workers=False, + use_habana=True, + use_lazy_mode=True, + ) - # Multiple evaluation datasets - first_dataset = RegressionDataset() - second_dataset = RegressionDataset() - trainer = GaudiTrainer( - tiny_gpt2, - gaudi_config, - args, - train_dataset=train_dataset, - eval_dataset={"first": first_dataset, "second": second_dataset}, - ) - # Mocking the prepare method to avoid the dataloader changing with each call to get_eval_dataloader - trainer.accelerator.prepare = lambda x: x + # Single evaluation dataset + eval_dataset = RegressionDataset() + gaudi_config = get_gaudi_config() + trainer = GaudiTrainer( + tiny_gpt2, gaudi_config, args, train_dataset=train_dataset, eval_dataset=eval_dataset + ) + # Mocking the prepare method to avoid the dataloader changing with each call to get_eval_dataloader + trainer.accelerator.prepare = lambda x: x + + default_dataloader = trainer.get_eval_dataloader() + dataloader_with_dataset = 
trainer.get_eval_dataloader(eval_dataset) + + self.assertEqual(default_dataloader.dataset, eval_dataset) + self.assertEqual(dataloader_with_dataset.dataset, eval_dataset) + self.assertNotEqual(default_dataloader, dataloader_with_dataset) + + # Multiple evaluation datasets + first_dataset = RegressionDataset() + second_dataset = RegressionDataset() + trainer = GaudiTrainer( + tiny_gpt2, + gaudi_config, + args, + train_dataset=train_dataset, + eval_dataset={"first": first_dataset, "second": second_dataset}, + ) + # Mocking the prepare method to avoid the dataloader changing with each call to get_eval_dataloader + trainer.accelerator.prepare = lambda x: x - first_dataloader = trainer.get_eval_dataloader("first") - first_dataloader_repeated = trainer.get_eval_dataloader("first") - second_dataloader = trainer.get_eval_dataloader("second") - second_dataloader_repeated = trainer.get_eval_dataloader("second") + first_dataloader = trainer.get_eval_dataloader("first") + first_dataloader_repeated = trainer.get_eval_dataloader("first") + second_dataloader = trainer.get_eval_dataloader("second") + second_dataloader_repeated = trainer.get_eval_dataloader("second") - self.assertEqual(first_dataset, first_dataloader.dataset) - self.assertEqual(first_dataloader.dataset, first_dataloader_repeated.dataset) - self.assertEqual(second_dataset, second_dataloader.dataset) - self.assertEqual(second_dataloader.dataset, second_dataloader_repeated.dataset) - self.assertNotEqual(first_dataloader, first_dataloader_repeated) - self.assertNotEqual(second_dataloader, second_dataloader_repeated) + self.assertEqual(first_dataset, first_dataloader.dataset) + self.assertEqual(first_dataloader.dataset, first_dataloader_repeated.dataset) + self.assertEqual(second_dataset, second_dataloader.dataset) + self.assertEqual(second_dataloader.dataset, second_dataloader_repeated.dataset) + self.assertNotEqual(first_dataloader, first_dataloader_repeated) + self.assertNotEqual(second_dataloader, second_dataloader_repeated) def test_get_eval_dataloader_with_persistent_workers(self): - train_dataset = RegressionDataset() - config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4) - tiny_gpt2 = GPT2LMHeadModel(config) - args = GaudiTrainingArguments( - "./test", - report_to="none", - dataloader_persistent_workers=True, - dataloader_num_workers=2, - use_habana=True, - use_lazy_mode=True, - ) - - # Single evaluation dataset - eval_dataset = RegressionDataset() - gaudi_config = get_gaudi_config() - trainer = GaudiTrainer(tiny_gpt2, gaudi_config, args, train_dataset=train_dataset, eval_dataset=eval_dataset) - # Mocking the prepare method to avoid the dataloader changing with each call to get_eval_dataloader - trainer.accelerator.prepare = lambda x: x - - default_dataloader = trainer.get_eval_dataloader() - dataloader_with_dataset = trainer.get_eval_dataloader(eval_dataset) - - self.assertEqual(default_dataloader.dataset, eval_dataset) - self.assertEqual(dataloader_with_dataset.dataset, eval_dataset) - self.assertEqual(default_dataloader, dataloader_with_dataset) + with tempfile.TemporaryDirectory() as tmpdir: + train_dataset = RegressionDataset() + config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4) + tiny_gpt2 = GPT2LMHeadModel(config) + args = GaudiTrainingArguments( + tmpdir, + report_to="none", + dataloader_persistent_workers=True, + dataloader_num_workers=2, + use_habana=True, + use_lazy_mode=True, + ) - # Multiple evaluation datasets - first_dataset = RegressionDataset() - 
second_dataset = RegressionDataset() - trainer = GaudiTrainer( - tiny_gpt2, - gaudi_config, - args, - train_dataset=train_dataset, - eval_dataset={"first": first_dataset, "second": second_dataset}, - ) - # Mocking the prepare method to avoid the dataloader changing with each call to get_eval_dataloader - trainer.accelerator.prepare = lambda x: x + # Single evaluation dataset + eval_dataset = RegressionDataset() + gaudi_config = get_gaudi_config() + trainer = GaudiTrainer( + tiny_gpt2, gaudi_config, args, train_dataset=train_dataset, eval_dataset=eval_dataset + ) + # Mocking the prepare method to avoid the dataloader changing with each call to get_eval_dataloader + trainer.accelerator.prepare = lambda x: x + + default_dataloader = trainer.get_eval_dataloader() + dataloader_with_dataset = trainer.get_eval_dataloader(eval_dataset) + + self.assertEqual(default_dataloader.dataset, eval_dataset) + self.assertEqual(dataloader_with_dataset.dataset, eval_dataset) + self.assertEqual(default_dataloader, dataloader_with_dataset) + + # Multiple evaluation datasets + first_dataset = RegressionDataset() + second_dataset = RegressionDataset() + trainer = GaudiTrainer( + tiny_gpt2, + gaudi_config, + args, + train_dataset=train_dataset, + eval_dataset={"first": first_dataset, "second": second_dataset}, + ) + # Mocking the prepare method to avoid the dataloader changing with each call to get_eval_dataloader + trainer.accelerator.prepare = lambda x: x - first_dataloader = trainer.get_eval_dataloader("first") - first_dataloader_repeated = trainer.get_eval_dataloader("first") - second_dataloader = trainer.get_eval_dataloader("second") - second_dataloader_repeated = trainer.get_eval_dataloader("second") + first_dataloader = trainer.get_eval_dataloader("first") + first_dataloader_repeated = trainer.get_eval_dataloader("first") + second_dataloader = trainer.get_eval_dataloader("second") + second_dataloader_repeated = trainer.get_eval_dataloader("second") - self.assertEqual(first_dataset, first_dataloader.dataset) - self.assertEqual(first_dataloader.dataset, first_dataloader_repeated.dataset) - self.assertEqual(second_dataset, second_dataloader.dataset) - self.assertEqual(second_dataloader.dataset, second_dataloader_repeated.dataset) - self.assertEqual(first_dataloader, first_dataloader_repeated) - self.assertEqual(second_dataloader, second_dataloader_repeated) + self.assertEqual(first_dataset, first_dataloader.dataset) + self.assertEqual(first_dataloader.dataset, first_dataloader_repeated.dataset) + self.assertEqual(second_dataset, second_dataloader.dataset) + self.assertEqual(second_dataloader.dataset, second_dataloader_repeated.dataset) + self.assertEqual(first_dataloader, first_dataloader_repeated) + self.assertEqual(second_dataloader, second_dataloader_repeated) def test_data_is_not_parallelized_when_model_is_parallel(self): model = RegressionModel() # Make the Trainer believe it's a parallelized model model.is_parallelizable = True model.model_parallel = True - args = GaudiTrainingArguments( - "./regression", - per_device_train_batch_size=16, - per_device_eval_batch_size=16, - use_habana=True, - use_lazy_mode=True, - report_to="none", - ) - gaudi_config = get_gaudi_config() - trainer = GaudiTrainer( - model, gaudi_config, args, train_dataset=RegressionDataset(), eval_dataset=RegressionDataset() - ) - # Check the Trainer was fooled - self.assertTrue(trainer.is_model_parallel) - self.assertEqual(trainer.args.n_gpu, 1) + with tempfile.TemporaryDirectory() as tmpdir: + args = GaudiTrainingArguments( + tmpdir, + 
per_device_train_batch_size=16, + per_device_eval_batch_size=16, + use_habana=True, + use_lazy_mode=True, + report_to="none", + ) + gaudi_config = get_gaudi_config() + trainer = GaudiTrainer( + model, gaudi_config, args, train_dataset=RegressionDataset(), eval_dataset=RegressionDataset() + ) + # Check the Trainer was fooled + self.assertTrue(trainer.is_model_parallel) + self.assertEqual(trainer.args.n_gpu, 1) - # The batch size of the training and evaluation dataloaders should be 16, not 16 * n_gpu - self.assertEqual(trainer.get_train_dataloader().total_batch_size, 16) - self.assertEqual(len(trainer.get_train_dataloader()), 64 // 16) - self.assertEqual(trainer.get_eval_dataloader().total_batch_size, 16) - self.assertEqual(len(trainer.get_eval_dataloader()), 64 // 16) + # The batch size of the training and evaluation dataloaders should be 16, not 16 * n_gpu + self.assertEqual(trainer.get_train_dataloader().total_batch_size, 16) + self.assertEqual(len(trainer.get_train_dataloader()), 64 // 16) + self.assertEqual(trainer.get_eval_dataloader().total_batch_size, 16) + self.assertEqual(len(trainer.get_eval_dataloader()), 64 // 16) def test_evaluate(self): - trainer = get_regression_trainer(a=1.5, b=2.5, compute_metrics=AlmostAccuracy()) - results = trainer.evaluate() - - x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] - pred = 1.5 * x + 2.5 - expected_loss = ((pred - y) ** 2).mean() - self.assertAlmostEqual(results["eval_loss"], expected_loss) - expected_acc = AlmostAccuracy()((pred, y))["accuracy"] - self.assertAlmostEqual(results["eval_accuracy"], expected_acc) - - # With a number of elements not a round multiple of the batch size - trainer = get_regression_trainer(a=1.5, b=2.5, eval_len=66, compute_metrics=AlmostAccuracy()) - results = trainer.evaluate() - - x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] - pred = 1.5 * x + 2.5 - expected_loss = ((pred - y) ** 2).mean() - self.assertAlmostEqual(results["eval_loss"], expected_loss) - expected_acc = AlmostAccuracy()((pred, y))["accuracy"] - self.assertAlmostEqual(results["eval_accuracy"], expected_acc) - - # With logits preprocess - trainer = get_regression_trainer( - a=1.5, - b=2.5, - compute_metrics=AlmostAccuracy(), - preprocess_logits_for_metrics=lambda logits, labels: logits + 1, - ) - results = trainer.evaluate() + with tempfile.TemporaryDirectory() as tmpdir: + trainer = get_regression_trainer(output_dir=tmpdir, a=1.5, b=2.5, compute_metrics=AlmostAccuracy()) + results = trainer.evaluate() + + x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] + pred = 1.5 * x + 2.5 + expected_loss = ((pred - y) ** 2).mean() + self.assertAlmostEqual(results["eval_loss"], expected_loss) + expected_acc = AlmostAccuracy()((pred, y))["accuracy"] + self.assertAlmostEqual(results["eval_accuracy"], expected_acc) + + # With a number of elements not a round multiple of the batch size + trainer = get_regression_trainer( + output_dir=tmpdir, a=1.5, b=2.5, eval_len=66, compute_metrics=AlmostAccuracy() + ) + results = trainer.evaluate() - x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] - pred = 1.5 * x + 2.5 - expected_loss = ((pred - y) ** 2).mean() - self.assertAlmostEqual(results["eval_loss"], expected_loss) - expected_acc = AlmostAccuracy()((pred + 1, y))["accuracy"] - self.assertAlmostEqual(results["eval_accuracy"], expected_acc) + x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] + pred = 1.5 * x + 2.5 + expected_loss = ((pred - y) ** 2).mean() + self.assertAlmostEqual(results["eval_loss"], expected_loss) + 
expected_acc = AlmostAccuracy()((pred, y))["accuracy"] + self.assertAlmostEqual(results["eval_accuracy"], expected_acc) + + # With logits preprocess + trainer = get_regression_trainer( + output_dir=tmpdir, + a=1.5, + b=2.5, + compute_metrics=AlmostAccuracy(), + preprocess_logits_for_metrics=lambda logits, labels: logits + 1, + ) + results = trainer.evaluate() + + x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] + pred = 1.5 * x + 2.5 + expected_loss = ((pred - y) ** 2).mean() + self.assertAlmostEqual(results["eval_loss"], expected_loss) + expected_acc = AlmostAccuracy()((pred + 1, y))["accuracy"] + self.assertAlmostEqual(results["eval_accuracy"], expected_acc) def test_evaluate_with_batch_eval_metrics(self): - trainer = get_regression_trainer( - a=1.5, b=2.5, compute_metrics=AlmostAccuracyBatched(), batch_eval_metrics=True - ) - results = trainer.evaluate() - - x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] - pred = 1.5 * x + 2.5 - expected_loss = ((pred - y) ** 2).mean() - self.assertAlmostEqual(results["eval_loss"], expected_loss) - expected_acc = AlmostAccuracy()((pred, y))["accuracy"] - self.assertAlmostEqual(results["eval_accuracy"], expected_acc) - - # With a number of elements not a round multiple of the batch size - trainer = get_regression_trainer( - a=1.5, b=2.5, eval_len=66, compute_metrics=AlmostAccuracyBatched(), batch_eval_metrics=True - ) - results = trainer.evaluate() - - x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] - pred = 1.5 * x + 2.5 - expected_loss = ((pred - y) ** 2).mean() - self.assertAlmostEqual(results["eval_loss"], expected_loss) - expected_acc = AlmostAccuracy()((pred, y))["accuracy"] - self.assertAlmostEqual(results["eval_accuracy"], expected_acc) - - # With logits preprocess - trainer = get_regression_trainer( - a=1.5, - b=2.5, - compute_metrics=AlmostAccuracyBatched(), - batch_eval_metrics=True, - preprocess_logits_for_metrics=lambda logits, labels: logits + 1, - ) - results = trainer.evaluate() + with tempfile.TemporaryDirectory() as tmpdir: + trainer = get_regression_trainer( + output_dir=tmpdir, a=1.5, b=2.5, compute_metrics=AlmostAccuracyBatched(), batch_eval_metrics=True + ) + results = trainer.evaluate() + + x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] + pred = 1.5 * x + 2.5 + expected_loss = ((pred - y) ** 2).mean() + self.assertAlmostEqual(results["eval_loss"], expected_loss) + expected_acc = AlmostAccuracy()((pred, y))["accuracy"] + self.assertAlmostEqual(results["eval_accuracy"], expected_acc) + + # With a number of elements not a round multiple of the batch size + trainer = get_regression_trainer( + output_dir=tmpdir, + a=1.5, + b=2.5, + eval_len=66, + compute_metrics=AlmostAccuracyBatched(), + batch_eval_metrics=True, + ) + results = trainer.evaluate() + + x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] + pred = 1.5 * x + 2.5 + expected_loss = ((pred - y) ** 2).mean() + self.assertAlmostEqual(results["eval_loss"], expected_loss) + expected_acc = AlmostAccuracy()((pred, y))["accuracy"] + self.assertAlmostEqual(results["eval_accuracy"], expected_acc) - x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] - pred = 1.5 * x + 2.5 - expected_loss = ((pred - y) ** 2).mean() - self.assertAlmostEqual(results["eval_loss"], expected_loss) - expected_acc = AlmostAccuracy()((pred + 1, y))["accuracy"] - self.assertAlmostEqual(results["eval_accuracy"], expected_acc) + # With logits preprocess + trainer = get_regression_trainer( + output_dir=tmpdir, + a=1.5, + b=2.5, + 
compute_metrics=AlmostAccuracyBatched(), + batch_eval_metrics=True, + preprocess_logits_for_metrics=lambda logits, labels: logits + 1, + ) + results = trainer.evaluate() + + x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] + pred = 1.5 * x + 2.5 + expected_loss = ((pred - y) ** 2).mean() + self.assertAlmostEqual(results["eval_loss"], expected_loss) + expected_acc = AlmostAccuracy()((pred + 1, y))["accuracy"] + self.assertAlmostEqual(results["eval_accuracy"], expected_acc) def test_predict(self): - trainer = get_regression_trainer(a=1.5, b=2.5) - preds = trainer.predict(trainer.eval_dataset).predictions - x = trainer.eval_dataset.x - self.assertTrue(np.allclose(preds, 1.5 * x + 2.5)) - - # With a number of elements not a round multiple of the batch size - trainer = get_regression_trainer(a=1.5, b=2.5, eval_len=66) - preds = trainer.predict(trainer.eval_dataset).predictions - x = trainer.eval_dataset.x - self.assertTrue(np.allclose(preds, 1.5 * x + 2.5)) - - # With more than one output of the model - trainer = get_regression_trainer(a=1.5, b=2.5, double_output=True) - preds = trainer.predict(trainer.eval_dataset).predictions - x = trainer.eval_dataset.x - self.assertEqual(len(preds), 2) - self.assertTrue(np.allclose(preds[0], 1.5 * x + 2.5)) - self.assertTrue(np.allclose(preds[1], 1.5 * x + 2.5)) - - # With more than one output/label of the model - trainer = get_regression_trainer(a=1.5, b=2.5, double_output=True, label_names=["labels", "labels_2"]) - outputs = trainer.predict(trainer.eval_dataset) - preds = outputs.predictions - labels = outputs.label_ids - x = trainer.eval_dataset.x - self.assertEqual(len(preds), 2) - self.assertTrue(np.allclose(preds[0], 1.5 * x + 2.5)) - self.assertTrue(np.allclose(preds[1], 1.5 * x + 2.5)) - self.assertTrue(np.array_equal(labels[0], trainer.eval_dataset.ys[0])) - self.assertTrue(np.array_equal(labels[1], trainer.eval_dataset.ys[1])) + with tempfile.TemporaryDirectory() as tmpdir: + trainer = get_regression_trainer(output_dir=tmpdir, a=1.5, b=2.5) + preds = trainer.predict(trainer.eval_dataset).predictions + x = trainer.eval_dataset.x + self.assertTrue(np.allclose(preds, 1.5 * x + 2.5)) + + # With a number of elements not a round multiple of the batch size + trainer = get_regression_trainer(output_dir=tmpdir, a=1.5, b=2.5, eval_len=66) + preds = trainer.predict(trainer.eval_dataset).predictions + x = trainer.eval_dataset.x + self.assertTrue(np.allclose(preds, 1.5 * x + 2.5)) + + # With more than one output of the model + trainer = get_regression_trainer(output_dir=tmpdir, a=1.5, b=2.5, double_output=True) + preds = trainer.predict(trainer.eval_dataset).predictions + x = trainer.eval_dataset.x + self.assertEqual(len(preds), 2) + self.assertTrue(np.allclose(preds[0], 1.5 * x + 2.5)) + self.assertTrue(np.allclose(preds[1], 1.5 * x + 2.5)) + + # With more than one output/label of the model + trainer = get_regression_trainer( + output_dir=tmpdir, a=1.5, b=2.5, double_output=True, label_names=["labels", "labels_2"] + ) + outputs = trainer.predict(trainer.eval_dataset) + preds = outputs.predictions + labels = outputs.label_ids + x = trainer.eval_dataset.x + self.assertEqual(len(preds), 2) + self.assertTrue(np.allclose(preds[0], 1.5 * x + 2.5)) + self.assertTrue(np.allclose(preds[1], 1.5 * x + 2.5)) + self.assertTrue(np.array_equal(labels[0], trainer.eval_dataset.ys[0])) + self.assertTrue(np.array_equal(labels[1], trainer.eval_dataset.ys[1])) def test_predict_with_batch_eval_metrics(self): - trainer = get_regression_trainer( - a=1.5, b=2.5, 
compute_metrics=AlmostAccuracyBatched(), batch_eval_metrics=True - ) - results = trainer.predict(trainer.eval_dataset) - preds = results.predictions - x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] - gt = 1.5 * x + 2.5 - self.assertTrue(np.allclose(preds, gt)) - expected_acc = AlmostAccuracy()((preds, y))["accuracy"] - self.assertAlmostEqual(results.metrics["test_accuracy"], expected_acc) - - # With a number of elements not a round multiple of the batch size - trainer = get_regression_trainer( - a=1.5, b=2.5, eval_len=66, compute_metrics=AlmostAccuracyBatched(), batch_eval_metrics=True - ) - results = trainer.predict(trainer.eval_dataset) - preds = results.predictions - x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] - self.assertTrue(np.allclose(preds, 1.5 * x + 2.5)) - expected_acc = AlmostAccuracy()((preds, y))["accuracy"] - self.assertAlmostEqual(results.metrics["test_accuracy"], expected_acc) - - # With more than one output of the model - trainer = get_regression_trainer( - a=1.5, b=2.5, double_output=True, compute_metrics=AlmostAccuracyBatched(), batch_eval_metrics=True - ) - preds = trainer.predict(trainer.eval_dataset).predictions - x = trainer.eval_dataset.x - self.assertEqual(len(preds), 2) - self.assertTrue(np.allclose(preds[0], 1.5 * x + 2.5)) - self.assertTrue(np.allclose(preds[1], 1.5 * x + 2.5)) - - # With more than one output/label of the model - trainer = get_regression_trainer( - a=1.5, - b=2.5, - double_output=True, - label_names=["labels", "labels_2"], - compute_metrics=AlmostAccuracyBatched(), - batch_eval_metrics=True, - ) - outputs = trainer.predict(trainer.eval_dataset) - preds = outputs.predictions - labels = outputs.label_ids - x = trainer.eval_dataset.x - self.assertEqual(len(preds), 2) - self.assertTrue(np.allclose(preds[0], 1.5 * x + 2.5)) - self.assertTrue(np.allclose(preds[1], 1.5 * x + 2.5)) - self.assertTrue(np.array_equal(labels[0], trainer.eval_dataset.ys[0])) - self.assertTrue(np.array_equal(labels[1], trainer.eval_dataset.ys[1])) + with tempfile.TemporaryDirectory() as tmpdir: + trainer = get_regression_trainer( + output_dir=tmpdir, a=1.5, b=2.5, compute_metrics=AlmostAccuracyBatched(), batch_eval_metrics=True + ) + results = trainer.predict(trainer.eval_dataset) + preds = results.predictions + x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] + gt = 1.5 * x + 2.5 + self.assertTrue(np.allclose(preds, gt)) + expected_acc = AlmostAccuracy()((preds, y))["accuracy"] + self.assertAlmostEqual(results.metrics["test_accuracy"], expected_acc) + + # With a number of elements not a round multiple of the batch size + trainer = get_regression_trainer( + output_dir=tmpdir, + a=1.5, + b=2.5, + eval_len=66, + compute_metrics=AlmostAccuracyBatched(), + batch_eval_metrics=True, + ) + results = trainer.predict(trainer.eval_dataset) + preds = results.predictions + x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] + self.assertTrue(np.allclose(preds, 1.5 * x + 2.5)) + expected_acc = AlmostAccuracy()((preds, y))["accuracy"] + self.assertAlmostEqual(results.metrics["test_accuracy"], expected_acc) + + # With more than one output of the model + trainer = get_regression_trainer( + output_dir=tmpdir, + a=1.5, + b=2.5, + double_output=True, + compute_metrics=AlmostAccuracyBatched(), + batch_eval_metrics=True, + ) + preds = trainer.predict(trainer.eval_dataset).predictions + x = trainer.eval_dataset.x + self.assertEqual(len(preds), 2) + self.assertTrue(np.allclose(preds[0], 1.5 * x + 2.5)) + self.assertTrue(np.allclose(preds[1], 1.5 * x + 2.5)) 
+ + # With more than one output/label of the model + trainer = get_regression_trainer( + output_dir=tmpdir, + a=1.5, + b=2.5, + double_output=True, + label_names=["labels", "labels_2"], + compute_metrics=AlmostAccuracyBatched(), + batch_eval_metrics=True, + ) + outputs = trainer.predict(trainer.eval_dataset) + preds = outputs.predictions + labels = outputs.label_ids + x = trainer.eval_dataset.x + self.assertEqual(len(preds), 2) + self.assertTrue(np.allclose(preds[0], 1.5 * x + 2.5)) + self.assertTrue(np.allclose(preds[1], 1.5 * x + 2.5)) + self.assertTrue(np.array_equal(labels[0], trainer.eval_dataset.ys[0])) + self.assertTrue(np.array_equal(labels[1], trainer.eval_dataset.ys[1])) def test_dynamic_shapes(self): eval_dataset = DynamicShapesDataset(batch_size=self.batch_size) model = RegressionModel(a=2, b=1) - args = GaudiTrainingArguments("./regression", use_habana=True, use_lazy_mode=True) - gaudi_config = get_gaudi_config() - gaudi_config.use_dynamic_shapes = True - trainer = GaudiTrainer(model, gaudi_config, args, eval_dataset=eval_dataset) + with tempfile.TemporaryDirectory() as tmpdir: + args = GaudiTrainingArguments(tmpdir, use_habana=True, use_lazy_mode=True) + gaudi_config = get_gaudi_config() + gaudi_config.use_dynamic_shapes = True + trainer = GaudiTrainer(model, gaudi_config, args, eval_dataset=eval_dataset) - # Check evaluation can run to completion - _ = trainer.evaluate() + # Check evaluation can run to completion + _ = trainer.evaluate() - # Check predictions - preds = trainer.predict(eval_dataset) - for expected, seen in zip(eval_dataset.ys, preds.label_ids): - self.assertTrue(np.allclose(expected, seen[: expected.shape[0]])) - self.assertTrue(np.all(seen[expected.shape[0] :] == -100)) + # Check predictions + preds = trainer.predict(eval_dataset) + for expected, seen in zip(eval_dataset.ys, preds.label_ids): + self.assertTrue(np.allclose(expected, seen[: expected.shape[0]])) + self.assertTrue(np.all(seen[expected.shape[0] :] == -100)) - for expected, seen in zip(eval_dataset.xs, preds.predictions): - self.assertTrue(np.allclose(2 * expected + 1, seen[: expected.shape[0]])) - self.assertTrue(np.all(seen[expected.shape[0] :] == -100)) + for expected, seen in zip(eval_dataset.xs, preds.predictions): + self.assertTrue(np.allclose(2 * expected + 1, seen[: expected.shape[0]])) + self.assertTrue(np.all(seen[expected.shape[0] :] == -100)) - # Same tests with eval accumulation - args = GaudiTrainingArguments("./regression", use_habana=True, use_lazy_mode=True, eval_accumulation_steps=2) - trainer = GaudiTrainer(model, gaudi_config, args, eval_dataset=eval_dataset) + # Same tests with eval accumulation + args = GaudiTrainingArguments(tmpdir, use_habana=True, use_lazy_mode=True, eval_accumulation_steps=2) + trainer = GaudiTrainer(model, gaudi_config, args, eval_dataset=eval_dataset) - # Check evaluation can run to completion - _ = trainer.evaluate() + # Check evaluation can run to completion + _ = trainer.evaluate() - # Check predictions - preds = trainer.predict(eval_dataset) - for expected, seen in zip(eval_dataset.ys, preds.label_ids): - self.assertTrue(np.allclose(expected, seen[: expected.shape[0]])) - self.assertTrue(np.all(seen[expected.shape[0] :] == -100)) + # Check predictions + preds = trainer.predict(eval_dataset) + for expected, seen in zip(eval_dataset.ys, preds.label_ids): + self.assertTrue(np.allclose(expected, seen[: expected.shape[0]])) + self.assertTrue(np.all(seen[expected.shape[0] :] == -100)) - for expected, seen in zip(eval_dataset.xs, preds.predictions): - 
self.assertTrue(np.allclose(2 * expected + 1, seen[: expected.shape[0]])) - self.assertTrue(np.all(seen[expected.shape[0] :] == -100)) + for expected, seen in zip(eval_dataset.xs, preds.predictions): + self.assertTrue(np.allclose(2 * expected + 1, seen[: expected.shape[0]])) + self.assertTrue(np.all(seen[expected.shape[0] :] == -100)) def test_dynamic_shape_feature(self): # Run training with variable length inputs and enable dynamic shapes support train_dataset = RegressionDatasetDynamic(length=256) gaudi_config = get_gaudi_config() gaudi_config.use_dynamic_shapes = True - args = GaudiTrainingArguments( - "./regression", - use_habana=True, - use_lazy_mode=True, - per_device_train_batch_size=1, - num_train_epochs=1, - report_to="none", - ) - model = RegressionModel() - trainer = GaudiTrainer( - model, - gaudi_config, - args, - train_dataset=train_dataset, - ) - train_output_ds = trainer.train() + with tempfile.TemporaryDirectory() as tmpdir: + args = GaudiTrainingArguments( + tmpdir, + use_habana=True, + use_lazy_mode=True, + per_device_train_batch_size=1, + num_train_epochs=1, + report_to="none", + ) + model = RegressionModel() + trainer = GaudiTrainer( + model, + gaudi_config, + args, + train_dataset=train_dataset, + ) + train_output_ds = trainer.train() - # Run training again with variable length inputs and disable dynamic shapes support - train_dataset = RegressionDatasetDynamic(length=256) - gaudi_config = get_gaudi_config() - gaudi_config.use_dynamic_shapes = False - args = GaudiTrainingArguments( - "./regression", - use_habana=True, - use_lazy_mode=True, - per_device_train_batch_size=1, - num_train_epochs=1, - report_to="none", - ) - model = RegressionModel() - trainer = GaudiTrainer( - model, - gaudi_config, - args, - train_dataset=train_dataset, - ) - train_output_static = trainer.train() + # Run training again with variable length inputs and disable dynamic shapes support + train_dataset = RegressionDatasetDynamic(length=256) + gaudi_config = get_gaudi_config() + gaudi_config.use_dynamic_shapes = False + args = GaudiTrainingArguments( + tmpdir, + use_habana=True, + use_lazy_mode=True, + per_device_train_batch_size=1, + num_train_epochs=1, + report_to="none", + ) + model = RegressionModel() + trainer = GaudiTrainer( + model, + gaudi_config, + args, + train_dataset=train_dataset, + ) + train_output_static = trainer.train() - # Check if performance with dynamic shapes support is at least 5 times that without dynamic shapes - # Note "5x" number is not applicable across models, it is tuned for this particular dummy model - self.assertGreaterEqual( - train_output_ds.metrics["train_samples_per_second"], - 5 * train_output_static.metrics["train_samples_per_second"], - ) + # Check if performance with dynamic shapes support is at least 5 times that without dynamic shapes + # Note "5x" number is not applicable across models, it is tuned for this particular dummy model + self.assertGreaterEqual( + train_output_ds.metrics["train_samples_per_second"], + 5 * train_output_static.metrics["train_samples_per_second"], + ) def test_log_level(self): # testing only --log_level (--log_level_replica requires multiple gpus and DDP and is tested elsewhere) logger = logging.get_logger() log_info_string = "Running training" - # test with the default log_level - should be the same as before and thus we test depending on is_info - is_info = logging.get_verbosity() <= 20 - with CaptureLogger(logger) as cl: - trainer = get_regression_trainer() - trainer.train() - if is_info: - self.assertIn(log_info_string, 
cl.out) - else: - self.assertNotIn(log_info_string, cl.out) - - with LoggingLevel(logging.INFO): - # test with low log_level - lower than info + with tempfile.TemporaryDirectory() as tmpdir: + # test with the default log_level - should be the same as before and thus we test depending on is_info + is_info = logging.get_verbosity() <= 20 with CaptureLogger(logger) as cl: - trainer = get_regression_trainer(log_level="debug") + trainer = get_regression_trainer(output_dir=tmpdir) trainer.train() - self.assertIn(log_info_string, cl.out) + if is_info: + self.assertIn(log_info_string, cl.out) + else: + self.assertNotIn(log_info_string, cl.out) - with LoggingLevel(logging.INFO): - # test with high log_level - should be quiet - with CaptureLogger(logger) as cl: - trainer = get_regression_trainer(log_level="error") - trainer.train() - self.assertNotIn(log_info_string, cl.out) + with LoggingLevel(logging.INFO): + # test with low log_level - lower than info + with CaptureLogger(logger) as cl: + trainer = get_regression_trainer(output_dir=tmpdir, log_level="debug") + trainer.train() + self.assertIn(log_info_string, cl.out) + + with LoggingLevel(logging.INFO): + # test with high log_level - should be quiet + with CaptureLogger(logger) as cl: + trainer = get_regression_trainer(output_dir=tmpdir, log_level="error") + trainer.train() + self.assertNotIn(log_info_string, cl.out) def test_save_checkpoints(self): with tempfile.TemporaryDirectory() as tmpdir: @@ -2063,98 +2130,108 @@ def test_load_best_model_from_safetensors(self): ) def test_training_iterable_dataset(self): - config = RegressionModelConfig() - model = RegressionPreTrainedModel(config) - # Adding one column not used by the model should have no impact - train_dataset = SampleIterableDataset(label_names=["labels", "extra"]) + with tempfile.TemporaryDirectory() as tmpdir: + config = RegressionModelConfig() + model = RegressionPreTrainedModel(config) + # Adding one column not used by the model should have no impact + train_dataset = SampleIterableDataset(label_names=["labels", "extra"]) - args = RegressionGaudiTrainingArguments( - output_dir="./examples", max_steps=4, use_habana=True, use_lazy_mode=True - ) - gaudi_config = get_gaudi_config() - trainer = GaudiTrainer(model=model, gaudi_config=gaudi_config, args=args, train_dataset=train_dataset) - trainer.train() - self.assertEqual(trainer.state.global_step, 4) + args = RegressionGaudiTrainingArguments( + output_dir=tmpdir, max_steps=4, use_habana=True, use_lazy_mode=True + ) + gaudi_config = get_gaudi_config() + trainer = GaudiTrainer(model=model, gaudi_config=gaudi_config, args=args, train_dataset=train_dataset) + trainer.train() + self.assertEqual(trainer.state.global_step, 4) - loader = trainer.get_train_dataloader() - self.assertIsInstance(loader, torch.utils.data.DataLoader) - self.assertIsInstance(loader.sampler, torch.utils.data.dataloader._InfiniteConstantSampler) + loader = trainer.get_train_dataloader() + self.assertIsInstance(loader, torch.utils.data.DataLoader) + self.assertIsInstance(loader.sampler, torch.utils.data.dataloader._InfiniteConstantSampler) def test_evaluation_iterable_dataset(self): - config = RegressionModelConfig(a=1.5, b=2.5) - model = RegressionPreTrainedModel(config) - # Adding one column not used by the model should have no impact - eval_dataset = SampleIterableDataset(label_names=["labels", "extra"]) + with tempfile.TemporaryDirectory() as tmpdir: + config = RegressionModelConfig(a=1.5, b=2.5) + model = RegressionPreTrainedModel(config) + # Adding one column not 
used by the model should have no impact + eval_dataset = SampleIterableDataset(label_names=["labels", "extra"]) - args = RegressionGaudiTrainingArguments(output_dir="./examples", use_habana=True, use_lazy_mode=True) - gaudi_config = get_gaudi_config() - trainer = GaudiTrainer( - model=model, - gaudi_config=gaudi_config, - args=args, - eval_dataset=eval_dataset, - compute_metrics=AlmostAccuracy(), - ) - results = trainer.evaluate() - - x, y = trainer.eval_dataset.dataset.x, trainer.eval_dataset.dataset.ys[0] - pred = 1.5 * x + 2.5 - expected_loss = ((pred - y) ** 2).mean() - self.assertAlmostEqual(results["eval_loss"], expected_loss) - expected_acc = AlmostAccuracy()((pred, y))["accuracy"] - self.assertAlmostEqual(results["eval_accuracy"], expected_acc) - - # With a number of elements not a round multiple of the batch size - eval_dataset = SampleIterableDataset(length=66) - results = trainer.evaluate(eval_dataset) - - x, y = eval_dataset.dataset.x, eval_dataset.dataset.ys[0] - pred = 1.5 * x + 2.5 - expected_loss = ((pred - y) ** 2).mean() - self.assertAlmostEqual(results["eval_loss"], expected_loss) - expected_acc = AlmostAccuracy()((pred, y))["accuracy"] - self.assertAlmostEqual(results["eval_accuracy"], expected_acc) + args = RegressionGaudiTrainingArguments(output_dir=tmpdir, use_habana=True, use_lazy_mode=True) + gaudi_config = get_gaudi_config() + trainer = GaudiTrainer( + model=model, + gaudi_config=gaudi_config, + args=args, + eval_dataset=eval_dataset, + compute_metrics=AlmostAccuracy(), + ) + results = trainer.evaluate() + + x, y = trainer.eval_dataset.dataset.x, trainer.eval_dataset.dataset.ys[0] + pred = 1.5 * x + 2.5 + expected_loss = ((pred - y) ** 2).mean() + self.assertAlmostEqual(results["eval_loss"], expected_loss) + expected_acc = AlmostAccuracy()((pred, y))["accuracy"] + self.assertAlmostEqual(results["eval_accuracy"], expected_acc) + + # With a number of elements not a round multiple of the batch size + eval_dataset = SampleIterableDataset(length=66) + results = trainer.evaluate(eval_dataset) + + x, y = eval_dataset.dataset.x, eval_dataset.dataset.ys[0] + pred = 1.5 * x + 2.5 + expected_loss = ((pred - y) ** 2).mean() + self.assertAlmostEqual(results["eval_loss"], expected_loss) + expected_acc = AlmostAccuracy()((pred, y))["accuracy"] + self.assertAlmostEqual(results["eval_accuracy"], expected_acc) def test_predict_iterable_dataset(self): - config = RegressionModelConfig(a=1.5, b=2.5) - model = RegressionPreTrainedModel(config) - eval_dataset = SampleIterableDataset() + with tempfile.TemporaryDirectory() as tmpdir: + config = RegressionModelConfig(a=1.5, b=2.5) + model = RegressionPreTrainedModel(config) + eval_dataset = SampleIterableDataset() - args = RegressionGaudiTrainingArguments(output_dir="./examples", use_habana=True, use_lazy_mode=True) - gaudi_config = get_gaudi_config() - trainer = GaudiTrainer( - model=model, - gaudi_config=gaudi_config, - args=args, - eval_dataset=eval_dataset, - compute_metrics=AlmostAccuracy(), - ) + args = RegressionGaudiTrainingArguments(output_dir=tmpdir, use_habana=True, use_lazy_mode=True) + gaudi_config = get_gaudi_config() + trainer = GaudiTrainer( + model=model, + gaudi_config=gaudi_config, + args=args, + eval_dataset=eval_dataset, + compute_metrics=AlmostAccuracy(), + ) - preds = trainer.predict(trainer.eval_dataset).predictions - x = eval_dataset.dataset.x - self.assertTrue(np.allclose(preds, 1.5 * x + 2.5)) + preds = trainer.predict(trainer.eval_dataset).predictions + x = eval_dataset.dataset.x + 
self.assertTrue(np.allclose(preds, 1.5 * x + 2.5)) - # With a number of elements not a round multiple of the batch size - # Adding one column not used by the model should have no impact - test_dataset = SampleIterableDataset(length=66, label_names=["labels", "extra"]) - preds = trainer.predict(test_dataset).predictions - x = test_dataset.dataset.x - self.assertTrue(np.allclose(preds, 1.5 * x + 2.5)) + # With a number of elements not a round multiple of the batch size + # Adding one column not used by the model should have no impact + test_dataset = SampleIterableDataset(length=66, label_names=["labels", "extra"]) + preds = trainer.predict(test_dataset).predictions + x = test_dataset.dataset.x + self.assertTrue(np.allclose(preds, 1.5 * x + 2.5)) def test_num_train_epochs_in_training(self): - # len(train_dl) < gradient_accumulation_steps shouldn't give ``ZeroDivisionError`` when ``max_steps`` is given. - # It should give 1 update step for each epoch. - trainer = get_regression_trainer( - max_steps=3, train_len=64, per_device_train_batch_size=16, gradient_accumulation_steps=5 - ) - train_output = trainer.train() - self.assertEqual(train_output.global_step, 3) + with tempfile.TemporaryDirectory() as tmpdir: + # len(train_dl) < gradient_accumulation_steps shouldn't give ``ZeroDivisionError`` when ``max_steps`` is given. + # It should give 1 update step for each epoch. + trainer = get_regression_trainer( + output_dir=tmpdir, + max_steps=3, + train_len=64, + per_device_train_batch_size=16, + gradient_accumulation_steps=5, + ) + train_output = trainer.train() + self.assertEqual(train_output.global_step, 3) - # Even ``max_steps`` is not specified, we still expect 1 update step for each epoch if - # len(train_dl) < gradient_accumulation_steps. - trainer = get_regression_trainer(train_len=64, per_device_train_batch_size=16, gradient_accumulation_steps=5) - train_output = trainer.train() - self.assertEqual(train_output.global_step, int(self.n_epochs)) + # Even ``max_steps`` is not specified, we still expect 1 update step for each epoch if + # len(train_dl) < gradient_accumulation_steps. 
+ trainer = get_regression_trainer( + output_dir=tmpdir, train_len=64, per_device_train_batch_size=16, gradient_accumulation_steps=5 + ) + train_output = trainer.train() + self.assertEqual(train_output.global_step, int(self.n_epochs)) def test_early_stopping_callback(self): # early stopping stops training before num_training_epochs @@ -2193,22 +2270,23 @@ def test_early_stopping_callback(self): self.assertEqual(trainer.state.global_step, 0) def test_flos_extraction(self): - trainer = get_regression_trainer(learning_rate=0.1) + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = get_regression_trainer(output_dir=tmp_dir, learning_rate=0.1) - def assert_flos_extraction(trainer, wrapped_model_to_check): - self.assertEqual(trainer.model, trainer.accelerator.unwrap_model(wrapped_model_to_check)) - self.assertGreaterEqual( - getattr(trainer.accelerator.unwrap_model(wrapped_model_to_check).config, "total_flos", 0), 0 - ) + def assert_flos_extraction(trainer, wrapped_model_to_check): + self.assertEqual(trainer.model, trainer.accelerator.unwrap_model(wrapped_model_to_check)) + self.assertGreaterEqual( + getattr(trainer.accelerator.unwrap_model(wrapped_model_to_check).config, "total_flos", 0), 0 + ) - # with plain model - assert_flos_extraction(trainer, trainer.model) + # with plain model + assert_flos_extraction(trainer, trainer.model) - # # with enforced DataParallel - # assert_flos_extraction(trainer, nn.DataParallel(trainer.model)) + # # with enforced DataParallel + # assert_flos_extraction(trainer, nn.DataParallel(trainer.model)) - trainer.train() - self.assertTrue(isinstance(trainer.state.total_flos, float)) + trainer.train() + self.assertTrue(isinstance(trainer.state.total_flos, float)) def check_checkpoint_deletion(self, trainer, output_dir, expected): # Make fake checkpoints @@ -2262,13 +2340,14 @@ def check_mem_metrics(self, trainer, check_func): check_func("test_mem_gpu_alloc_delta", metrics) def test_mem_metrics(self): - # with mem metrics enabled - trainer = get_regression_trainer(skip_memory_metrics=False) - self.check_mem_metrics(trainer, self.assertIn) + with tempfile.TemporaryDirectory() as tmp_dir: + # with mem metrics enabled + trainer = get_regression_trainer(output_dir=tmp_dir, skip_memory_metrics=False) + self.check_mem_metrics(trainer, self.assertIn) - # with mem metrics disabled - trainer = get_regression_trainer(skip_memory_metrics=True) - self.check_mem_metrics(trainer, self.assertNotIn) + # with mem metrics disabled + trainer = get_regression_trainer(output_dir=tmp_dir, skip_memory_metrics=True) + self.check_mem_metrics(trainer, self.assertNotIn) def test_no_wd_param_group(self): model = nn.Sequential(TstLayer(128), nn.ModuleList([TstLayer(128), TstLayer(128)])) @@ -2622,19 +2701,21 @@ def test_eval_use_gather_object(self): train_dataset = RegressionDataset() eval_dataset = RegressionDataset() model = RegressionDictModel() - args = GaudiTrainingArguments( - "./regression", use_habana=True, use_lazy_mode=True, report_to="none", eval_use_gather_object=True - ) - gaudi_config = get_gaudi_config() - trainer = GaudiTrainer(model, gaudi_config, args, train_dataset=train_dataset, eval_dataset=eval_dataset) - trainer.train() - _ = trainer.evaluate() - _ = trainer.predict(eval_dataset) + with tempfile.TemporaryDirectory() as tmpdir: + args = GaudiTrainingArguments( + tmpdir, use_habana=True, use_lazy_mode=True, report_to="none", eval_use_gather_object=True + ) + gaudi_config = get_gaudi_config() + trainer = GaudiTrainer(model, gaudi_config, args, train_dataset=train_dataset, 
eval_dataset=eval_dataset) + trainer.train() + _ = trainer.evaluate() + _ = trainer.predict(eval_dataset) def test_profiling(self): - # 24 total steps and compilation takes place during the 1st three steps - trainer = get_regression_trainer(profiling_warmup_steps=3, profiling_steps=21) - trainer.train() + with tempfile.TemporaryDirectory() as tmp_dir: + # 24 total steps and compilation takes place during the 1st three steps + trainer = get_regression_trainer(output_dir=tmp_dir, profiling_warmup_steps=3, profiling_steps=21) + trainer.train() @require_torch @@ -3093,12 +3174,13 @@ def check_optim_and_kwargs(self, optim: OptimizerNames, mandatory_kwargs, expect @parameterized.expand(optim_test_params, skip_on_empty=True) def test_optim_supported(self, name: str, expected_cls, mandatory_kwargs): - # exercises all the valid --optim options - self.check_optim_and_kwargs(name, mandatory_kwargs, expected_cls) + with tempfile.TemporaryDirectory() as tmp_dir: + # exercises all the valid --optim options + self.check_optim_and_kwargs(name, mandatory_kwargs, expected_cls) - trainer = get_regression_trainer(optim=name) - trainer.gaudi_config.use_fused_adam = False - trainer.train() + trainer = get_regression_trainer(output_dir=tmp_dir, optim=name) + trainer.gaudi_config.use_fused_adam = False + trainer.train() # TODO: solve the Git error returned by this test From 18249d4aa3a0fa0226d16c534ce4fd14e14e5e1d Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Tue, 20 Aug 2024 12:40:21 -0700 Subject: [PATCH 04/24] Fix memory regression for modeling llama (#1271) --- .../transformers/models/llama/modeling_llama.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/optimum/habana/transformers/models/llama/modeling_llama.py b/optimum/habana/transformers/models/llama/modeling_llama.py index 1d9478c68f..7d41126390 100755 --- a/optimum/habana/transformers/models/llama/modeling_llama.py +++ b/optimum/habana/transformers/models/llama/modeling_llama.py @@ -170,10 +170,16 @@ def forward(self, x, seq_len=None): if seq_len > self.max_seq_len_cached: self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) - return ( - self._cos_cached[:seq_len].to(dtype=x.dtype) * self.attention_scaling, - self._sin_cached[:seq_len].to(dtype=x.dtype) * self.attention_scaling, - ) + if self.attention_scaling == 1.0: + return ( + self._cos_cached[:seq_len].to(dtype=x.dtype), + self._sin_cached[:seq_len].to(dtype=x.dtype), + ) + else: + return ( + self._cos_cached[:seq_len].to(dtype=x.dtype) * self.attention_scaling, + self._sin_cached[:seq_len].to(dtype=x.dtype) * self.attention_scaling, + ) class GaudiLlamaLinearScalingRotaryEmbedding(GaudiLlamaRotaryEmbedding): @@ -977,7 +983,6 @@ def __init__(self, config: LlamaConfig): config.parallel_strategy = None self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.rotary_emb = GaudiLlamaRotaryEmbedding(config=config) self.gradient_checkpointing = False # Initialize weights and apply final processing From 0d3e0f441249bfa4a0e38454139b3f7c935b89e7 Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Thu, 22 Aug 2024 09:34:10 -0700 Subject: [PATCH 05/24] Fix profiling step with device finish execution for text-generation (#1283) --- optimum/habana/transformers/generation/utils.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/optimum/habana/transformers/generation/utils.py b/optimum/habana/transformers/generation/utils.py index 284f646a48..aa342b392e 100755 --- a/optimum/habana/transformers/generation/utils.py 
+++ b/optimum/habana/transformers/generation/utils.py @@ -2032,8 +2032,6 @@ def _contrastive_search( self._pad_past_key_values(model_kwargs) model_kwargs["pad_done"] = True - hb_profer.step() - if hb_gen_time is not None: if not time_to_first_token_done: time_to_first_token_done = True @@ -2041,6 +2039,7 @@ def _contrastive_search( torch_hpu.synchronize() hb_gen_time.step() + hb_profer.step() if ( model_kwargs.get("use_hpu_graphs", False) @@ -2366,7 +2365,6 @@ def _sample( ) this_peer_finished = unfinished_sequences.max() == 0 - hb_profer.step() if hb_gen_time is not None: if not time_to_first_token_done: time_to_first_token_done = True @@ -2374,6 +2372,7 @@ def _sample( torch_hpu.synchronize() hb_gen_time.step() + hb_profer.step() if ( not model_kwargs.get("pad_done", False) @@ -3628,7 +3627,6 @@ def _assisted_decoding( ) this_peer_finished = unfinished_sequences.max() == 0 - hb_profer.step() if hb_gen_time is not None: if not time_to_first_token_done: time_to_first_token_done = True @@ -3636,6 +3634,7 @@ def _assisted_decoding( torch_hpu.synchronize() hb_gen_time.step() + hb_profer.step() if this_peer_finished and not synced_gpus: break From 18efdc1f8fe0c76f8f88b975d2fc93525d816107 Mon Sep 17 00:00:00 2001 From: Yeonsil Yoon Date: Thu, 22 Aug 2024 09:39:42 -0700 Subject: [PATCH 06/24] Revert mark_step in mixtral model from PR #1260 (#1273) --- .../transformers/models/mixtral/modeling_mixtral.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/optimum/habana/transformers/models/mixtral/modeling_mixtral.py b/optimum/habana/transformers/models/mixtral/modeling_mixtral.py index fc414e6d76..43dfc7e48a 100644 --- a/optimum/habana/transformers/models/mixtral/modeling_mixtral.py +++ b/optimum/habana/transformers/models/mixtral/modeling_mixtral.py @@ -471,7 +471,6 @@ def forward( reuse_cache: Optional[bool] = False, flash_attention_recompute: Optional[bool] = False, cache_idx: int = None, - lazy_mode: Optional[bool] = True, **kwargs, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: """ @@ -481,10 +480,7 @@ def forward( - add new args reuse_cache - add new args flash_attention_recompute - add new args cache_idx - - add new args lazy_mode """ - if lazy_mode: - htcore.mark_step() residual = hidden_states hidden_states = self.input_layernorm(hidden_states) @@ -504,16 +500,12 @@ def forward( cache_idx=cache_idx, ) hidden_states = residual + hidden_states - if lazy_mode: - htcore.mark_step() # Fully Connected residual = hidden_states hidden_states = self.post_attention_layernorm(hidden_states) hidden_states, router_logits = self.block_sparse_moe(hidden_states) hidden_states = residual + hidden_states - if lazy_mode: - htcore.mark_step() outputs = (hidden_states,) @@ -554,7 +546,6 @@ def forward( reuse_cache: Optional[bool] = False, flash_attention_recompute: Optional[bool] = False, cache_idx: int = None, - lazy_mode: Optional[bool] = True, ) -> Union[Tuple, MoeModelOutputWithPast]: """ Copied from MixtralModel.forward: https://github.com/huggingface/transformers/blob/v4.37.0/src/transformers/models/mixtral/modeling_mixtral.py#L1069 @@ -684,7 +675,6 @@ def forward( reuse_cache=reuse_cache, flash_attention_recompute=flash_attention_recompute, cache_idx=cache_idx, - lazy_mode=lazy_mode, ) hidden_states = layer_outputs[0] @@ -759,7 +749,6 @@ def forward( reuse_cache: Optional[bool] = None, flash_attention_recompute: Optional[bool] = False, cache_idx: int = None, - lazy_mode: Optional[bool] = True, ) -> Union[Tuple, MoeCausalLMOutputWithPast]: 
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_router_logits = ( @@ -788,7 +777,6 @@ def forward( reuse_cache=reuse_cache, flash_attention_recompute=flash_attention_recompute, cache_idx=cache_idx, - lazy_mode=lazy_mode, ) hidden_states = outputs[0] @@ -893,7 +881,6 @@ def prepare_inputs_for_generation( "reuse_cache": reuse_cache, "flash_attention_recompute": kwargs.get("flash_attention_recompute"), "cache_idx": kwargs.get("cache_idx"), - "lazy_mode": kwargs.get("lazy_mode"), } ) return model_inputs From 7891a867404ebc13a14e6e14f21263a253ffc800 Mon Sep 17 00:00:00 2001 From: Dina Suehiro Jones Date: Fri, 23 Aug 2024 05:56:42 -0700 Subject: [PATCH 07/24] Remove huggingface_hub install that is no longer needed in the kubernetes example (#1286) Signed-off-by: dmsuehir --- examples/kubernetes/requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/kubernetes/requirements.txt b/examples/kubernetes/requirements.txt index ee4e985e93..dc06e3de79 100644 --- a/examples/kubernetes/requirements.txt +++ b/examples/kubernetes/requirements.txt @@ -1,3 +1,2 @@ -huggingface_hub==0.23.0 -r optimum-habana/examples/language-modeling/requirements.txt -r optimum-habana/examples/text-classification/requirements.txt From 55667210ffaf3a4be8c61d8b68bb8e2f557d16f1 Mon Sep 17 00:00:00 2001 From: Yeonsil Yoon Date: Fri, 23 Aug 2024 06:14:55 -0700 Subject: [PATCH 08/24] Add missing condtion check in tensor creation in greedy search (#1288) --- optimum/habana/transformers/generation/utils.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/optimum/habana/transformers/generation/utils.py b/optimum/habana/transformers/generation/utils.py index aa342b392e..d333986679 100755 --- a/optimum/habana/transformers/generation/utils.py +++ b/optimum/habana/transformers/generation/utils.py @@ -2194,7 +2194,8 @@ def _sample( # keep track of which sequences are already finished batch_size, cur_len = input_ids.shape this_peer_finished = False - unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device) + if not ignore_eos: + unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device) model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs) bucket_size = model_kwargs.get("bucket_size", -1) @@ -2269,9 +2270,7 @@ def _sample( next_token_logits = torch.index_select(outputs.logits, -2, token_idx - 1).squeeze(-2) next_token_scores = logits_processor(input_ids, next_token_logits) else: - # Clone is needed to avoid keeping a hanging ref to outputs.logits which may be very large for first iteration - # (the clone itself is always small) - next_token_logits = outputs.logits[:, -1, :].clone() + next_token_logits = outputs.logits[:, -1, :] if token_idx is not None and self.config.is_encoder_decoder: # case2 (with KV caching): outputs.logits.shape: [batch_size, 1, vocab_size] next_token_scores = logits_processor(input_ids[:, :token_idx], next_token_logits) From a909e6b2f2c31d405e4be95e44bb9c5148d740d9 Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Fri, 23 Aug 2024 15:15:28 +0200 Subject: [PATCH 09/24] Fix BERT FSDP test (#1281) --- optimum/habana/accelerate/utils/__init__.py | 1 + optimum/habana/accelerate/utils/other.py | 77 +++++++++++++++++++ optimum/habana/transformers/modeling_utils.py | 5 ++ 3 files changed, 83 insertions(+) create mode 100644 optimum/habana/accelerate/utils/other.py diff --git 
a/optimum/habana/accelerate/utils/__init__.py b/optimum/habana/accelerate/utils/__init__.py index ee25954b95..b862697de1 100755 --- a/optimum/habana/accelerate/utils/__init__.py +++ b/optimum/habana/accelerate/utils/__init__.py @@ -5,6 +5,7 @@ GaudiFullyShardedDataParallelPlugin, GaudiTorchDynamoPlugin, ) +from .other import extract_model_from_parallel from .transformer_engine import ( FP8ContextWrapper, convert_model, diff --git a/optimum/habana/accelerate/utils/other.py b/optimum/habana/accelerate/utils/other.py new file mode 100644 index 0000000000..8062f9d860 --- /dev/null +++ b/optimum/habana/accelerate/utils/other.py @@ -0,0 +1,77 @@ +from types import MethodType + +import torch +from accelerate.utils.constants import FSDP_PYTORCH_VERSION +from accelerate.utils.imports import is_deepspeed_available, is_torch_distributed_available +from accelerate.utils.other import is_compiled_module +from accelerate.utils.transformer_engine import convert_model +from accelerate.utils.versions import is_torch_version + + +def extract_model_from_parallel(model, keep_fp32_wrapper: bool = True, recursive: bool = False): + """ + Adapted from: https://github.com/huggingface/accelerate/blob/v0.33.0/src/accelerate/utils/other.py#L56 + + Changes: + - add a `distributed_model` variable to keep track of the distributed wrapper + and not lose it when setting it back at the end (for compiled models) + + See https://github.com/huggingface/optimum-habana/pull/1281 for more information. + """ + options = (torch.nn.parallel.DistributedDataParallel, torch.nn.DataParallel) + + is_compiled = is_compiled_module(model) + if is_compiled: + compiled_model = model + model = model._orig_mod + + if is_deepspeed_available(): + from deepspeed import DeepSpeedEngine + + options += (DeepSpeedEngine,) + + if is_torch_version(">=", FSDP_PYTORCH_VERSION) and is_torch_distributed_available(): + from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel as FSDP + + options += (FSDP,) + + # Keep track of the distributed wrapper + # TODO: to revisit as lines 44 to 71 are now useless + distributed_model = model + while isinstance(model, options): + model = model.module + + if recursive: + # This is needed in cases such as using FSDPv2 on XLA + def _recursive_unwrap(module): + # Wrapped modules are standardly wrapped as `module`, similar to the cases earlier + # with DDP, DataParallel, DeepSpeed, and FSDP + if hasattr(module, "module"): + unwrapped_module = _recursive_unwrap(module.module) + else: + unwrapped_module = module + # Next unwrap child sublayers recursively + for name, child in unwrapped_module.named_children(): + setattr(unwrapped_module, name, _recursive_unwrap(child)) + return unwrapped_module + + # Start with top-level + model = _recursive_unwrap(model) + + if not keep_fp32_wrapper: + forward = model.forward + original_forward = model.__dict__.pop("_original_forward", None) + if original_forward is not None: + while hasattr(forward, "__wrapped__"): + forward = forward.__wrapped__ + if forward == original_forward: + break + model.forward = MethodType(forward, model) + if getattr(model, "_converted_to_transformer_engine", False): + convert_model(model, to_transformer_engine=False) + + if is_compiled: + compiled_model._orig_mod = distributed_model + model = compiled_model + + return model diff --git a/optimum/habana/transformers/modeling_utils.py b/optimum/habana/transformers/modeling_utils.py index 97e290157c..d7f98f8376 100644 --- a/optimum/habana/transformers/modeling_utils.py +++ 
b/optimum/habana/transformers/modeling_utils.py @@ -13,9 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +import accelerate import transformers import transformers.utils.fx +from ..accelerate.utils import extract_model_from_parallel from .generation import ( GaudiGenerationConfig, GaudiGenerationMixin, @@ -201,6 +203,9 @@ def adapt_transformers_to_gaudi(): Replaces some Transformers' methods for equivalent methods optimized for Gaudi. """ + accelerate.utils.extract_model_from_parallel = extract_model_from_parallel + accelerate.utils.other.extract_model_from_parallel = extract_model_from_parallel + accelerate.accelerator.extract_model_from_parallel = extract_model_from_parallel # models that support symbolic tracing should be added to this list models_with_tracing_support = [] From 5c567d399958f331e8bbb8bfcf4ea842cbb7387c Mon Sep 17 00:00:00 2001 From: Thanaji Rao Thakkalapelli Date: Fri, 23 Aug 2024 13:16:16 -0700 Subject: [PATCH 10/24] Llava: Added flash_attention_recompute arg to provide an option to enable/disable recompute (#1278) --- examples/image-to-text/README.md | 14 ++++++++++---- examples/image-to-text/run_pipeline.py | 6 ++++++ .../transformers/models/clip/modeling_clip.py | 18 +++++++++++++++--- .../models/llava/modeling_llava.py | 10 ++++++++-- .../models/llava_next/modeling_llava_next.py | 12 ++++++++++-- 5 files changed, 49 insertions(+), 11 deletions(-) diff --git a/examples/image-to-text/README.md b/examples/image-to-text/README.md index 0f1a2624d4..97494e6846 100644 --- a/examples/image-to-text/README.md +++ b/examples/image-to-text/README.md @@ -145,7 +145,8 @@ python3 run_pipeline.py \ --image_path "https://llava-vl.github.io/static/images/view.jpg" \ --use_hpu_graphs \ --bf16 \ - --use_flash_attention + --use_flash_attention \ + --flash_attention_recompute ``` @@ -156,7 +157,8 @@ python3 run_pipeline.py \ --image_path "https://llava-vl.github.io/static/images/view.jpg" \ --use_hpu_graphs \ --bf16 \ - --use_flash_attention + --use_flash_attention \ + --flash_attention_recompute ``` @@ -168,7 +170,9 @@ QUANT_CONFIG=./quantization_config/maxabs_measure.json python run_pipeline.py \ --model_name_or_path llava-hf/llava-v1.6-mistral-7b-hf \ --image_path "https://llava-vl.github.io/static/images/view.jpg" \ --use_hpu_graphs \ ---bf16 --use_flash_attention +--bf16 \ +--use_flash_attention \ +--flash_attention_recompute ``` Here is an example of quantizing the model based on previous measurements for Llava-v1.6-mistral-7b: @@ -177,5 +181,7 @@ QUANT_CONFIG=./quantization_config/maxabs_quant.json python run_pipeline.py \ --model_name_or_path llava-hf/llava-v1.6-mistral-7b-hf \ --image_path "https://llava-vl.github.io/static/images/view.jpg" \ --use_hpu_graphs \ ---bf16 --use_flash_attention +--bf16 \ +--use_flash_attention \ +--flash_attention_recompute ``` diff --git a/examples/image-to-text/run_pipeline.py b/examples/image-to-text/run_pipeline.py index 239d6fa4e4..8ef45c4f61 100644 --- a/examples/image-to-text/run_pipeline.py +++ b/examples/image-to-text/run_pipeline.py @@ -96,6 +96,11 @@ def main(): action="store_true", help="Whether to enable Habana Flash Attention, provided that the model supports it.", ) + parser.add_argument( + "--flash_attention_recompute", + action="store_true", + help="Whether to enable Habana Flash Attention in recompute mode on first token generation. 
This gives an opportunity of splitting graph internally which helps reduce memory consumption.", + ) args = parser.parse_args() @@ -156,6 +161,7 @@ def main(): "max_new_tokens": args.max_new_tokens, "ignore_eos": args.ignore_eos, "use_flash_attention": args.use_flash_attention, + "flash_attention_recompute": args.flash_attention_recompute, } if args.use_hpu_graphs: from habana_frameworks.torch.hpu import wrap_in_hpu_graph diff --git a/optimum/habana/transformers/models/clip/modeling_clip.py b/optimum/habana/transformers/models/clip/modeling_clip.py index b22c61972d..b7fb3a222e 100644 --- a/optimum/habana/transformers/models/clip/modeling_clip.py +++ b/optimum/habana/transformers/models/clip/modeling_clip.py @@ -78,11 +78,13 @@ def forward( causal_attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, use_flash_attention: Optional[bool] = False, + flash_attention_recompute: Optional[bool] = False, ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: """ Copied from CLIPAttention.forward: https://github.com/huggingface/transformers/blob/ab0f050b42d903f34d6eb97f3f8c0c07f0517ad2/src/transformers/models/clip/modeling_clip.py The only differences are: - add new args use_flash_attention to enable FusedSDPA + - add new args flash_attention_recompute """ bsz, tgt_len, embed_dim = hidden_states.size() attn_weights_reshaped = None @@ -100,8 +102,7 @@ def forward( if FusedSDPA and use_flash_attention: import habana_frameworks.torch.hpu as ht - use_recompute = not self.training - with ht.sdp_kernel(enable_recompute=use_recompute): + with ht.sdp_kernel(enable_recompute=flash_attention_recompute): attn_output = self.fused_scaled_dot_product_attention( query_states, key_states, value_states, attention_mask, self.dropout, False, 1, "fast" ) @@ -178,11 +179,13 @@ def forward( causal_attention_mask: torch.Tensor, output_attentions: Optional[bool] = False, use_flash_attention: Optional[bool] = False, + flash_attention_recompute: Optional[bool] = False, ) -> Tuple[torch.FloatTensor]: """ Copied from CLIPEncoderLayer.forward: https://github.com/huggingface/transformers/blob/ab0f050b42d903f34d6eb97f3f8c0c07f0517ad2/src/transformers/models/clip/modeling_clip.py The only differences are: - add new args use_flash_attention + - add new args flash_attention_recompute """ residual = hidden_states @@ -193,6 +196,7 @@ def forward( causal_attention_mask=causal_attention_mask, output_attentions=output_attentions, use_flash_attention=use_flash_attention, + flash_attention_recompute=flash_attention_recompute, ) hidden_states = residual + hidden_states @@ -219,11 +223,13 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, use_flash_attention: Optional[bool] = False, + flash_attention_recompute: Optional[bool] = False, ) -> Union[Tuple, BaseModelOutput]: """ Copied from CLIPEncoder.forward: https://github.com/huggingface/transformers/blob/ab0f050b42d903f34d6eb97f3f8c0c07f0517ad2/src/transformers/models/clip/modeling_clip.py The only differences are: - add new args use_flash_attention + - add new args flash_attention_recompute """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -245,7 +251,6 @@ def forward( attention_mask, causal_attention_mask, output_attentions, - use_flash_attention=use_flash_attention, ) else: layer_outputs = encoder_layer( @@ -254,6 +259,7 @@ def forward( causal_attention_mask, output_attentions=output_attentions, 
use_flash_attention=use_flash_attention, + flash_attention_recompute=flash_attention_recompute, ) hidden_states = layer_outputs[0] @@ -279,11 +285,13 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, use_flash_attention: Optional[bool] = False, + flash_attention_recompute: Optional[bool] = False, ) -> Union[Tuple, BaseModelOutputWithPooling]: """ Copied from CLIPVisionTransformer.forward: https://github.com/huggingface/transformers/blob/ab0f050b42d903f34d6eb97f3f8c0c07f0517ad2/src/transformers/models/clip/modeling_clip.py The only differences are: - add new args use_flash_attention + - add new args flash_attention_recompute """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -303,6 +311,7 @@ def forward( output_hidden_states=output_hidden_states, return_dict=return_dict, use_flash_attention=use_flash_attention, + flash_attention_recompute=flash_attention_recompute, ) last_hidden_state = encoder_outputs[0] @@ -328,11 +337,13 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, use_flash_attention: Optional[bool] = False, + flash_attention_recompute: Optional[bool] = False, ) -> Union[Tuple, BaseModelOutputWithPooling]: """ Copied from CLIPVisionModel.forward: https://github.com/huggingface/transformers/blob/ab0f050b42d903f34d6eb97f3f8c0c07f0517ad2/src/transformers/models/clip/modeling_clip.py The only differences are: - add new args use_flash_attention + - add new args flash_attention_recompute """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -342,4 +353,5 @@ def forward( output_hidden_states=output_hidden_states, return_dict=return_dict, use_flash_attention=use_flash_attention, + flash_attention_recompute=flash_attention_recompute, ) diff --git a/optimum/habana/transformers/models/llava/modeling_llava.py b/optimum/habana/transformers/models/llava/modeling_llava.py index fa3a321e77..8119f442c5 100644 --- a/optimum/habana/transformers/models/llava/modeling_llava.py +++ b/optimum/habana/transformers/models/llava/modeling_llava.py @@ -124,6 +124,7 @@ def forward( image_offset: Optional[int] = None, tokens_pos: Optional[torch.LongTensor] = None, use_flash_attention: Optional[bool] = False, + flash_attention_recompute: Optional[bool] = False, ) -> Union[Tuple, LlavaCausalLMOutputWithPast]: """ Inherits from LlavaForConditionalGeneration: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/llava/modeling_llava.py @@ -154,7 +155,10 @@ def forward( # 2. Merge text and images if pixel_values is not None and input_ids.shape[1] != 1: image_outputs = self.vision_tower( - pixel_values, output_hidden_states=True, use_flash_attention=use_flash_attention + pixel_values, + output_hidden_states=True, + use_flash_attention=use_flash_attention, + flash_attention_recompute=flash_attention_recompute, ) # this is not memory efficient at all (output_hidden_states=True) will save all the hidden stated. 
selected_image_feature = image_outputs.hidden_states[vision_feature_layer] @@ -184,7 +188,7 @@ def forward( return_dict=return_dict, token_idx=token_idx + image_offset, use_flash_attention=use_flash_attention, - flash_attention_recompute=use_flash_attention, + flash_attention_recompute=flash_attention_recompute, ) if input_ids.shape[1] != 1 and pixel_values is not None: @@ -296,6 +300,7 @@ def prepare_inputs_for_generation( else: model_inputs = {"input_ids": input_ids} use_flash_attention = kwargs.get("use_flash_attention", False) + flash_attention_recompute = kwargs.get("flash_attention_recompute", False) model_inputs.update( { "position_ids": position_ids, @@ -307,6 +312,7 @@ def prepare_inputs_for_generation( "image_offset": image_offset, "tokens_pos": tokens_pos, "use_flash_attention": use_flash_attention, + "flash_attention_recompute": flash_attention_recompute, } ) diff --git a/optimum/habana/transformers/models/llava_next/modeling_llava_next.py b/optimum/habana/transformers/models/llava_next/modeling_llava_next.py index fdf9276123..4670469e9e 100644 --- a/optimum/habana/transformers/models/llava_next/modeling_llava_next.py +++ b/optimum/habana/transformers/models/llava_next/modeling_llava_next.py @@ -55,11 +55,14 @@ def forward( return_dict: Optional[bool] = None, token_idx: Optional[torch.Tensor] = None, use_flash_attention: Optional[bool] = False, + flash_attention_recompute: Optional[bool] = False, ) -> Union[Tuple, LlavaNextCausalLMOutputWithPast]: """ Inherits from LlavaForConditionalGeneration: https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/llava_next/modeling_llava_next.py#L433 The only differences are: - add new args token_idx + - add new args use_flash_attention + - add new args flash_attention_recompute - Moved the process of merging images into inputs_embeds into prepare_inputs_for_generation """ @@ -83,7 +86,7 @@ def forward( return_dict=return_dict, token_idx=token_idx + self.image_offset, use_flash_attention=use_flash_attention, - flash_attention_recompute=use_flash_attention, + flash_attention_recompute=flash_attention_recompute, ) if inputs_embeds.shape[1] != 1 and pixel_values is not None: @@ -248,6 +251,7 @@ def prepare_inputs_for_generation( ) else: use_flash_attention = kwargs.get("use_flash_attention", False) + flash_attention_recompute = kwargs.get("flash_attention_recompute", False) position_ids = kwargs.get("position_ids", None) labels = kwargs.get("labels", None) if past_key_values is None and pixel_values is not None and input_ids.shape[1] != 1: @@ -268,7 +272,10 @@ def prepare_inputs_for_generation( batch_size, num_patches, num_channels, height, width = pixel_values.shape reshaped_pixel_values = pixel_values.view(batch_size * num_patches, num_channels, height, width) image_features = self.vision_tower( - reshaped_pixel_values, output_hidden_states=True, use_flash_attention=use_flash_attention + reshaped_pixel_values, + output_hidden_states=True, + use_flash_attention=use_flash_attention, + flash_attention_recompute=flash_attention_recompute, ) selected_image_feature = image_features.hidden_states[vision_feature_layer] @@ -390,6 +397,7 @@ def prepare_inputs_for_generation( "image_sizes": image_sizes, "labels": labels, "use_flash_attention": use_flash_attention, + "flash_attention_recompute": flash_attention_recompute, } ) From 886ab678a8da3c1f138ecdf3520a2f1d721ef535 Mon Sep 17 00:00:00 2001 From: Sayantan Sarkar Date: Fri, 23 Aug 2024 13:42:33 -0700 Subject: [PATCH 11/24] Get seq len fix propagate (#1291) Co-authored-by: 
Libin Tang Co-authored-by: regisss <15324346+regisss@users.noreply.github.com> --- examples/text-generation/README.md | 2 +- optimum/habana/transformers/models/phi/modeling_phi.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/examples/text-generation/README.md b/examples/text-generation/README.md index 29b754731d..b720936ff4 100755 --- a/examples/text-generation/README.md +++ b/examples/text-generation/README.md @@ -284,7 +284,7 @@ PT_ENABLE_INT64_SUPPORT=1 PT_HPU_LAZY_MODE=0 python ../gaudi_spawn.py --world_s ### Running with FP8 -Llama2-70b, Llama2-7b, Llama3-70b, Llama3-8b, Mixtral-8x7B, Falcon-7B, Falcon-40B, Falcon-180B and phi-2 in FP8 are enabled using the Intel Neural Compressor (INC), which provides model measurement and quantization capabilities in PyTorch. +Llama2-70b, Llama2-7b, Llama3-70b, Llama3-8b, Mixtral-8x7B, Falcon-7B, Falcon-40B, Falcon-180B and phi-2 in FP8 are enabled using the [Intel Neural Compressor (INC)](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html), which provides model measurement and quantization capabilities in PyTorch. From synapse 1.17 / optimum-habana 1.13 release, INC is used by default for measuring and quantization. Habana Quantization Toolkit (HQT), which was used earlier, will be removed in future releases. To use HQT, disable INC by setting the following environment variable: `USE_INC=0`. More information on enabling fp8 in SynapseAI is available here: https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html diff --git a/optimum/habana/transformers/models/phi/modeling_phi.py b/optimum/habana/transformers/models/phi/modeling_phi.py index 07f4d0cd71..1e21735add 100644 --- a/optimum/habana/transformers/models/phi/modeling_phi.py +++ b/optimum/habana/transformers/models/phi/modeling_phi.py @@ -430,7 +430,12 @@ def forward( inputs_embeds = self.embed_tokens(input_ids) if cache_position is None: - past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + past_seen_tokens = 0 + if past_key_values is not None: + if isinstance(past_key_values, Cache): + past_seen_tokens = past_key_values.get_seq_length() + else: + past_seen_tokens = past_key_values[0][0].shape[2] cache_position = torch.arange( past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device ) From 9a51d343205bc9717277570910e03c6e03f0d9b0 Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Sun, 25 Aug 2024 13:35:39 +0000 Subject: [PATCH 12/24] Update last stable release in README --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 2edc24d2f8..b78a19080e 100644 --- a/README.md +++ b/README.md @@ -59,9 +59,9 @@ The `--upgrade-strategy eager` option is needed to ensure `optimum-habana` is up To use the example associated with the latest stable release, run: > ``` > git clone https://github.com/huggingface/optimum-habana -> cd optimum-habana && git checkout v1.13.0 +> cd optimum-habana && git checkout v1.13.1 > ``` -> with `v1.13.0` the version number of this release. +> with `v1.13.1` the version number of this release. 
### Option 2: Use the latest main branch under development From 08e30aadc8354f8ea2c179117f67b81defdee7d5 Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Wed, 28 Aug 2024 16:38:52 +0000 Subject: [PATCH 13/24] Update minimal required versions in examples --- examples/audio-classification/run_audio_classification.py | 2 +- examples/contrastive-image-text/run_bridgetower.py | 2 +- examples/contrastive-image-text/run_clip.py | 2 +- examples/image-classification/run_image_classification.py | 2 +- examples/language-modeling/run_clm.py | 2 +- examples/language-modeling/run_lora_clm.py | 2 +- examples/language-modeling/run_mlm.py | 2 +- examples/language-modeling/run_multitask_prompt_tuning.py | 4 ++-- examples/language-modeling/run_prompt_tuning_clm.py | 4 ++-- examples/protein-folding/run_esmfold.py | 2 +- examples/protein-folding/run_sequence_classification.py | 2 +- examples/protein-folding/run_zero_shot_eval.py | 2 +- examples/question-answering/run_qa.py | 2 +- examples/question-answering/run_seq2seq_qa.py | 2 +- examples/speech-recognition/run_speech_recognition_ctc.py | 2 +- examples/speech-recognition/run_speech_recognition_seq2seq.py | 2 +- examples/stable-diffusion/image_to_image_generation.py | 2 +- examples/stable-diffusion/image_to_video_generation.py | 2 +- examples/stable-diffusion/text_to_image_generation.py | 2 +- examples/stable-diffusion/training/textual_inversion.py | 2 +- examples/stable-diffusion/training/train_controlnet.py | 4 ++-- .../stable-diffusion/training/train_dreambooth_lora_sdxl.py | 2 +- .../stable-diffusion/training/train_text_to_image_sdxl.py | 2 +- examples/stable-diffusion/unconditional_image_generation.py | 4 ++-- examples/summarization/run_summarization.py | 2 +- examples/text-classification/run_glue.py | 2 +- examples/translation/run_translation.py | 2 +- tests/example_diff/run_audio_classification.txt | 2 +- tests/example_diff/run_clip.txt | 2 +- tests/example_diff/run_clm.txt | 2 +- tests/example_diff/run_glue.txt | 2 +- tests/example_diff/run_image_classification.txt | 2 +- tests/example_diff/run_mlm.txt | 2 +- tests/example_diff/run_qa.txt | 2 +- tests/example_diff/run_seq2seq_qa.txt | 2 +- tests/example_diff/run_speech_recognition_ctc.txt | 2 +- tests/example_diff/run_speech_recognition_seq2seq.txt | 2 +- tests/example_diff/run_summarization.txt | 2 +- tests/example_diff/run_translation.txt | 2 +- 39 files changed, 43 insertions(+), 43 deletions(-) diff --git a/examples/audio-classification/run_audio_classification.py b/examples/audio-classification/run_audio_classification.py index 86dc6627dd..b05e6dfb51 100644 --- a/examples/audio-classification/run_audio_classification.py +++ b/examples/audio-classification/run_audio_classification.py @@ -47,7 +47,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. 
check_min_version("4.43.0") -check_optimum_habana_min_version("1.13.0") +check_optimum_habana_min_version("1.14.0.dev0") require_version("datasets>=1.14.0", "To fix: pip install -r examples/pytorch/audio-classification/requirements.txt") diff --git a/examples/contrastive-image-text/run_bridgetower.py b/examples/contrastive-image-text/run_bridgetower.py index c22682203e..11ff5a55b0 100644 --- a/examples/contrastive-image-text/run_bridgetower.py +++ b/examples/contrastive-image-text/run_bridgetower.py @@ -57,7 +57,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. check_min_version("4.43.0") -check_optimum_habana_min_version("1.13.0") +check_optimum_habana_min_version("1.14.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt") diff --git a/examples/contrastive-image-text/run_clip.py b/examples/contrastive-image-text/run_clip.py index 2358412de6..941dade8f9 100644 --- a/examples/contrastive-image-text/run_clip.py +++ b/examples/contrastive-image-text/run_clip.py @@ -62,7 +62,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. check_min_version("4.43.0") -check_optimum_habana_min_version("1.13.0") +check_optimum_habana_min_version("1.14.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt") diff --git a/examples/image-classification/run_image_classification.py b/examples/image-classification/run_image_classification.py index 7bd1d23c4d..4d2e229db1 100644 --- a/examples/image-classification/run_image_classification.py +++ b/examples/image-classification/run_image_classification.py @@ -64,7 +64,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. check_min_version("4.43.0") -check_optimum_habana_min_version("1.13.0") +check_optimum_habana_min_version("1.14.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt") diff --git a/examples/language-modeling/run_clm.py b/examples/language-modeling/run_clm.py index 3e372d17a6..ec6b345d89 100644 --- a/examples/language-modeling/run_clm.py +++ b/examples/language-modeling/run_clm.py @@ -63,7 +63,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. check_min_version("4.43.0") -check_optimum_habana_min_version("1.13.0") +check_optimum_habana_min_version("1.14.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/language-modeling/run_lora_clm.py b/examples/language-modeling/run_lora_clm.py index f1c39f6db7..4640bb0dc9 100644 --- a/examples/language-modeling/run_lora_clm.py +++ b/examples/language-modeling/run_lora_clm.py @@ -60,7 +60,7 @@ def check_optimum_habana_min_version(*a, **b): logger = logging.getLogger(__name__) # Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks. 
-check_optimum_habana_min_version("1.10.0") +check_optimum_habana_min_version("1.14.0.dev0") @dataclass diff --git a/examples/language-modeling/run_mlm.py b/examples/language-modeling/run_mlm.py index 7fb0ce8494..7a660447b8 100644 --- a/examples/language-modeling/run_mlm.py +++ b/examples/language-modeling/run_mlm.py @@ -62,7 +62,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. check_min_version("4.43.0") -check_optimum_habana_min_version("1.13.0") +check_optimum_habana_min_version("1.14.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/language-modeling/run_multitask_prompt_tuning.py b/examples/language-modeling/run_multitask_prompt_tuning.py index 9f7d10655c..48f9cefcb7 100644 --- a/examples/language-modeling/run_multitask_prompt_tuning.py +++ b/examples/language-modeling/run_multitask_prompt_tuning.py @@ -60,8 +60,8 @@ def check_optimum_habana_min_version(*a, **b): logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risk. -check_min_version("4.38.0") -check_optimum_habana_min_version("1.10.0") +check_min_version("4.43.0") +check_optimum_habana_min_version("1.14.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/language-modeling/run_prompt_tuning_clm.py b/examples/language-modeling/run_prompt_tuning_clm.py index 42798c0d5e..2d2b9c4c3e 100644 --- a/examples/language-modeling/run_prompt_tuning_clm.py +++ b/examples/language-modeling/run_prompt_tuning_clm.py @@ -62,8 +62,8 @@ def check_optimum_habana_min_version(*a, **b): logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. -check_min_version("4.38.0") -check_optimum_habana_min_version("1.10.0") +check_min_version("4.43.0") +check_optimum_habana_min_version("1.14.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/protein-folding/run_esmfold.py b/examples/protein-folding/run_esmfold.py index 489faea855..4fa8d7a441 100644 --- a/examples/protein-folding/run_esmfold.py +++ b/examples/protein-folding/run_esmfold.py @@ -40,7 +40,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks. -check_optimum_habana_min_version("1.13.0") +check_optimum_habana_min_version("1.14.0.dev0") def convert_outputs_to_pdb(outputs): diff --git a/examples/protein-folding/run_sequence_classification.py b/examples/protein-folding/run_sequence_classification.py index 8590e4eaa9..dde75a2564 100644 --- a/examples/protein-folding/run_sequence_classification.py +++ b/examples/protein-folding/run_sequence_classification.py @@ -41,7 +41,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks. 
-check_optimum_habana_min_version("1.13.0") +check_optimum_habana_min_version("1.14.0.dev0") logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) diff --git a/examples/protein-folding/run_zero_shot_eval.py b/examples/protein-folding/run_zero_shot_eval.py index 348665c59d..3b475883e8 100644 --- a/examples/protein-folding/run_zero_shot_eval.py +++ b/examples/protein-folding/run_zero_shot_eval.py @@ -36,7 +36,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks. -check_optimum_habana_min_version("1.13.0") +check_optimum_habana_min_version("1.14.0.dev0") logging.basicConfig( diff --git a/examples/question-answering/run_qa.py b/examples/question-answering/run_qa.py index 6e0c35620f..b983055f31 100644 --- a/examples/question-answering/run_qa.py +++ b/examples/question-answering/run_qa.py @@ -61,7 +61,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. check_min_version("4.43.0") -check_optimum_habana_min_version("1.13.0") +check_optimum_habana_min_version("1.14.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/question-answering/run_seq2seq_qa.py b/examples/question-answering/run_seq2seq_qa.py index e9e789b440..8249e659a1 100644 --- a/examples/question-answering/run_seq2seq_qa.py +++ b/examples/question-answering/run_seq2seq_qa.py @@ -57,7 +57,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. check_min_version("4.43.0") -check_optimum_habana_min_version("1.13.0") +check_optimum_habana_min_version("1.14.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/speech-recognition/run_speech_recognition_ctc.py b/examples/speech-recognition/run_speech_recognition_ctc.py index f494d5ea29..c1367e0668 100644 --- a/examples/speech-recognition/run_speech_recognition_ctc.py +++ b/examples/speech-recognition/run_speech_recognition_ctc.py @@ -60,7 +60,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. check_min_version("4.43.0") -check_optimum_habana_min_version("1.13.0") +check_optimum_habana_min_version("1.14.0.dev0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/speech-recognition/run_speech_recognition_seq2seq.py b/examples/speech-recognition/run_speech_recognition_seq2seq.py index 66ed34f476..e9abca3b92 100755 --- a/examples/speech-recognition/run_speech_recognition_seq2seq.py +++ b/examples/speech-recognition/run_speech_recognition_seq2seq.py @@ -56,7 +56,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
check_min_version("4.43.0") -check_optimum_habana_min_version("1.13.0") +check_optimum_habana_min_version("1.14.0.dev0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/stable-diffusion/image_to_image_generation.py b/examples/stable-diffusion/image_to_image_generation.py index 1a11d6eef9..64fecf4c91 100755 --- a/examples/stable-diffusion/image_to_image_generation.py +++ b/examples/stable-diffusion/image_to_image_generation.py @@ -40,7 +40,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks. -check_optimum_habana_min_version("1.13.0") +check_optimum_habana_min_version("1.14.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/stable-diffusion/image_to_video_generation.py b/examples/stable-diffusion/image_to_video_generation.py index c9142f0c0e..048f699ce2 100755 --- a/examples/stable-diffusion/image_to_video_generation.py +++ b/examples/stable-diffusion/image_to_video_generation.py @@ -34,7 +34,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks. -check_optimum_habana_min_version("1.13.0") +check_optimum_habana_min_version("1.14.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/stable-diffusion/text_to_image_generation.py b/examples/stable-diffusion/text_to_image_generation.py index 035e486061..1fa231f992 100755 --- a/examples/stable-diffusion/text_to_image_generation.py +++ b/examples/stable-diffusion/text_to_image_generation.py @@ -39,7 +39,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks. -check_optimum_habana_min_version("1.13.0") +check_optimum_habana_min_version("1.14.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/stable-diffusion/training/textual_inversion.py b/examples/stable-diffusion/training/textual_inversion.py index f968ac808c..db488f8749 100644 --- a/examples/stable-diffusion/training/textual_inversion.py +++ b/examples/stable-diffusion/training/textual_inversion.py @@ -79,7 +79,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.26.0") +check_min_version("0.29.0") logger = get_logger(__name__) diff --git a/examples/stable-diffusion/training/train_controlnet.py b/examples/stable-diffusion/training/train_controlnet.py index 0dd6a0102b..696b54baca 100644 --- a/examples/stable-diffusion/training/train_controlnet.py +++ b/examples/stable-diffusion/training/train_controlnet.py @@ -68,12 +68,12 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks. -check_optimum_habana_min_version("1.10.0") +check_optimum_habana_min_version("1.14.0.dev0") if is_wandb_available(): import wandb # Will error if the minimal version of diffusers is not installed. Remove at your own risks. 
-check_min_version("0.26.0") +check_min_version("0.29.0") logger = get_logger(__name__) diff --git a/examples/stable-diffusion/training/train_dreambooth_lora_sdxl.py b/examples/stable-diffusion/training/train_dreambooth_lora_sdxl.py index ea34c50773..b177cf12e6 100644 --- a/examples/stable-diffusion/training/train_dreambooth_lora_sdxl.py +++ b/examples/stable-diffusion/training/train_dreambooth_lora_sdxl.py @@ -75,7 +75,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.26.0") +check_min_version("0.29.0") logger = get_logger(__name__) diff --git a/examples/stable-diffusion/training/train_text_to_image_sdxl.py b/examples/stable-diffusion/training/train_text_to_image_sdxl.py index 46a1f1c150..c9d84ae1b9 100644 --- a/examples/stable-diffusion/training/train_text_to_image_sdxl.py +++ b/examples/stable-diffusion/training/train_text_to_image_sdxl.py @@ -73,7 +73,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.26.0") +check_min_version("0.29.0") logger = get_logger(__name__, log_level="INFO") diff --git a/examples/stable-diffusion/unconditional_image_generation.py b/examples/stable-diffusion/unconditional_image_generation.py index 93ebb59824..df0575c0a7 100644 --- a/examples/stable-diffusion/unconditional_image_generation.py +++ b/examples/stable-diffusion/unconditional_image_generation.py @@ -19,8 +19,8 @@ def check_optimum_habana_min_version(*a, **b): return () -check_min_version("4.37.0") -check_optimum_habana_min_version("1.10.4") +check_min_version("4.43.0") +check_optimum_habana_min_version("1.14.0.dev0") # Setup logging logging.basicConfig( diff --git a/examples/summarization/run_summarization.py b/examples/summarization/run_summarization.py index 122477aed4..ea5e002450 100755 --- a/examples/summarization/run_summarization.py +++ b/examples/summarization/run_summarization.py @@ -66,7 +66,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. check_min_version("4.43.0") -check_optimum_habana_min_version("1.13.0") +check_optimum_habana_min_version("1.14.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/text-classification/run_glue.py b/examples/text-classification/run_glue.py index 5f5cb45b1b..9dfd2adcfc 100755 --- a/examples/text-classification/run_glue.py +++ b/examples/text-classification/run_glue.py @@ -58,7 +58,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. check_min_version("4.43.0") -check_optimum_habana_min_version("1.13.0") +check_optimum_habana_min_version("1.14.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/translation/run_translation.py b/examples/translation/run_translation.py index 0dec28ed39..8d13b39923 100644 --- a/examples/translation/run_translation.py +++ b/examples/translation/run_translation.py @@ -63,7 +63,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. 
check_min_version("4.43.0") -check_optimum_habana_min_version("1.13.0") +check_optimum_habana_min_version("1.14.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") diff --git a/tests/example_diff/run_audio_classification.txt b/tests/example_diff/run_audio_classification.txt index 1314c4bebd..5e98ce8248 100644 --- a/tests/example_diff/run_audio_classification.txt +++ b/tests/example_diff/run_audio_classification.txt @@ -34,7 +34,7 @@ --- > # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. > check_min_version("4.43.0") -> check_optimum_habana_min_version("1.13.0") +> check_optimum_habana_min_version("1.14.0.dev0") 174,176d175 < freeze_feature_extractor: Optional[bool] = field( < default=None, metadata={"help": "Whether to freeze the feature extractor layers of the model."} diff --git a/tests/example_diff/run_clip.txt b/tests/example_diff/run_clip.txt index 3999665da1..f57b3b3240 100644 --- a/tests/example_diff/run_clip.txt +++ b/tests/example_diff/run_clip.txt @@ -29,7 +29,7 @@ --- > # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. > check_min_version("4.43.0") -> check_optimum_habana_min_version("1.13.0") +> check_optimum_habana_min_version("1.14.0.dev0") 181a190,192 > mediapipe_dataloader: bool = field( > default=False, metadata={"help": "Turn on MediaPipe hardware-based accelerated data loading."} diff --git a/tests/example_diff/run_clm.txt b/tests/example_diff/run_clm.txt index 00bb6f6097..580f3c9684 100644 --- a/tests/example_diff/run_clm.txt +++ b/tests/example_diff/run_clm.txt @@ -39,7 +39,7 @@ 63a64,69 > # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. > check_min_version("4.43.0") -> check_optimum_habana_min_version("1.13.0") +> check_optimum_habana_min_version("1.14.0.dev0") > > require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") > diff --git a/tests/example_diff/run_glue.txt b/tests/example_diff/run_glue.txt index 282e3cd6b2..26d2e245c0 100644 --- a/tests/example_diff/run_glue.txt +++ b/tests/example_diff/run_glue.txt @@ -28,7 +28,7 @@ > > # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. > check_min_version("4.43.0") -> check_optimum_habana_min_version("1.13.0") +> check_optimum_habana_min_version("1.14.0.dev0") 67,68d76 < logger = logging.getLogger(__name__) < diff --git a/tests/example_diff/run_image_classification.txt b/tests/example_diff/run_image_classification.txt index 31b247a8ab..7a3e696fd6 100644 --- a/tests/example_diff/run_image_classification.txt +++ b/tests/example_diff/run_image_classification.txt @@ -29,7 +29,7 @@ --- > # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. > check_min_version("4.43.0") -> check_optimum_habana_min_version("1.13.0") +> check_optimum_habana_min_version("1.14.0.dev0") 184c192 < parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) --- diff --git a/tests/example_diff/run_mlm.txt b/tests/example_diff/run_mlm.txt index 698be685c5..a3e97b56c7 100644 --- a/tests/example_diff/run_mlm.txt +++ b/tests/example_diff/run_mlm.txt @@ -35,7 +35,7 @@ > > # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. 
> check_min_version("4.43.0") -> check_optimum_habana_min_version("1.13.0") +> check_optimum_habana_min_version("1.14.0.dev0") > > require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") > diff --git a/tests/example_diff/run_qa.txt b/tests/example_diff/run_qa.txt index 60c1e52e31..4d289c5faa 100644 --- a/tests/example_diff/run_qa.txt +++ b/tests/example_diff/run_qa.txt @@ -33,7 +33,7 @@ 58a62,67 > # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. > check_min_version("4.43.0") -> check_optimum_habana_min_version("1.13.0") +> check_optimum_habana_min_version("1.14.0.dev0") > > require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") > diff --git a/tests/example_diff/run_seq2seq_qa.txt b/tests/example_diff/run_seq2seq_qa.txt index 78033eeafa..96bcd84b82 100644 --- a/tests/example_diff/run_seq2seq_qa.txt +++ b/tests/example_diff/run_seq2seq_qa.txt @@ -25,7 +25,7 @@ 54a58,63 > # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. > check_min_version("4.43.0") -> check_optimum_habana_min_version("1.13.0") +> check_optimum_habana_min_version("1.14.0.dev0") > > require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") > diff --git a/tests/example_diff/run_speech_recognition_ctc.txt b/tests/example_diff/run_speech_recognition_ctc.txt index 3d366814c3..d9bb9d115e 100644 --- a/tests/example_diff/run_speech_recognition_ctc.txt +++ b/tests/example_diff/run_speech_recognition_ctc.txt @@ -26,7 +26,7 @@ 59a61,66 > # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. > check_min_version("4.43.0") -> check_optimum_habana_min_version("1.13.0") +> check_optimum_habana_min_version("1.14.0.dev0") > > require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") > diff --git a/tests/example_diff/run_speech_recognition_seq2seq.txt b/tests/example_diff/run_speech_recognition_seq2seq.txt index 847b742bb4..0fce8cc3e0 100644 --- a/tests/example_diff/run_speech_recognition_seq2seq.txt +++ b/tests/example_diff/run_speech_recognition_seq2seq.txt @@ -23,7 +23,7 @@ < check_min_version("4.45.0.dev0") --- > check_min_version("4.43.0") -> check_optimum_habana_min_version("1.13.0") +> check_optimum_habana_min_version("1.14.0.dev0") 230a239,242 > label_features_max_length: int = field( > default=None, diff --git a/tests/example_diff/run_summarization.txt b/tests/example_diff/run_summarization.txt index 6bf6dc6aba..aaa348da39 100644 --- a/tests/example_diff/run_summarization.txt +++ b/tests/example_diff/run_summarization.txt @@ -37,7 +37,7 @@ 60a67,72 > # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. 
> check_min_version("4.43.0") -> check_optimum_habana_min_version("1.13.0") +> check_optimum_habana_min_version("1.14.0.dev0") > > require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") > diff --git a/tests/example_diff/run_translation.txt b/tests/example_diff/run_translation.txt index 8d17c3c087..95f2749242 100644 --- a/tests/example_diff/run_translation.txt +++ b/tests/example_diff/run_translation.txt @@ -29,7 +29,7 @@ 60a64,69 > # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. > check_min_version("4.43.0") -> check_optimum_habana_min_version("1.13.0") +> check_optimum_habana_min_version("1.14.0.dev0") > > require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") > From 2e6a0da524974bb21586ec6b2fb8d246f7c42f63 Mon Sep 17 00:00:00 2001 From: Yeonsil Yoon Date: Thu, 29 Aug 2024 01:17:47 -0700 Subject: [PATCH 14/24] Update FusedSDPA calling method as Gaudi documentation (#1285) --- .../gpt_bigcode/modeling_gpt_bigcode.py | 66 +++++++++++-------- 1 file changed, 37 insertions(+), 29 deletions(-) diff --git a/optimum/habana/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py b/optimum/habana/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py index 8aee605480..7d2a065593 100644 --- a/optimum/habana/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +++ b/optimum/habana/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py @@ -21,7 +21,16 @@ def gaudi_flash_attn_v1( - query_layer, key_layer, value_layer, attention_mask, dropout_rate, is_causal, scale, softmax_mode, q_block_size + query_layer, + key_layer, + value_layer, + attention_mask, + dropout_rate, + is_causal, + scale, + softmax_mode, + enable_recompute, + q_block_size, ): """ Gaudi version of Flash Attention V1 to support long sequence at prompt phase @@ -42,7 +51,7 @@ def gaudi_flash_attn_v1( row_q = query_layer[:, :, s:e, :] row_mask = attention_mask[:, :, s:e, :] attn_output_partial = FusedSDPA.apply( - row_q, key_layer, value_layer, row_mask, dropout_rate, is_causal, scale, softmax_mode + row_q, key_layer, value_layer, row_mask, dropout_rate, is_causal, scale, softmax_mode, enable_recompute ) row_o_list.append(attn_output_partial) attn_output = torch.cat(row_o_list, dim=-2) @@ -106,33 +115,32 @@ def apply_FusedSDPA( else: use_causal_mask = self.is_causal and attention_mask is None and query_length > 1 - import habana_frameworks.torch.hpu as ht - - with ht.sdp_kernel(enable_recompute=enable_recompute): - if query_length > 8192: - sdpa_result = gaudi_flash_attn_v1( - query, - key, - value, - attention_mask, - self.attn_pdrop if self.training else 0.0, - use_causal_mask, - scale, - "fast" if flash_attention_fast_softmax else "None", - 4096, - ) - htcore.mark_step() - else: - sdpa_result = FusedSDPA.apply( - query, - key, - value, - attention_mask, - self.attn_pdrop if self.training else 0.0, - use_causal_mask, - scale, - "fast" if flash_attention_fast_softmax else "None", - ) + if query_length > 8192: + sdpa_result = gaudi_flash_attn_v1( + query, + key, + value, + attention_mask, + self.attn_pdrop if self.training else 0.0, + use_causal_mask, + scale, + "fast" if flash_attention_fast_softmax else "None", + enable_recompute, + 4096, + ) + htcore.mark_step() + else: + sdpa_result = FusedSDPA.apply( + query, + key, + value, + attention_mask, + self.attn_pdrop if self.training else 0.0, + use_causal_mask, + scale, + "fast" if flash_attention_fast_softmax 
else "None", + enable_recompute, + ) if self.multi_query: # (batch_size, num_heads, seq_len, head_dim) --> (batch_size, seq_len, num_heads, head_dim) From e019bce0d77cd2d1321745eda05241ccaa508cef Mon Sep 17 00:00:00 2001 From: Iman Gohari Date: Thu, 29 Aug 2024 07:10:52 -0700 Subject: [PATCH 15/24] Mixtral fp8 tests (#1269) --- tests/test_text_generation_example.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/tests/test_text_generation_example.py b/tests/test_text_generation_example.py index c2ac38e873..9c4e983576 100644 --- a/tests/test_text_generation_example.py +++ b/tests/test_text_generation_example.py @@ -57,7 +57,11 @@ ("mistralai/Mistral-7B-Instruct-v0.2", 1, 120, True, 128, 2048, 6979.225194247115), ("mistralai/Mistral-7B-Instruct-v0.2", 1, 120, True, 2048, 128, 1681.4401450088983), ("mistralai/Mistral-7B-Instruct-v0.2", 1, 44, True, 2048, 2048, 3393.149396451692), - ("mistralai/Mixtral-8x7B-v0.1", 1, 1, True, 128, 128, 39.26845661768185), + ("mistralai/Mixtral-8x7B-v0.1", 1, 1, True, 128, 128, 40.94), + ("mistralai/Mixtral-8x7B-v0.1", 2, 768, True, 128, 128, 3428.65), + ("mistralai/Mixtral-8x7B-v0.1", 2, 96, True, 128, 2048, 2570.34), + ("mistralai/Mixtral-8x7B-v0.1", 2, 96, True, 2048, 128, 379.03), + ("mistralai/Mixtral-8x7B-v0.1", 2, 48, True, 2048, 2048, 1147.50), ("microsoft/phi-2", 1, 1, True, 128, 128, 254.08932787178165), ], "deepspeed": [ @@ -200,6 +204,9 @@ def _test_text_generation( command.insert(-2, "--flash_attention_recompute") command.insert(-2, "--attn_softmax_bf16") command.insert(-2, "--trim_logits") + if "Mixtral" in model_name: + command.insert(-2, "--bucket_size 128") + command.insert(-2, "--bucket_internal") elif "falcon-180b" in model_name.lower(): command.insert(-2, "--flash_attention_recompute") @@ -254,9 +261,14 @@ def _test_text_generation( e.args = (f"The following command failed:\n{' '.join(measure_command[:-2])}",) raise - env_variables["QUANT_CONFIG"] = os.path.join( - path_to_example_dir, "text-generation/quantization_config/maxabs_quant.json" - ) + if "Mixtral" in model_name: + env_variables["QUANT_CONFIG"] = os.path.join( + path_to_example_dir, "text-generation/quantization_config/maxabs_quant_mixtral.json" + ) + else: + env_variables["QUANT_CONFIG"] = os.path.join( + path_to_example_dir, "text-generation/quantization_config/maxabs_quant.json" + ) command = [x for y in command for x in re.split(pattern, y) if x] print(f"\n\nCommand to test: {' '.join(command[:-2])}\n") From b05a1a5120d0b72dee1fcda416612c2d02a00b10 Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Thu, 29 Aug 2024 18:52:47 +0200 Subject: [PATCH 16/24] Switch failed code quality check comment to `workflow_run` (#1297) --- .github/workflows/check_code_quality.yml | 16 +++++++----- .../failed_code_quality_check_comment.yml | 26 ++++++++++++++----- 2 files changed, 30 insertions(+), 12 deletions(-) diff --git a/.github/workflows/check_code_quality.yml b/.github/workflows/check_code_quality.yml index be453de234..d68db62369 100644 --- a/.github/workflows/check_code_quality.yml +++ b/.github/workflows/check_code_quality.yml @@ -35,9 +35,13 @@ jobs: source venv/bin/activate ruff check . setup.py ruff format --check . 
setup.py - post-comment: - if: failure() && github.event_name == 'pull_request' - needs: check - uses: ./.github/workflows/failed_code_quality_check_comment.yml - with: - pr-number: ${{ github.event.number }} + - name: Store PR number if failure to post comment + if: failure() && github.event_name == 'pull_request' + env: + PR_NUMBER: ${{ github.event.number }} + run: echo $PR_NUMBER > ./pr_number + - uses: actions/upload-artifact@v4 + if: failure() && github.event_name == 'pull_request' + with: + name: pr-number + path: ./pr_number diff --git a/.github/workflows/failed_code_quality_check_comment.yml b/.github/workflows/failed_code_quality_check_comment.yml index 94aa22c70d..c5dfce0e2f 100644 --- a/.github/workflows/failed_code_quality_check_comment.yml +++ b/.github/workflows/failed_code_quality_check_comment.yml @@ -1,18 +1,32 @@ name: Post comment in PR for failed code quality check on: - workflow_call: - inputs: - pr-number: - required: true - type: number + workflow_run: + workflows: ["Check code quality"] + types: + - completed jobs: post-comment: runs-on: ubuntu-latest + if: github.event.workflow_run.event == 'pull_request' && github.event.workflow_run.conclusion == 'failure' name: Post comment to run make style steps: + - uses: actions/download-artifact@v4 + with: + name: pr-number + - name: Get PR number + id: github-context + run: | + content_pr_number=$(cat ./pr_number) + if [[ $content_pr_number =~ ^[0-9]+$ ]]; then + echo "pr_number=$content_pr_number" >> $GITHUB_OUTPUT + rm -rf ./pr_number + else + echo "Encountered an invalid PR number" + exit 1 + fi - uses: peter-evans/create-or-update-comment@v4 with: - issue-number: ${{ inputs.pr-number }} + issue-number: ${{ steps.github-context.outputs.pr_number }} body: The code quality check failed, please run `make style`. 
From 7c409ad060cd62d081484342e04228860a8ed3c4 Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Fri, 30 Aug 2024 10:57:50 +0200 Subject: [PATCH 17/24] Potential fix for the failed code quality check comment workflow (#1299) --- .../failed_code_quality_check_comment.yml | 24 +++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/.github/workflows/failed_code_quality_check_comment.yml b/.github/workflows/failed_code_quality_check_comment.yml index c5dfce0e2f..89d978f5e1 100644 --- a/.github/workflows/failed_code_quality_check_comment.yml +++ b/.github/workflows/failed_code_quality_check_comment.yml @@ -12,9 +12,29 @@ jobs: if: github.event.workflow_run.event == 'pull_request' && github.event.workflow_run.conclusion == 'failure' name: Post comment to run make style steps: - - uses: actions/download-artifact@v4 + - name: Download artifact + uses: actions/github-script@v3.1.0 with: - name: pr-number + script: | + var artifacts = await github.actions.listWorkflowRunArtifacts({ + owner: context.repo.owner, + repo: context.repo.repo, + run_id: ${{github.event.workflow_run.id }}, + }); + var matchArtifact = artifacts.data.artifacts.filter((artifact) => { + return artifact.name == "pr-number" + })[0]; + var download = await github.actions.downloadArtifact({ + owner: context.repo.owner, + repo: context.repo.repo, + artifact_id: matchArtifact.id, + archive_format: 'zip', + }); + var fs = require('fs'); + fs.writeFileSync('${{steps.setup-env.outputs.current_work_dir}}/pr-number.zip', Buffer.from(download.data)); + - run: unzip pr-number.zip + - name: Display structure of downloaded files + run: ls -l - name: Get PR number id: github-context run: | From 5092e4cd4c5c1b049fda6e861204098777e81c1a Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Fri, 30 Aug 2024 09:04:56 +0000 Subject: [PATCH 18/24] Potential fix 2 for failed code quality check comment workflow --- .../failed_code_quality_check_comment.yml | 25 +++---------------- 1 file changed, 3 insertions(+), 22 deletions(-) diff --git a/.github/workflows/failed_code_quality_check_comment.yml b/.github/workflows/failed_code_quality_check_comment.yml index 89d978f5e1..11db76982c 100644 --- a/.github/workflows/failed_code_quality_check_comment.yml +++ b/.github/workflows/failed_code_quality_check_comment.yml @@ -12,29 +12,10 @@ jobs: if: github.event.workflow_run.event == 'pull_request' && github.event.workflow_run.conclusion == 'failure' name: Post comment to run make style steps: - - name: Download artifact - uses: actions/github-script@v3.1.0 + - uses: actions/checkout@v2 + - uses: actions/download-artifact@v4 with: - script: | - var artifacts = await github.actions.listWorkflowRunArtifacts({ - owner: context.repo.owner, - repo: context.repo.repo, - run_id: ${{github.event.workflow_run.id }}, - }); - var matchArtifact = artifacts.data.artifacts.filter((artifact) => { - return artifact.name == "pr-number" - })[0]; - var download = await github.actions.downloadArtifact({ - owner: context.repo.owner, - repo: context.repo.repo, - artifact_id: matchArtifact.id, - archive_format: 'zip', - }); - var fs = require('fs'); - fs.writeFileSync('${{steps.setup-env.outputs.current_work_dir}}/pr-number.zip', Buffer.from(download.data)); - - run: unzip pr-number.zip - - name: Display structure of downloaded files - run: ls -l + name: pr-number - name: Get PR number id: github-context run: | From 9a29cc205c5d1b32a3c3c1123bfb074d7c71c5ac Mon Sep 17 00:00:00 2001 
From: regisss <15324346+regisss@users.noreply.github.com> Date: Fri, 30 Aug 2024 09:08:12 +0000 Subject: [PATCH 19/24] Potential fix 3 for failed code quality check workflow --- .github/workflows/failed_code_quality_check_comment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/failed_code_quality_check_comment.yml b/.github/workflows/failed_code_quality_check_comment.yml index 11db76982c..f81f16e3fe 100644 --- a/.github/workflows/failed_code_quality_check_comment.yml +++ b/.github/workflows/failed_code_quality_check_comment.yml @@ -12,10 +12,10 @@ jobs: if: github.event.workflow_run.event == 'pull_request' && github.event.workflow_run.conclusion == 'failure' name: Post comment to run make style steps: - - uses: actions/checkout@v2 - uses: actions/download-artifact@v4 with: name: pr-number + run-id: ${{github.event.workflow_run.id }} - name: Get PR number id: github-context run: | From 46c2d5910069e6a87e53032b0558fdd7bb75c106 Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Fri, 30 Aug 2024 09:22:00 +0000 Subject: [PATCH 20/24] Other potentiel fix --- .github/workflows/failed_code_quality_check_comment.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/failed_code_quality_check_comment.yml b/.github/workflows/failed_code_quality_check_comment.yml index f81f16e3fe..0e6b42fb27 100644 --- a/.github/workflows/failed_code_quality_check_comment.yml +++ b/.github/workflows/failed_code_quality_check_comment.yml @@ -12,10 +12,11 @@ jobs: if: github.event.workflow_run.event == 'pull_request' && github.event.workflow_run.conclusion == 'failure' name: Post comment to run make style steps: - - uses: actions/download-artifact@v4 + - name: Download artifact + uses: dawidd6/action-download-artifact@v6 with: name: pr-number - run-id: ${{github.event.workflow_run.id }} + run_id: ${{github.event.workflow_run.id }} - name: Get PR number id: github-context run: | From f9d46eb3f00651f708873fd7282332b2d20c9508 Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Fri, 30 Aug 2024 09:39:50 +0000 Subject: [PATCH 21/24] New potential fix --- .github/workflows/failed_code_quality_check_comment.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/failed_code_quality_check_comment.yml b/.github/workflows/failed_code_quality_check_comment.yml index 0e6b42fb27..9d376fc62f 100644 --- a/.github/workflows/failed_code_quality_check_comment.yml +++ b/.github/workflows/failed_code_quality_check_comment.yml @@ -11,6 +11,8 @@ jobs: runs-on: ubuntu-latest if: github.event.workflow_run.event == 'pull_request' && github.event.workflow_run.conclusion == 'failure' name: Post comment to run make style + permissions: + pull-requests: write steps: - name: Download artifact uses: dawidd6/action-download-artifact@v6 From e7d62b35958266038c2247e59eae4d8f24f04db7 Mon Sep 17 00:00:00 2001 From: Daniel Huang Date: Fri, 30 Aug 2024 04:59:04 -0700 Subject: [PATCH 22/24] Enabling Text to Video Diffusion Model Generation (#1109) --- README.md | 1 + docs/source/index.mdx | 1 + examples/text-to-video/README.md | 41 ++ examples/text-to-video/requirements.txt | 1 + .../text-to-video/text_to_video_generation.py | 216 ++++++++ optimum/habana/diffusers/__init__.py | 1 + .../pipeline_text_to_video_synth.py | 465 ++++++++++++++++++ tests/test_diffusers.py | 127 +++++ 8 files changed, 853 insertions(+) create mode 100644 examples/text-to-video/README.md create mode 100644 
examples/text-to-video/requirements.txt
 create mode 100755 examples/text-to-video/text_to_video_generation.py
 create mode 100644 optimum/habana/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py

diff --git a/README.md b/README.md
index b78a19080e..2decc5ec47 100644
--- a/README.md
+++ b/README.md
@@ -230,6 +230,7 @@ The following model architectures, tasks and device distributions have been vali
 | Stable Diffusion | <li>[textual inversion](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion/training#textual-inversion)</li><li>[ControlNet](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion/training#controlnet-training)</li> | <li>Single card</li> | <li>[text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion)</li> |
 | Stable Diffusion XL | <li>[fine-tuning](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion/training#fine-tuning-for-stable-diffusion-xl)</li> | <li>Single card</li> | <li>[text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion)</li> |
 | LDM3D | | <li>Single card</li> | <li>[text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion)</li> |
+| Text to Video | | <li>Single card</li> | <li>[text-to-video generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-to-video)</li> |
diff --git a/docs/source/index.mdx b/docs/source/index.mdx
index 9b6de456c5..17f099f8ee 100644
--- a/docs/source/index.mdx
+++ b/docs/source/index.mdx
@@ -84,6 +84,7 @@ In the tables below, ✅ means single-card, multi-card and DeepSpeed have all be
 | Stable Diffusion | <li>[textual inversion](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion/training#textual-inversion)</li><li>[ControlNet](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion/training#controlnet-training)</li> | <li>Single card</li> | <li>[text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion)</li> |
 | Stable Diffusion XL | <li>[fine-tuning](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion/training#fine-tuning-for-stable-diffusion-xl)</li> | <li>Single card</li> | <li>[text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion)</li> |
 | LDM3D | | <li>Single card</li> | <li>[text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion)</li> |
+| Text to Video | | <li>Single card</li> | <li>[text-to-video generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-to-video)</li>
  • | - PyTorch Image Models/TIMM: diff --git a/examples/text-to-video/README.md b/examples/text-to-video/README.md new file mode 100644 index 0000000000..1df4e44e59 --- /dev/null +++ b/examples/text-to-video/README.md @@ -0,0 +1,41 @@ + + +# Text to Video Examples + +This directory contains a script that showcases how to use the `GaudiTextToVideoSDPipeline` to run text-to-video generation tasks on HPUs. + +## Requirements + +First, you should install the requirements: + +```bash +pip install -r requirements.txt +``` + +## Single-HPU inference + +```bash +python3 text_to_video_generation.py \ + --model_name_or_path ali-vilab/text-to-video-ms-1.7b \ + --prompts "An astronaut riding a horse" \ + --use_habana \ + --use_hpu_graphs \ + --dtype bf16 +``` + +Models that have been validated: + - [ali-vilab/text-to-video-ms-1.7b](https://huggingface.co/ali-vilab/text-to-video-ms-1.7b) diff --git a/examples/text-to-video/requirements.txt b/examples/text-to-video/requirements.txt new file mode 100644 index 0000000000..6ab6d0d570 --- /dev/null +++ b/examples/text-to-video/requirements.txt @@ -0,0 +1 @@ +opencv-python-headless diff --git a/examples/text-to-video/text_to_video_generation.py b/examples/text-to-video/text_to_video_generation.py new file mode 100755 index 0000000000..4a91359617 --- /dev/null +++ b/examples/text-to-video/text_to_video_generation.py @@ -0,0 +1,216 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and + +# Adapted from ../stable-diffusion/text_to_image_generation.py + +import argparse +import logging +import sys +from pathlib import Path + +import torch +from diffusers.utils.export_utils import export_to_video + +from optimum.habana.diffusers import GaudiTextToVideoSDPipeline +from optimum.habana.transformers.gaudi_configuration import GaudiConfig +from optimum.habana.utils import set_seed + + +try: + from optimum.habana.utils import check_optimum_habana_min_version +except ImportError: + + def check_optimum_habana_min_version(*a, **b): + return () + + +# Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks. +check_optimum_habana_min_version("1.14.0.dev0") + + +logger = logging.getLogger(__name__) + + +def main(): + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + parser.add_argument( + "--model_name_or_path", + default="ali-vilab/text-to-video-ms-1.7b", + type=str, + help="Path to pre-trained model", + ) + # Pipeline arguments + parser.add_argument( + "--prompts", + type=str, + nargs="*", + default="Spiderman is surfing", + help="The prompt or prompts to guide the video generation.", + ) + parser.add_argument( + "--num_videos_per_prompt", type=int, default=1, help="The number of videos to generate per prompt." 
+ ) + parser.add_argument("--batch_size", type=int, default=1, help="The number of videos in a batch.") + parser.add_argument( + "--height", + type=int, + default=0, + help="The height in pixels of the generated videos (0=default from model config).", + ) + parser.add_argument( + "--width", + type=int, + default=0, + help="The width in pixels of the generated videos (0=default from model config).", + ) + parser.add_argument("--num_frames", type=int, default=20, help="The number of frames in the generated videos.") + parser.add_argument( + "--num_inference_steps", + type=int, + default=50, + help=( + "The number of denoising steps. More denoising steps usually lead to a higher quality videos at the expense" + " of slower inference." + ), + ) + parser.add_argument( + "--guidance_scale", + type=float, + default=7.5, + help=( + "Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598)." + " Higher guidance scale encourages to generate videos that are closely linked to the text `prompt`," + " usually at the expense of lower video quality." + ), + ) + parser.add_argument( + "--negative_prompts", + type=str, + nargs="*", + default=None, + help="The prompt or prompts not to guide the video generation.", + ) + parser.add_argument( + "--eta", + type=float, + default=0.0, + help="Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502.", + ) + parser.add_argument( + "--output_type", + type=str, + choices=["mp4", "np"], + default="mp4", + help="Whether to return mp4 or Numpy arrays.", + ) + + parser.add_argument( + "--pipeline_save_dir", + type=str, + default=None, + help="The directory where the generation pipeline will be saved.", + ) + parser.add_argument( + "--video_save_dir", + type=str, + default="./generated-videos", + help="The directory where videos will be saved.", + ) + + parser.add_argument("--seed", type=int, default=42, help="Random seed for initialization.") + + # HPU-specific arguments + parser.add_argument("--use_habana", action="store_true", help="Use HPU.") + parser.add_argument( + "--use_hpu_graphs", action="store_true", help="Use HPU graphs on HPU. This should lead to faster generations." 
+ ) + parser.add_argument( + "--dtype", + default="bf16", + choices=["bf16", "fp32", "autocast_bf16"], + help="Which runtime dtype to perform generation in.", + ) + args = parser.parse_args() + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + logger.setLevel(logging.INFO) + logger.info(f"Arguments: {args}") + + # Set video resolution + kwargs_call = {} + if args.width > 0 and args.height > 0: + kwargs_call["width"] = args.width + kwargs_call["height"] = args.height + kwargs_call["num_frames"] = args.num_frames + + gaudi_config_kwargs = {"use_fused_adam": True, "use_fused_clip_norm": True} + if args.dtype == "autocast_bf16": + gaudi_config_kwargs["use_torch_autocast"] = True + + gaudi_config = GaudiConfig(**gaudi_config_kwargs) + logger.info(f"Gaudi Config: {gaudi_config}") + + kwargs = { + "use_habana": args.use_habana, + "use_hpu_graphs": args.use_hpu_graphs, + "gaudi_config": gaudi_config, + } + if args.dtype == "bf16": + kwargs["torch_dtype"] = torch.bfloat16 + elif args.dtype == "fp32": + kwargs["torch_dtype"] = torch.float32 + + # Generate images + pipeline: GaudiTextToVideoSDPipeline = GaudiTextToVideoSDPipeline.from_pretrained( + args.model_name_or_path, **kwargs + ) + set_seed(args.seed) + outputs = pipeline( + prompt=args.prompts, + num_videos_per_prompt=args.num_videos_per_prompt, + batch_size=args.batch_size, + num_inference_steps=args.num_inference_steps, + guidance_scale=args.guidance_scale, + negative_prompt=args.negative_prompts, + eta=args.eta, + output_type="pil" if args.output_type == "mp4" else args.output_type, # Naming inconsistency in base class + **kwargs_call, + ) + + # Save the pipeline in the specified directory if not None + if args.pipeline_save_dir is not None: + pipeline.save_pretrained(args.pipeline_save_dir) + + # Save images in the specified directory if not None and if they are in PIL format + if args.video_save_dir is not None: + if args.output_type == "mp4": + video_save_dir = Path(args.video_save_dir) + video_save_dir.mkdir(parents=True, exist_ok=True) + logger.info(f"Saving images in {video_save_dir.resolve()}...") + + for i, video in enumerate(outputs.videos): + filename = video_save_dir / f"video_{i + 1}.mp4" + export_to_video(video, str(filename.resolve())) + else: + logger.warning("--output_type should be equal to 'mp4' to save images in --video_save_dir.") + + +if __name__ == "__main__": + main() diff --git a/optimum/habana/diffusers/__init__.py b/optimum/habana/diffusers/__init__.py index 860a97e382..49ea1851bf 100644 --- a/optimum/habana/diffusers/__init__.py +++ b/optimum/habana/diffusers/__init__.py @@ -17,4 +17,5 @@ from .pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img import GaudiStableDiffusionXLImg2ImgPipeline from .pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_inpaint import GaudiStableDiffusionXLInpaintPipeline from .pipelines.stable_video_diffusion.pipeline_stable_video_diffusion import GaudiStableVideoDiffusionPipeline +from .pipelines.text_to_video_synthesis.pipeline_text_to_video_synth import GaudiTextToVideoSDPipeline from .schedulers import GaudiDDIMScheduler, GaudiEulerAncestralDiscreteScheduler, GaudiEulerDiscreteScheduler diff --git a/optimum/habana/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py b/optimum/habana/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py new file mode 100644 index 
0000000000..ffaf25df11 --- /dev/null +++ b/optimum/habana/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py @@ -0,0 +1,465 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from math import ceil +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import PIL.Image +import torch +from diffusers.models import AutoencoderKL, UNet3DConditionModel +from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_synth import TextToVideoSDPipeline +from diffusers.schedulers import KarrasDiffusionSchedulers +from diffusers.utils import logging +from diffusers.utils.outputs import BaseOutput +from transformers import CLIPTextModel, CLIPTokenizer + +from ....transformers.gaudi_configuration import GaudiConfig +from ..pipeline_utils import GaudiDiffusionPipeline + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +@dataclass +class GaudiTextToVideoSDPipelineOutput(BaseOutput): + videos: Union[List[PIL.Image.Image], np.ndarray] + + +class GaudiTextToVideoSDPipeline(GaudiDiffusionPipeline, TextToVideoSDPipeline): + r""" + Adapted from: https://github.com/huggingface/diffusers/blob/v0.26.3/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py#L84 + """ + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet3DConditionModel, + scheduler: KarrasDiffusionSchedulers, + use_habana: bool = False, + use_hpu_graphs: bool = False, + gaudi_config: Union[str, GaudiConfig] = None, + bf16_full_eval: bool = False, + ): + GaudiDiffusionPipeline.__init__( + self, + use_habana, + use_hpu_graphs, + gaudi_config, + bf16_full_eval, + ) + TextToVideoSDPipeline.__init__( + self, + vae, + text_encoder, + tokenizer, + unet, + scheduler, + ) + self.to(self._device) + + def enable_model_cpu_offload(self, *args, **kwargs): + if self.use_habana: + raise NotImplementedError("enable_model_cpu_offload() is not implemented for HPU") + else: + return super().enable_model_cpu_offload(*args, **kwargs) + + def prepare_latents( + self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None + ): + shape = ( + batch_size, + num_channels_latents, + num_frames, + height // self.vae_scale_factor, + width // self.vae_scale_factor, + ) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + if latents is None: + # torch.randn is broken on HPU so running it on CPU + rand_device = "cpu" if device.type == "hpu" else device + if isinstance(generator, list): + shape = (1,) + shape[1:] + latents = [ + torch.randn(shape, generator=generator[i], device=rand_device, dtype=dtype) + for i in range(batch_size) + ] + latents = torch.cat(latents, dim=0).to(device) + else: + latents = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype).to(device) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + # Copied from optimum.habana.diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.GaudiStableDiffusionPipeline._split_inputs_into_batches + @classmethod + def _split_inputs_into_batches(cls, batch_size, latents, prompt_embeds, negative_prompt_embeds): + # Use torch.split to generate num_batches batches of size batch_size + latents_batches = list(torch.split(latents, batch_size)) + prompt_embeds_batches = list(torch.split(prompt_embeds, batch_size)) + if negative_prompt_embeds is not None: + negative_prompt_embeds_batches = list(torch.split(negative_prompt_embeds, batch_size)) + + # If the last batch has less samples than batch_size, pad it with dummy samples + num_dummy_samples = 0 + if latents_batches[-1].shape[0] < batch_size: + num_dummy_samples = batch_size - latents_batches[-1].shape[0] + # Pad latents_batches + sequence_to_stack = (latents_batches[-1],) + tuple( + torch.zeros_like(latents_batches[-1][0][None, :]) for _ in range(num_dummy_samples) + ) + latents_batches[-1] = torch.vstack(sequence_to_stack) + # Pad prompt_embeds_batches + sequence_to_stack = (prompt_embeds_batches[-1],) + tuple( + torch.zeros_like(prompt_embeds_batches[-1][0][None, :]) for _ in range(num_dummy_samples) + ) + prompt_embeds_batches[-1] = torch.vstack(sequence_to_stack) + # Pad negative_prompt_embeds_batches if necessary + if negative_prompt_embeds is not None: + sequence_to_stack = (negative_prompt_embeds_batches[-1],) + tuple( + torch.zeros_like(negative_prompt_embeds_batches[-1][0][None, :]) for _ in range(num_dummy_samples) + ) + negative_prompt_embeds_batches[-1] = torch.vstack(sequence_to_stack) + + # Stack batches in the same tensor + latents_batches = torch.stack(latents_batches) + if negative_prompt_embeds is not None: + # For classifier free guidance, we need to do two forward passes. 
+ # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + for i, (negative_prompt_embeds_batch, prompt_embeds_batch) in enumerate( + zip(negative_prompt_embeds_batches, prompt_embeds_batches[:]) + ): + prompt_embeds_batches[i] = torch.cat([negative_prompt_embeds_batch, prompt_embeds_batch]) + prompt_embeds_batches = torch.stack(prompt_embeds_batches) + + return latents_batches, prompt_embeds_batches, num_dummy_samples + + @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_frames: int = 16, + batch_size: int = 1, + num_inference_steps: int = 50, + guidance_scale: float = 9.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_videos_per_prompt: int = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "np", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + clip_skip: Optional[int] = None, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide video generation. If not defined, you need to pass `prompt_embeds`. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated video. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated video. + num_frames (`int`, *optional*, defaults to 16): + The number of video frames that are generated. Defaults to 16 frames which at 8 frames per seconds + amounts to 2 seconds of video. + batch_size (`int`, *optional*, defaults to 1): + The number of videos in a batch. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality videos at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate videos closely linked to the text + `prompt` at the expense of lower video quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in video generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_videos_per_prompt (`int`, defaults to 1): + The number of videos to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video + generation. 
Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. Latents should be of shape + `(batch_size, num_channel, num_frames, height, width)`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + output_type (`str`, *optional*, defaults to `"np"`): + The output format of the generated video. Choose between `torch.FloatTensor` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead + of a plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + Examples: + + Returns: + [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] is + returned, otherwise a `tuple` is returned where the first element is a list with the generated frames. + """ + with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=self.gaudi_config.use_torch_autocast): + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + height, + width, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ) + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + num_prompts = 1 + elif prompt is not None and isinstance(prompt, list): + num_prompts = len(prompt) + else: + num_prompts = prompt_embeds.shape[0] + num_videos = num_videos_per_prompt * num_prompts + num_batches = ceil((num_videos) / batch_size) + logger.info( + f"{num_prompts} prompt(s) received, {num_videos_per_prompt} generation(s) per prompt, " + f"{batch_size} sample(s) per batch, {num_batches} total batch(es)." 
+ ) + if num_batches < 3: + logger.warning("The first two iterations are slower so it is recommended to feed more batches.") + + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_videos_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, + clip_skip=clip_skip, + ) + + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 5. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + num_prompts * num_videos_per_prompt, + num_channels_latents, + num_frames, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. Split into batches (HPU-specific step) + latents_batches, text_embeddings_batches, num_dummy_samples = self._split_inputs_into_batches( + batch_size, + latents, + prompt_embeds, + negative_prompt_embeds, + ) + + # 8. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + outputs = [] + for j in self.progress_bar(range(num_batches)): + latents_batch = latents_batches[0] + latents_batches = torch.roll(latents_batches, shifts=-1, dims=0) + text_embeddings_batch = text_embeddings_batches[0] + text_embeddings_batches = torch.roll(text_embeddings_batches, shifts=-1, dims=0) + for i in self.progress_bar(range(len(timesteps))): + t = timesteps[0] + timesteps = torch.roll(timesteps, shifts=-1, dims=0) + # expand the latents if we are doing classifier free guidance + latent_model_input = ( + torch.cat([latents_batch] * 2) if do_classifier_free_guidance else latents_batch + ) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet_hpu( + latent_model_input, + t, + text_embeddings_batch, + cross_attention_kwargs=cross_attention_kwargs, + ) + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # reshape latents + bsz, channel, frames, width, height = latents_batch.shape + latents_batch = latents_batch.permute(0, 2, 1, 3, 4).reshape(bsz * frames, channel, width, height) + noise_pred = noise_pred.permute(0, 2, 1, 3, 4).reshape(bsz * frames, channel, width, height) + + # compute the previous noisy sample x_t -> x_t-1 + latents_batch = self.scheduler.step(noise_pred, t, latents_batch, **extra_step_kwargs).prev_sample + + # reshape latents_batch back + latents_batch = ( + latents_batch[None, :].reshape(bsz, frames, channel, width, height).permute(0, 2, 1, 3, 4) + ) + + if not self.use_hpu_graphs: + self.htcore.mark_step() + + # call the callback, if provided + if i == 
len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents_batch) + if output_type == "latent": + video_tensor = latents_batch + else: + video_tensor = self.decode_latents(latents_batch) + outputs.append(video_tensor) + + if not self.use_hpu_graphs: + self.htcore.mark_step() + + # Remove dummy generations if needed + if num_dummy_samples > 0: + outputs[-1] = outputs[-1][:-num_dummy_samples] + + # 9. Post processing + videos = [] + for video_tensor in outputs: + if output_type == "latent": + videos.extend(list(video_tensor)) + continue + video_batch = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type) + + if output_type == "pil" and isinstance(video_batch, list): + videos += video_batch + elif output_type in ["np", "numpy"] and isinstance(video_batch, np.ndarray): + if len(videos) == 0: + videos = video_batch + else: + videos = np.concatenate((videos, video_batch), axis=0) + else: # Torch Tensor + if len(videos) == 0: + videos = video_batch + else: + videos = torch.cat((videos, video_batch), 0) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (videos,) + + return GaudiTextToVideoSDPipelineOutput(videos=videos) + + @torch.no_grad() + def unet_hpu(self, latent_model_input, timestep, encoder_hidden_states, cross_attention_kwargs): + if self.use_hpu_graphs: + return self.capture_replay(latent_model_input, timestep, encoder_hidden_states, cross_attention_kwargs) + else: + return self.unet( + latent_model_input, + timestep, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] + + @torch.no_grad() + def capture_replay(self, latent_model_input, timestep, encoder_hidden_states, cross_attention_kwargs): + inputs = [latent_model_input, timestep, encoder_hidden_states, cross_attention_kwargs, False] + h = self.ht.hpu.graphs.input_hash(inputs) + cached = self.cache.get(h) + + if cached is None: + # Capture the graph and cache it + with self.ht.hpu.stream(self.hpu_stream): + graph = self.ht.hpu.HPUGraph() + graph.capture_begin() + outputs = self.unet( + inputs[0], inputs[1], inputs[2], cross_attention_kwargs=inputs[3], return_dict=inputs[4] + )[0] + graph.capture_end() + graph_inputs = inputs + graph_outputs = outputs + self.cache[h] = self.ht.hpu.graphs.CachedParams(graph_inputs, graph_outputs, graph) + return outputs + + # Replay the cached graph with updated inputs + self.ht.hpu.graphs.copy_to(cached.graph_inputs, inputs) + cached.graph.replay() + self.ht.core.hpu.default_stream().synchronize() + + return cached.graph_outputs diff --git a/tests/test_diffusers.py b/tests/test_diffusers.py index fd4d0fca08..98edfc2243 100755 --- a/tests/test_diffusers.py +++ b/tests/test_diffusers.py @@ -24,6 +24,7 @@ import re import subprocess import tempfile +import time from io import BytesIO, StringIO from pathlib import Path from typing import Callable, Union @@ -51,6 +52,7 @@ StableVideoDiffusionPipeline, UNet2DConditionModel, UNet2DModel, + UNet3DConditionModel, UNetSpatioTemporalConditionModel, UniPCMultistepScheduler, ) @@ -101,6 +103,7 @@ GaudiStableDiffusionXLInpaintPipeline, GaudiStableDiffusionXLPipeline, GaudiStableVideoDiffusionPipeline, + GaudiTextToVideoSDPipeline, ) from optimum.habana.utils import set_seed @@ -119,6 +122,7 @@ CONTROLNET_RUNTIME = 537.4276602957398 
INPAINT_THROUGHPUT_BASELINE_BF16 = 4.584 INPAINT_XL_THROUGHPUT_BASELINE_BF16 = 1.151 + TEXT_TO_VIDEO_SYNTHESIS_BF16_BASELINE = 70 DETERMINISTIC_IMAGE_GENERATION_THROUGHPUT = 0.946 THROUGHPUT_UNCONDITIONAL_IMAGE_BASELINE_BF16 = 7.671212047338486 else: @@ -132,6 +136,7 @@ INPAINT_XL_THROUGHPUT_BASELINE_BF16 = 0.271 DETERMINISTIC_IMAGE_GENERATION_THROUGHPUT = 0.302 THROUGHPUT_UNCONDITIONAL_IMAGE_BASELINE_BF16 = 3.095533166996529 + TEXT_TO_VIDEO_SYNTHESIS_BF16_BASELINE = 1000 # TODO: Get Gaudi 1 benchmark numbers _run_custom_bf16_ops_test_ = parse_flag_from_env("CUSTOM_BF16_OPS", default=False) @@ -3009,6 +3014,128 @@ def test_deterministic_image_generation_no_throughput_regression_bf16(self): self.assertGreaterEqual(outputs.throughput, 0.95 * DETERMINISTIC_IMAGE_GENERATION_THROUGHPUT) +class GaudiTextToVideoSDPipelineTester(TestCase): + """ + Tests the TextToVideoSDPipeline for Gaudi. + Adapted from https://github.com/huggingface/diffusers/blob/v0.24.0-release/tests/pipelines/text_to_video_synthesis/test_text_to_video.py + """ + + def get_dummy_components(self): + set_seed(0) + unet = UNet3DConditionModel( + block_out_channels=(4, 8), + layers_per_block=1, + sample_size=32, + in_channels=4, + out_channels=4, + down_block_types=("CrossAttnDownBlock3D", "DownBlock3D"), + up_block_types=("UpBlock3D", "CrossAttnUpBlock3D"), + cross_attention_dim=4, + attention_head_dim=4, + norm_num_groups=2, + ) + scheduler = GaudiEulerDiscreteScheduler( + beta_start=0.00085, + beta_end=0.012, + beta_schedule="scaled_linear", + interpolation_type="linear", + num_train_timesteps=1000, + prediction_type="v_prediction", + sigma_max=700.0, + sigma_min=0.002, + steps_offset=1, + timestep_spacing="leading", + timestep_type="continuous", + trained_betas=None, + use_karras_sigmas=True, + ) + set_seed(0) + vae = AutoencoderKL( + block_out_channels=(8,), + in_channels=3, + out_channels=3, + down_block_types=("DownEncoderBlock2D",), + up_block_types=("UpDecoderBlock2D",), + latent_channels=4, + sample_size=32, + norm_num_groups=2, + ) + set_seed(0) + text_encoder_config = CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=4, + intermediate_size=16, + layer_norm_eps=1e-05, + num_attention_heads=2, + num_hidden_layers=2, + pad_token_id=1, + vocab_size=1000, + hidden_act="gelu", + projection_dim=32, + ) + text_encoder = CLIPTextModel(text_encoder_config) + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + + components = { + "unet": unet, + "scheduler": scheduler, + "vae": vae, + "text_encoder": text_encoder, + "tokenizer": tokenizer, + } + return components + + def get_dummy_inputs(self, device, seed=0): + generator = torch.Generator(device=device).manual_seed(seed) + inputs = { + "prompt": "A painting of a squirrel eating a burger", + "generator": generator, + "num_inference_steps": 2, + "guidance_scale": 6.0, + "output_type": "numpy", + } + return inputs + + def test_text_to_video_default_case(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + gaudi_config = GaudiConfig(use_torch_autocast=False) + sd_pipe = GaudiTextToVideoSDPipeline(use_habana=True, gaudi_config=gaudi_config, **components) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + inputs["output_type"] = "np" + video = sd_pipe(**inputs).videos[0] + image_slice = video[0][-3:, -3:, -1] + + assert video[0].shape == (32, 32, 3) + expected_slice = np.array( + [0.32823694, 0.5277065, 0.5257378, 
0.51532686, 0.62792695, 0.5966803, 0.55225205, 0.6153607, 0.60387087]
+        )
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    @slow
+    def test_text_to_video_synthesis_no_latency_regression_bf16(self):
+        model_name = "ali-vilab/text-to-video-ms-1.7b"
+        pipeline = GaudiTextToVideoSDPipeline.from_pretrained(
+            model_name,
+            use_habana=True,
+            use_hpu_graphs=True,
+            gaudi_config=GaudiConfig.from_pretrained("Habana/stable-diffusion"),
+            torch_dtype=torch.bfloat16,
+        )
+        set_seed(42)
+        start_time = time.time()
+        prompt = "Spiderman is surfing"
+        outputs = pipeline(prompt, num_inference_steps=50, output_type="pil")
+        latency = time.time() - start_time
+        assert len(outputs.videos[0]) == 16
+
+        assert latency < 1.05 * TEXT_TO_VIDEO_SYNTHESIS_BF16_BASELINE
+
+
 """
 Copied from: https://github.com/huggingface/diffusers/blob/v0.26.3/tests/pipelines/test_pipelines_common.py
 - Remove PipelinePushToHubTester testcase.

From fe8ae86b12d62dac25a3ac6ba2a21993de7239bf Mon Sep 17 00:00:00 2001
From: Pramod Kumar <144990617+pramodkumar-habanalabs@users.noreply.github.com>
Date: Fri, 30 Aug 2024 17:37:08 +0530
Subject: [PATCH 23/24] Prevent Graph break in Llama when using flash attention (#1301)

---
 optimum/habana/transformers/models/llama/modeling_llama.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/optimum/habana/transformers/models/llama/modeling_llama.py b/optimum/habana/transformers/models/llama/modeling_llama.py
index 7d41126390..1abbfab12d 100755
--- a/optimum/habana/transformers/models/llama/modeling_llama.py
+++ b/optimum/habana/transformers/models/llama/modeling_llama.py
@@ -617,7 +617,7 @@ def pre_attn_forward(
         else:
             past_key_value = None
 
-        if use_flash_attention and FusedSDPA:
+        if use_flash_attention and FusedSDPA is not None:
             import habana_frameworks.torch.hpu as ht
 
             softmax_mode = "fast" if flash_attention_fast_softmax else "None"

From dc7d72eb77ab64f8d087b206add1345f8ef4426c Mon Sep 17 00:00:00 2001
From: Akihiro Takahashi
Date: Fri, 2 Aug 2024 10:38:42 -0700
Subject: [PATCH 24/24] Enable MPT fp8 support

Add Softmax and FusedSDPA
Update GaudiMptAttention forward to r4.44.1 base

Co-authored-by: Thanaji Rao Thakkalapelli
---
 optimum/habana/transformers/models/mpt/modeling_mpt.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/optimum/habana/transformers/models/mpt/modeling_mpt.py b/optimum/habana/transformers/models/mpt/modeling_mpt.py
index 2c632619b7..d3563c766e 100755
--- a/optimum/habana/transformers/models/mpt/modeling_mpt.py
+++ b/optimum/habana/transformers/models/mpt/modeling_mpt.py
@@ -16,9 +16,9 @@
 # Copyright (C) 2022-2023 Habana Labs, Ltd. an Intel Company
 ###############################################################################
 import os
-import torch
-
 from typing import Optional, Tuple, Union
+
+import torch
 from torch import nn
 from torch.nn import CrossEntropyLoss
 from transformers.modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions