A better fix for #3 to support new flash-attn versions #38

Open · wants to merge 2 commits into main
6 changes: 3 additions & 3 deletions cm/unet.py
@@ -344,7 +344,7 @@ def __init__(
         from flash_attn.flash_attention import FlashAttention

         assert batch_first
-        factory_kwargs = {"device": device, "dtype": dtype}
+        # factory_kwargs = {"device": device, "dtype": dtype}
         super().__init__()
         self.embed_dim = embed_dim
         self.num_heads = num_heads
@@ -357,7 +357,7 @@ def __init__(
         assert self.head_dim in [16, 32, 64], "Only support head_dim == 16, 32, or 64"

         self.inner_attn = FlashAttention(
-            attention_dropout=attention_dropout, **factory_kwargs
+            attention_dropout=attention_dropout, # **factory_kwargs
         )
         self.rearrange = rearrange

@@ -366,7 +366,7 @@ def forward(self, qkv, attn_mask=None, key_padding_mask=None, need_weights=False
             qkv, "b (three h d) s -> b s three h d", three=3, h=self.num_heads
         )
         qkv, _ = self.inner_attn(
-            qkv,
+            qkv.contiguous(),
             key_padding_mask=key_padding_mask,
             need_weights=need_weights,
             causal=self.causal,
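Note: the diff above simply comments out the factory kwargs, which drops device/dtype placement on older flash-attn releases that still accept them. A minimal sketch of a version-tolerant alternative (not part of this PR; the helper name make_flash_attention and the inspect-based check are assumptions) would pass the factory kwargs only when the installed FlashAttention still takes them:

    import inspect

    from flash_attn.flash_attention import FlashAttention


    def make_flash_attention(attention_dropout, device=None, dtype=None):
        # Newer flash-attn versions removed device/dtype from
        # FlashAttention.__init__, so only forward them when the
        # installed signature still accepts them.
        kwargs = {"attention_dropout": attention_dropout}
        params = inspect.signature(FlashAttention.__init__).parameters
        if "device" in params and "dtype" in params:
            kwargs.update({"device": device, "dtype": dtype})
        return FlashAttention(**kwargs)

The .contiguous() call in forward() is there because einops.rearrange can return a non-contiguous view, and the flash-attn kernel expects a contiguous qkv tensor.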