Commit

bugfix: fix AOT mode unittests (#665)
Follow up of #657
yzh119 authored Dec 16, 2024
1 parent b1b1fb8 commit d9d8eb1
Showing 10 changed files with 235 additions and 225 deletions.
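Every file below receives the same fix: the module-scoped `warmup_jit` fixture used to fall through into the JIT warmup (and a second `yield`) even when prebuilt AOT kernels were available, which broke the AOT-mode unittests; the warmup now lives in an `else:` branch so the fixture yields exactly once on either path. A condensed sketch of the fixed fixture is shown here, using the argument lists from tests/test_alibi.py; the fixture decorator arguments and the `jit_utils` import path are assumptions for illustration, since neither appears in the hunks below.

import pytest
import torch

import flashinfer
# Assumed import path for the shared test helpers that build the JIT spec lists.
from jit_utils import jit_decode_attention_func_args, jit_prefill_attention_func_args


@pytest.fixture(autouse=True, scope="module")  # decorator arguments assumed; not shown in the hunks
def warmup_jit():
    if flashinfer.jit.has_prebuilt_ops:
        # AOT build: kernels are prebuilt, so skip the JIT warmup entirely.
        yield
    else:
        try:
            flashinfer.jit.parallel_load_modules(
                jit_decode_attention_func_args(
                    [torch.float16],  # q_dtypes
                    [torch.float16],  # kv_dtypes
                    [128, 256],       # head_dims
                    [0, 2],           # pos_encoding_modes
                    [False],          # use_sliding_windows
                    [False],          # use_logits_soft_caps
                )
                + jit_prefill_attention_func_args(
                    [torch.float16],  # q_dtypes
                    [torch.float16],  # kv_dtypes
                    [128, 256],       # head_dims
                    [0, 2],           # pos_encoding_modes
                    [False],          # use_sliding_windows
                    [False],          # use_logits_soft_caps
                    [False],          # allow_fp16_qk_reductions
                )
            )
        except Exception as e:
            # Abort the whole test session if warmup fails.
            pytest.exit(str(e))
        finally:
            yield

Each diff applies exactly this restructuring, varying only the dtype, head-dim, and positional-encoding lists passed to the helper builders.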
tests/test_alibi.py: 25 additions, 24 deletions
@@ -27,31 +27,32 @@
 def warmup_jit():
     if flashinfer.jit.has_prebuilt_ops:
         yield
-    try:
-        flashinfer.jit.parallel_load_modules(
-            jit_decode_attention_func_args(
-                [torch.float16],  # q_dtypes
-                [torch.float16],  # kv_dtypes
-                [128, 256],  # head_dims
-                [0, 2],  # pos_encoding_modes
-                [False],  # use_sliding_windows
-                [False],  # use_logits_soft_caps
+    else:
+        try:
+            flashinfer.jit.parallel_load_modules(
+                jit_decode_attention_func_args(
+                    [torch.float16],  # q_dtypes
+                    [torch.float16],  # kv_dtypes
+                    [128, 256],  # head_dims
+                    [0, 2],  # pos_encoding_modes
+                    [False],  # use_sliding_windows
+                    [False],  # use_logits_soft_caps
+                )
+                + jit_prefill_attention_func_args(
+                    [torch.float16],  # q_dtypes
+                    [torch.float16],  # kv_dtypes
+                    [128, 256],  # head_dims
+                    [0, 2],  # pos_encoding_modes
+                    [False],  # use_sliding_windows
+                    [False],  # use_logits_soft_caps
+                    [False],  # allow_fp16_qk_reductions
+                )
             )
-            + jit_prefill_attention_func_args(
-                [torch.float16],  # q_dtypes
-                [torch.float16],  # kv_dtypes
-                [128, 256],  # head_dims
-                [0, 2],  # pos_encoding_modes
-                [False],  # use_sliding_windows
-                [False],  # use_logits_soft_caps
-                [False],  # allow_fp16_qk_reductions
-            )
-        )
-    except Exception as e:
-        # abort the test session if warmup fails
-        pytest.exit(str(e))
-    finally:
-        yield
+        except Exception as e:
+            # abort the test session if warmup fails
+            pytest.exit(str(e))
+        finally:
+            yield


 @pytest.mark.parametrize("seq_len", [1, 9, 81, 729])
tests/test_batch_decode_kernels.py: 25 additions, 24 deletions
@@ -25,31 +25,32 @@
 def warmup_jit():
     if flashinfer.jit.has_prebuilt_ops:
         yield
-    try:
-        flashinfer.jit.parallel_load_modules(
-            jit_decode_attention_func_args(
-                [torch.float16],  # q_dtypes
-                [torch.float16, torch.float8_e4m3fn, torch.float8_e5m2],  # kv_dtypes
-                [128, 256],  # head_dims
-                [0, 1, 2],  # pos_encoding_modes
-                [False],  # use_sliding_windows
-                [False, True],  # use_logits_soft_caps
-            )
-            + jit_prefill_attention_func_args(
-                [torch.float16],  # q_dtypes
-                [torch.float16, torch.float8_e4m3fn, torch.float8_e5m2],  # kv_dtypes
-                [128, 256],  # head_dims
-                [0, 1, 2],  # pos_encoding_modes
-                [False],  # use_sliding_windows
-                [False, True],  # use_logits_soft_caps
-                [False],  # allow_fp16_qk_reductions
+    else:
+        try:
+            flashinfer.jit.parallel_load_modules(
+                jit_decode_attention_func_args(
+                    [torch.float16],  # q_dtypes
+                    [torch.float16, torch.float8_e4m3fn, torch.float8_e5m2],  # kv_dtypes
+                    [128, 256],  # head_dims
+                    [0, 1, 2],  # pos_encoding_modes
+                    [False],  # use_sliding_windows
+                    [False, True],  # use_logits_soft_caps
+                )
+                + jit_prefill_attention_func_args(
+                    [torch.float16],  # q_dtypes
+                    [torch.float16, torch.float8_e4m3fn, torch.float8_e5m2],  # kv_dtypes
+                    [128, 256],  # head_dims
+                    [0, 1, 2],  # pos_encoding_modes
+                    [False],  # use_sliding_windows
+                    [False, True],  # use_logits_soft_caps
+                    [False],  # allow_fp16_qk_reductions
+                )
             )
-        )
-    except Exception as e:
-        # abort the test session if warmup fails
-        pytest.exit(str(e))
-    finally:
-        yield
+        except Exception as e:
+            # abort the test session if warmup fails
+            pytest.exit(str(e))
+        finally:
+            yield


 @pytest.mark.parametrize("batch_size", [12, 17])
tests/test_batch_prefill_kernels.py: 17 additions, 16 deletions
@@ -25,23 +25,24 @@
 def warmup_jit():
     if flashinfer.jit.has_prebuilt_ops:
         yield
-    try:
-        flashinfer.jit.parallel_load_modules(
-            jit_prefill_attention_func_args(
-                [torch.float16],  # q_dtypes
-                [torch.float16, torch.float8_e4m3fn, torch.float8_e5m2],  # kv_dtypes
-                [128, 256],  # head_dims
-                [0, 1, 2],  # pos_encoding_modes
-                [False],  # use_sliding_windows
-                [False, True],  # use_logits_soft_caps
-                [False],  # allow_fp16_qk_reductions
+    else:
+        try:
+            flashinfer.jit.parallel_load_modules(
+                jit_prefill_attention_func_args(
+                    [torch.float16],  # q_dtypes
+                    [torch.float16, torch.float8_e4m3fn, torch.float8_e5m2],  # kv_dtypes
+                    [128, 256],  # head_dims
+                    [0, 1, 2],  # pos_encoding_modes
+                    [False],  # use_sliding_windows
+                    [False, True],  # use_logits_soft_caps
+                    [False],  # allow_fp16_qk_reductions
+                )
             )
-        )
-    except Exception as e:
-        # abort the test session if warmup fails
-        pytest.exit(str(e))
-    finally:
-        yield
+        except Exception as e:
+            # abort the test session if warmup fails
+            pytest.exit(str(e))
+        finally:
+            yield


 @pytest.mark.parametrize("batch_size", [12, 17])
tests/test_block_sparse.py: 25 additions, 24 deletions
@@ -27,31 +27,32 @@
 def warmup_jit():
     if flashinfer.jit.has_prebuilt_ops:
         yield
-    try:
-        flashinfer.jit.parallel_load_modules(
-            jit_decode_attention_func_args(
-                [torch.float16],  # q_dtypes
-                [torch.float16],  # kv_dtypes
-                [128, 256],  # head_dims
-                [0],  # pos_encoding_modes
-                [False],  # use_sliding_windows
-                [False],  # use_logits_soft_caps
-            )
-            + jit_prefill_attention_func_args(
-                [torch.float16],  # q_dtypes
-                [torch.float16],  # kv_dtypes
-                [128, 256],  # head_dims
-                [0],  # pos_encoding_modes
-                [False],  # use_sliding_windows
-                [False],  # use_logits_soft_caps
-                [False],  # allow_fp16_qk_reductions
+    else:
+        try:
+            flashinfer.jit.parallel_load_modules(
+                jit_decode_attention_func_args(
+                    [torch.float16],  # q_dtypes
+                    [torch.float16],  # kv_dtypes
+                    [128, 256],  # head_dims
+                    [0],  # pos_encoding_modes
+                    [False],  # use_sliding_windows
+                    [False],  # use_logits_soft_caps
+                )
+                + jit_prefill_attention_func_args(
+                    [torch.float16],  # q_dtypes
+                    [torch.float16],  # kv_dtypes
+                    [128, 256],  # head_dims
+                    [0],  # pos_encoding_modes
+                    [False],  # use_sliding_windows
+                    [False],  # use_logits_soft_caps
+                    [False],  # allow_fp16_qk_reductions
+                )
             )
-        )
-    except Exception as e:
-        # abort the test session if warmup fails
-        pytest.exit(str(e))
-    finally:
-        yield
+        except Exception as e:
+            # abort the test session if warmup fails
+            pytest.exit(str(e))
+        finally:
+            yield


 def bsr_attention_ref(
tests/test_logits_cap.py: 25 additions, 24 deletions
@@ -27,31 +27,32 @@
 def warmup_jit():
     if flashinfer.jit.has_prebuilt_ops:
         yield
-    try:
-        flashinfer.jit.parallel_load_modules(
-            jit_decode_attention_func_args(
-                [torch.float16],  # q_dtypes
-                [torch.float16],  # kv_dtypes
-                [128, 256],  # head_dims
-                [0],  # pos_encoding_modes
-                [False],  # use_sliding_windows
-                [False, True],  # use_logits_soft_caps
+    else:
+        try:
+            flashinfer.jit.parallel_load_modules(
+                jit_decode_attention_func_args(
+                    [torch.float16],  # q_dtypes
+                    [torch.float16],  # kv_dtypes
+                    [128, 256],  # head_dims
+                    [0],  # pos_encoding_modes
+                    [False],  # use_sliding_windows
+                    [False, True],  # use_logits_soft_caps
+                )
+                + jit_prefill_attention_func_args(
+                    [torch.float16],  # q_dtypes
+                    [torch.float16],  # kv_dtypes
+                    [128, 256],  # head_dims
+                    [0],  # pos_encoding_modes
+                    [False],  # use_sliding_windows
+                    [False, True],  # use_logits_soft_caps
+                    [False],  # allow_fp16_qk_reductions
+                )
             )
-            + jit_prefill_attention_func_args(
-                [torch.float16],  # q_dtypes
-                [torch.float16],  # kv_dtypes
-                [128, 256],  # head_dims
-                [0],  # pos_encoding_modes
-                [False],  # use_sliding_windows
-                [False, True],  # use_logits_soft_caps
-                [False],  # allow_fp16_qk_reductions
-            )
-        )
-    except Exception as e:
-        # abort the test session if warmup fails
-        pytest.exit(str(e))
-    finally:
-        yield
+        except Exception as e:
+            # abort the test session if warmup fails
+            pytest.exit(str(e))
+        finally:
+            yield


 def attention_logits_soft_cap_torch(q, k, v, soft_cap):
tests/test_non_contiguous_decode.py: 25 additions, 24 deletions
@@ -9,31 +9,32 @@
 def warmup_jit():
     if flashinfer.jit.has_prebuilt_ops:
         yield
-    try:
-        flashinfer.jit.parallel_load_modules(
-            jit_decode_attention_func_args(
-                [torch.float16],  # q_dtypes
-                [torch.float16],  # kv_dtypes
-                [64, 128, 256],  # head_dims
-                [0],  # pos_encoding_modes
-                [False],  # use_sliding_windows
-                [False],  # use_logits_soft_caps
+    else:
+        try:
+            flashinfer.jit.parallel_load_modules(
+                jit_decode_attention_func_args(
+                    [torch.float16],  # q_dtypes
+                    [torch.float16],  # kv_dtypes
+                    [64, 128, 256],  # head_dims
+                    [0],  # pos_encoding_modes
+                    [False],  # use_sliding_windows
+                    [False],  # use_logits_soft_caps
+                )
+                + jit_prefill_attention_func_args(
+                    [torch.float16],  # q_dtypes
+                    [torch.float16],  # kv_dtypes
+                    [64, 128, 256],  # head_dims
+                    [0],  # pos_encoding_modes
+                    [False],  # use_sliding_windows
+                    [False],  # use_logits_soft_caps
+                    [False],  # allow_fp16_qk_reductions
+                )
             )
-            + jit_prefill_attention_func_args(
-                [torch.float16],  # q_dtypes
-                [torch.float16],  # kv_dtypes
-                [64, 128, 256],  # head_dims
-                [0],  # pos_encoding_modes
-                [False],  # use_sliding_windows
-                [False],  # use_logits_soft_caps
-                [False],  # allow_fp16_qk_reductions
-            )
-        )
-    except Exception as e:
-        # abort the test session if warmup fails
-        pytest.exit(str(e))
-    finally:
-        yield
+        except Exception as e:
+            # abort the test session if warmup fails
+            pytest.exit(str(e))
+        finally:
+            yield


 @pytest.mark.parametrize("batch_size", [1, 19, 99])
tests/test_non_contiguous_prefill.py: 17 additions, 16 deletions
@@ -25,23 +25,24 @@
 def warmup_jit():
     if flashinfer.jit.has_prebuilt_ops:
         yield
-    try:
-        flashinfer.jit.parallel_load_modules(
-            jit_prefill_attention_func_args(
-                [torch.float16],  # q_dtypes
-                [torch.float16],  # kv_dtypes
-                [64, 128, 256],  # head_dims
-                [0],  # pos_encoding_modes
-                [False],  # use_sliding_windows
-                [False],  # use_logits_soft_caps
-                [False],  # allow_fp16_qk_reductions
+    else:
+        try:
+            flashinfer.jit.parallel_load_modules(
+                jit_prefill_attention_func_args(
+                    [torch.float16],  # q_dtypes
+                    [torch.float16],  # kv_dtypes
+                    [64, 128, 256],  # head_dims
+                    [0],  # pos_encoding_modes
+                    [False],  # use_sliding_windows
+                    [False],  # use_logits_soft_caps
+                    [False],  # allow_fp16_qk_reductions
+                )
             )
-        )
-    except Exception as e:
-        # abort the test session if warmup fails
-        pytest.exit(str(e))
-    finally:
-        yield
+        except Exception as e:
+            # abort the test session if warmup fails
+            pytest.exit(str(e))
+        finally:
+            yield


 @pytest.mark.parametrize("seq_len", [1, 7, 127, 999, 3579])
