diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index f16ee0091..0210e1b55 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -59,28 +59,28 @@ jobs:
         run: |
           modal run dev.modal.unit_tests
 
-  convergence-tests:
-    runs-on: ubuntu-latest
-    needs: [checkstyle]
-
-    env:
-      MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
-      MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v3
-
-      - name: Set up Python
-        uses: actions/setup-python@v3
-        with:
-          python-version: '3.10'
-
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install modal
-
-      - name: Run convergence tests
-        run: |
-          modal run dev.modal.conv_tests
\ No newline at end of file
+  # convergence-tests:
+  #   runs-on: ubuntu-latest
+  #   needs: [checkstyle]
+
+  #   env:
+  #     MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
+  #     MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
+
+  #   steps:
+  #     - name: Checkout code
+  #       uses: actions/checkout@v3
+
+  #     - name: Set up Python
+  #       uses: actions/setup-python@v3
+  #       with:
+  #         python-version: '3.10'
+
+  #     - name: Install dependencies
+  #       run: |
+  #         python -m pip install --upgrade pip
+  #         pip install modal
+
+  #     - name: Run convergence tests
+  #       run: |
+  #         modal run dev.modal.conv_tests
\ No newline at end of file
diff --git a/dev/modal/unit_tests.py b/dev/modal/unit_tests.py
index dc3fb5369..9a2fef4e5 100644
--- a/dev/modal/unit_tests.py
+++ b/dev/modal/unit_tests.py
@@ -14,7 +14,7 @@
 repo = modal.Mount.from_local_dir(ROOT_PATH, remote_path="/root/liger-kernel")
 
 
-@app.function(gpu="A10G", mounts=[repo], timeout=60 * 20)
+@app.function(gpu="A10G", mounts=[repo], timeout=60 * 5)
 def liger_unit_test():
     import subprocess
 
diff --git a/test/transformers/test_cross_entropy.py b/test/transformers/test_cross_entropy.py
index 1a970573e..43a904a50 100644
--- a/test/transformers/test_cross_entropy.py
+++ b/test/transformers/test_cross_entropy.py
@@ -170,26 +170,14 @@ def _test_correctness_functional(B, T, V, scalar, dtype, atol, rtol):
 @pytest.mark.parametrize(
     "B, T, V",
     [
-        (2, 4096, 32000),  # llama2, mistral
-        (2, 4096, 32000),  # llama2, mistral
-        (1, 4096, 128256),  # llama3
-        # # weird shapes
-        (3, 423, 32000),
+        (2, 4096, 32000),  # llama
+        (3, 423, 32000),  # weird shapes
     ],
 )
 @pytest.mark.parametrize("reduction", ["sum", "mean"])
 @pytest.mark.parametrize(
     "scalar, dtype, atol, rtol",
     [
-        pytest.param(
-            0.1,
-            torch.bfloat16,
-            1e-8,
-            5e-2,
-            marks=pytest.mark.skipif(
-                not supports_bfloat16(), reason="bfloat16 not supported on this GPU"
-            ),
-        ),
         pytest.param(
             1.0,
             torch.bfloat16,
@@ -199,24 +187,9 @@ def _test_correctness_functional(B, T, V, scalar, dtype, atol, rtol):
                 not supports_bfloat16(), reason="bfloat16 not supported on this GPU"
             ),
         ),
-        pytest.param(
-            10.0,
-            torch.bfloat16,
-            1e-7,
-            5e-2,
-            marks=pytest.mark.skipif(
-                not supports_bfloat16(), reason="bfloat16 not supported on this GPU"
-            ),
-        ),
-        (0.1, torch.float32, 1e-8, 1e-6),
         (1.0, torch.float32, 1e-8, 1e-6),
-        (10.0, torch.float32, 1e-8, 1e-6),
     ],
 )
-@pytest.mark.skipif(
-    torch.cuda.get_device_properties(0).total_memory < 16 * 1000 * 1000 * 1000,
-    reason="Needs 16GB+ GPU memory.",
-)
 def test_correctness(B, T, V, scalar, dtype, reduction, atol, rtol):
     liger_ce = LigerCrossEntropyLoss(reduction=reduction)
     _test_correctness_once(liger_ce, B, T, V, reduction, scalar, dtype, atol, rtol)
@@ -233,12 +206,8 @@ def test_correctness(B, T, V, scalar, dtype, reduction, atol, rtol):
 @pytest.mark.parametrize(
     "scalar, dtype, atol, rtol",
     [
-        (0.1, torch.bfloat16, 1e-8, 5e-2),
         (1.0, torch.bfloat16, 1e-8, 5e-2),
-        (10.0, torch.bfloat16, 1e-7, 5e-2),
-        (0.1, torch.float32, 1e-8, 1e-6),
         (1.0, torch.float32, 1e-8, 1e-6),
-        (10.0, torch.float32, 1e-8, 1e-6),
     ],
 )
 def test_correctness_functional(B, T, V, scalar, dtype, atol, rtol):
@@ -248,9 +217,7 @@ def test_correctness_functional(B, T, V, scalar, dtype, atol, rtol):
 @pytest.mark.parametrize(
     "B, T, V, ignore_index",
     [
-        (2, 4096, 32000, -100),  # llama2, mistral
-        (2, 4096, 32000, 2),  # llama2, mistral
-        (1, 4096, 128256, -300),  # llama3
+        (2, 4096, 32000, 2),
         # weird shapes
         (3, 423, 32000, -123),
     ],
@@ -259,15 +226,6 @@ def test_correctness_functional(B, T, V, scalar, dtype, atol, rtol):
 @pytest.mark.parametrize(
     "scalar, dtype, atol, rtol",
     [
-        pytest.param(
-            0.1,
-            torch.bfloat16,
-            1e-8,
-            5e-2,
-            marks=pytest.mark.skipif(
-                not supports_bfloat16(), reason="bfloat16 not supported on this GPU"
-            ),
-        ),
         pytest.param(
             1.0,
             torch.bfloat16,
@@ -277,24 +235,9 @@ def test_correctness_functional(B, T, V, scalar, dtype, atol, rtol):
                 not supports_bfloat16(), reason="bfloat16 not supported on this GPU"
             ),
         ),
-        pytest.param(
-            10.0,
-            torch.bfloat16,
-            1e-8,
-            5e-2,
-            marks=pytest.mark.skipif(
-                not supports_bfloat16(), reason="bfloat16 not supported on this GPU"
-            ),
-        ),
-        (0.1, torch.float32, 1e-8, 1e-6),
         (1.0, torch.float32, 1e-8, 1e-6),
-        (10.0, torch.float32, 1e-8, 1e-6),
     ],
 )
-@pytest.mark.skipif(
-    torch.cuda.get_device_properties(0).total_memory < 16 * 1000 * 1000 * 1000,
-    reason="Needs 16GB+ GPU memory.",
-)
 def test_correctness_with_ignore_index(
     B, T, V, ignore_index, reduction, scalar, dtype, atol, rtol
 ):
@@ -307,9 +250,7 @@ def test_correctness_with_ignore_index(
 @pytest.mark.parametrize(
     "B, T, V, label_smoothing",
     [
-        (2, 4096, 32000, 0.1),  # llama2, mistral
-        (2, 4096, 32000, 0.1),  # llama2, mistral
-        (1, 4096, 128256, 0.1),  # llama3
+        (2, 4096, 32000, 0.1),
         # weird shapes
         (3, 423, 32000, 0.1),
     ],
@@ -317,15 +258,6 @@ def test_correctness_with_ignore_index(
 @pytest.mark.parametrize(
     "scalar, dtype, atol, rtol",
     [
-        pytest.param(
-            0.1,
-            torch.bfloat16,
-            1e-8,
-            5e-2,
-            marks=pytest.mark.skipif(
-                not supports_bfloat16(), reason="bfloat16 not supported on this GPU"
-            ),
-        ),
         pytest.param(
             1.0,
             torch.bfloat16,
@@ -335,24 +267,9 @@ def test_correctness_with_ignore_index(
                 not supports_bfloat16(), reason="bfloat16 not supported on this GPU"
             ),
         ),
-        pytest.param(
-            10.0,
-            torch.bfloat16,
-            1e-8,
-            5e-2,
-            marks=pytest.mark.skipif(
-                not supports_bfloat16(), reason="bfloat16 not supported on this GPU"
-            ),
-        ),
-        (0.1, torch.float32, 1e-8, 1e-6),
         (1.0, torch.float32, 1e-8, 1e-6),
-        (10.0, torch.float32, 1e-8, 1e-6),
     ],
 )
-@pytest.mark.skipif(
-    torch.cuda.get_device_properties(0).total_memory < 16 * 1000 * 1000 * 1000,
-    reason="Needs 16GB+ GPU memory.",
-)
 def test_correctness_with_label_smoothing_once(
     B, T, V, label_smoothing, scalar, dtype, atol, rtol
 ):
@@ -365,9 +282,7 @@ def test_correctness_with_label_smoothing_once(
 @pytest.mark.parametrize(
     "B, T, V, ignore_index, label_smoothing",
     [
-        (2, 4096, 32000, 1, 0.1),  # llama2, mistral
-        (2, 4096, 32000, -100, 0.2),  # llama2, mistral
-        (1, 4096, 128256, 2, 0.1),  # llama3
+        (2, 4096, 32000, 1, 0.1),
         # weird shapes
         (3, 423, 32000, -300, 0.2),
     ],
@@ -375,15 +290,6 @@ def test_correctness_with_label_smoothing_once(
 @pytest.mark.parametrize(
     "scalar, dtype, atol, rtol",
     [
-        pytest.param(
-            0.1,
-            torch.bfloat16,
-            1e-8,
-            5e-2,
-            marks=pytest.mark.skipif(
-                not supports_bfloat16(), reason="bfloat16 not supported on this GPU"
-            ),
-        ),
         pytest.param(
             1.0,
             torch.bfloat16,
@@ -393,24 +299,9 @@ def test_correctness_with_label_smoothing_once(
                 not supports_bfloat16(), reason="bfloat16 not supported on this GPU"
             ),
         ),
-        pytest.param(
-            10.0,
-            torch.bfloat16,
-            1e-6,
-            5e-2,
-            marks=pytest.mark.skipif(
-                not supports_bfloat16(), reason="bfloat16 not supported on this GPU"
-            ),
-        ),
-        (0.1, torch.float32, 1e-8, 1e-6),
         (1.0, torch.float32, 1e-8, 1e-6),
-        (10.0, torch.float32, 1e-8, 1e-6),
     ],
 )
-@pytest.mark.skipif(
-    torch.cuda.get_device_properties(0).total_memory < 16 * 1000 * 1000 * 1000,
-    reason="Needs 16GB+ GPU memory.",
-)
 def test_correctness_with_label_smoothing_with_ignore_index_once(
     B, T, V, ignore_index, label_smoothing, scalar, dtype, atol, rtol
 ):
@@ -427,8 +318,6 @@ def test_correctness_with_label_smoothing_with_ignore_index_once(
     "B, T, V",
     [
         (2, 4096, 32000),  # llama2, mistral
-        (2, 4096, 32000),  # llama2, mistral
-        (1, 4096, 128256),  # llama3
         # # weird shapes
         (3, 423, 32000),
     ],
@@ -449,52 +338,8 @@ def test_correctness_with_label_smoothing_with_ignore_index_once(
         (1.0, torch.float32, 1e-8, 1e-6),
     ],
 )
-@pytest.mark.skipif(
-    torch.cuda.get_device_properties(0).total_memory < 16 * 1000 * 1000 * 1000,
-    reason="Needs 16GB+ GPU memory.",
-)
 def test_correctness_not_last_layer(B, T, V, reduction, scalar, dtype, atol, rtol):
     liger_ce = LigerCrossEntropyLoss(reduction=reduction)
     _test_correctness_not_last_layer_once(
         liger_ce, B, T, V, reduction, scalar, dtype, atol, rtol
     )
-
-
-#############################################################################
-# Test full pass of the liger cross entropy loss to ensure it doesn't crash
-#############################################################################
-
-
-def _full_pass_once(B, T, V, reduction):
-
-    liger_ce = LigerCrossEntropyLoss(reduction=reduction)
-
-    _input = torch.randn(
-        B * T, V, requires_grad=True, device="cuda", dtype=torch.bfloat16
-    )
-    target = torch.randint(V, (B * T, 1), device="cuda").squeeze(1)
-
-    output = liger_ce(_input, target)
-    output.backward()
-
-
-@pytest.mark.parametrize(
-    "B, T, V",
-    [
-        (
-            8,
-            8192,
-            128256,
-        ),  # _input = 16GB, total = ~32GB, 8405385216 > 2,147,483,647, so we need int64
-        (8, 16384, 128256),  # _input = 32GB, total = ~64GB
-    ],
-)
-@pytest.mark.parametrize("reduction", ["sum", "mean"])
-@pytest.mark.skipif(
-    torch.cuda.get_device_properties(0).total_memory < 64 * 1000 * 1000 * 1000,
-    reason="Needs 64GB+ GPU memory.",
-)
-def test_large_no_exception(B, T, V, reduction):
-    # The large inputs were hitting cuda illegal memory access because of
-    # https://github.com/triton-lang/triton/issues/1058
-    _full_pass_once(B, T, V, reduction)
diff --git a/test/transformers/test_embedding.py b/test/transformers/test_embedding.py
index b192835e3..998a544c5 100644
--- a/test/transformers/test_embedding.py
+++ b/test/transformers/test_embedding.py
@@ -7,6 +7,7 @@
 SLEEP_SECONDS = 0.1
 
 
+@pytest.mark.skip(reason="LigerEmbedding is under experimentation")
 @pytest.mark.parametrize(
     "num_embeddings, embedding_dim, padding_idx",
     [
diff --git a/test/transformers/test_fused_linear_cross_entropy.py b/test/transformers/test_fused_linear_cross_entropy.py
index 1711e5ee6..c93488667 100644
--- a/test/transformers/test_fused_linear_cross_entropy.py
+++ b/test/transformers/test_fused_linear_cross_entropy.py
@@ -86,12 +86,8 @@ def forward(self, x, y):
 @pytest.mark.parametrize(
     "B, T, H, V",
     [
-        # (2, 4, 512, 512),  # The test does not work on some CI GPUs. Issue #160
-        (8, 2048, 4096, 32000),  # llama2, mistral
-        # Comment out to speed up testing
-        # (4, 2048, 4096, 128256),  # llama3 8B
-        # (4, 1024, 8192, 128256),  # llama3 70B
-        (4, 423, 8192, 32000),  # random shape
+        (8, 128, 1024, 4096),
+        (4, 47, 31, 123),  # random shape
     ],
 )
 @pytest.mark.parametrize(
@@ -233,12 +229,8 @@ def test_correctness_functional(B, T, H, V, scalar, dtype, bias, atol, rtol):
 @pytest.mark.parametrize(
     "B, T, H, V",
     [
-        (2, 4, 512, 512),  # The test does not work on some CI GPUs. Issue #160
-        (8, 2048, 4096, 32000),  # llama2, mistral
-        # Comment out to speed up testing
-        (4, 2048, 4096, 128256),  # llama3 8B
-        (4, 1024, 8192, 128256),  # llama3 70B
-        (4, 423, 8192, 32000),  # random shape
+        (8, 128, 1024, 4096),
+        (4, 47, 31, 123),  # random shape
     ],
 )
 @pytest.mark.parametrize(
diff --git a/test/transformers/test_fused_linear_jsd.py b/test/transformers/test_fused_linear_jsd.py
index cd6d24ef1..31a3ea103 100644
--- a/test/transformers/test_fused_linear_jsd.py
+++ b/test/transformers/test_fused_linear_jsd.py
@@ -89,11 +89,7 @@ def forward(self, student_input, teacher_input, label=None):
 @pytest.mark.parametrize(
     "B, T, H, V",
     [
-        (2, 2, 512, 1600),
-        (2, 4, 1024, 1600),
-        # Comment out to speed up testing
-        # (4, 2048, 4096, 128256),  # llama3 8B
-        # (4, 1024, 8192, 128256),  # llama3 70B
+        (8, 128, 1024, 4096),
         (4, 423, 167, 1423),  # random shape
     ],
 )
@@ -166,12 +162,8 @@ def test_correctness(B, T, H, V, scalar, dtype, beta, temperature, atol, rtol):
 @pytest.mark.parametrize(
     "B, T, H, V",
     [
-        (2, 4, 2048, 3200),
-        (2, 2048, 4096, 32000),  # llama2, mistral
-        # Comment out to speed up testing
-        # (4, 2048, 4096, 128256),  # llama3 8B
-        # (4, 1024, 8192, 128256),  # llama3 70B
-        (4, 423, 8192, 32000),  # random shape
+        (8, 128, 1024, 4096),
+        (4, 423, 167, 1423),  # random shape
     ],
 )
 @pytest.mark.parametrize(
@@ -257,12 +249,9 @@ def test_correctness_with_ignore_index(
 @pytest.mark.parametrize(
     "B, T, H, V",
     [
-        (2, 4, 2048, 3200),
-        (2, 2048, 4096, 32000),  # llama2, mistral
-        # Comment out to speed up testing
-        # (4, 2048, 4096, 128256),  # llama3 8B
-        # (4, 1024, 8192, 128256),  # llama3 70B
-        (4, 423, 8192, 32000),  # random shape
+        (2, 2, 8, 8),
+        # weird shapes
+        (9, 7, 41, 41),
     ],
 )
 @pytest.mark.parametrize(
@@ -336,7 +325,8 @@ def test_correctness_functional(
 @pytest.mark.parametrize(
     "B, T, H, V",
     [
-        (2, 4, 2048, 3200),
+        (8, 128, 1024, 4096),
+        (4, 423, 167, 1423),  # random shape
     ],
 )
 @pytest.mark.parametrize(
diff --git a/test/transformers/test_geglu.py b/test/transformers/test_geglu.py
index 4fa744656..cf7c5a3c5 100644
--- a/test/transformers/test_geglu.py
+++ b/test/transformers/test_geglu.py
@@ -20,11 +20,9 @@
 @pytest.mark.parametrize(
     "bsz, seq_len, hidden_size, intermediate_size",
     [
-        (2, 2048, 4096, 11008),
         (2, 2048, 2048, 4096),
         # weird shapes
         (9, 41, 341, 4231),
-        (6, 42, 256, 2048),
     ],
 )
 @pytest.mark.parametrize(
diff --git a/test/transformers/test_jsd.py b/test/transformers/test_jsd.py
index 220e87271..388b3a5c3 100644
--- a/test/transformers/test_jsd.py
+++ b/test/transformers/test_jsd.py
@@ -52,21 +52,9 @@ def forward(
 
 _SHAPE_PARAMS = (
     "B, T, V",
     [
-        (2, 1024, 3200),
         (2, 1024, 3200),
         # weird shape
         (41, 401, 1271),
-        pytest.param(
-            1,
-            4096,
-            128256,
-            marks=pytest.mark.skipif(
-                torch.cuda.get_device_properties(0).total_memory
-                < 36 * 1000 * 1000 * 1000,
-                reason="This test requires a GPU with at least 36GB of memory",
-            ),
-        ),
-        (3, 423, 1600),
     ],
 )
diff --git a/test/transformers/test_kl_div.py b/test/transformers/test_kl_div.py
index a624d5f0c..5cc3eba6a 100644
--- a/test/transformers/test_kl_div.py
+++ b/test/transformers/test_kl_div.py
@@ -10,20 +10,8 @@
     "B, T, V",
     [
         (1, 4096, 32000),
-        (32, 4096, 1024),
         # weird shape
         (41, 401, 1271),
-        pytest.param(
-            1,
-            4096,
-            128256,
-            marks=pytest.mark.skipif(
-                torch.cuda.get_device_properties(0).total_memory
-                < 36 * 1000 * 1000 * 1000,
-                reason="This test requires a GPU with at least 36GB of memory",
-            ),
-        ),
-        (3, 423, 32000),
     ],
 )
diff --git a/test/transformers/test_layer_norm.py b/test/transformers/test_layer_norm.py
index ae2412c72..69aa1b252 100644
--- a/test/transformers/test_layer_norm.py
+++ b/test/transformers/test_layer_norm.py
@@ -7,20 +7,10 @@
 
 
 @pytest.mark.parametrize(
-    "hidden_size",
+    "batch_size, seq_len, hidden_size",
     [
-        64,
-        128,
-        256,
-        512,
-    ],
-)
-@pytest.mark.parametrize(
-    "batch_size, seq_len",
-    [
-        (2, 8),
-        (4, 16),
-        (8, 32),
+        (2, 8, 64),
+        (4, 16, 128),
     ],
 )
 @pytest.mark.parametrize(
@@ -59,14 +49,10 @@ def test_liger_layer_norm(batch_size, seq_len, hidden_size, dtype, atol, rtol):
 
 
 @pytest.mark.parametrize(
-    "hidden_size",
-    [8, 41],
-)
-@pytest.mark.parametrize(
-    "batch_size, seq_len",
+    "batch_size, seq_len, hidden_size",
     [
-        (2, 2),
-        (9, 7),
+        (2, 8, 64),
+        (4, 16, 128),
     ],
 )
 @pytest.mark.parametrize(
diff --git a/test/transformers/test_mm_int8int2.py b/test/transformers/test_mm_int8int2.py
index d9de0780e..d7d13a958 100644
--- a/test/transformers/test_mm_int8int2.py
+++ b/test/transformers/test_mm_int8int2.py
@@ -9,6 +9,7 @@
 # input_features = size*4 when the weight matrix is unpacked
 
 
+@pytest.mark.skip(reason="mm_int8int2 is under experimentation")
 @pytest.mark.parametrize(
     "size",
     [
@@ -73,6 +74,7 @@ def test_kernel_correctness(
     ), "Results differ"
 
 
+@pytest.mark.skip(reason="mm_int8int2 is under experimentation")
 @pytest.mark.parametrize(
     "size",
     [
diff --git a/test/transformers/test_rms_norm.py b/test/transformers/test_rms_norm.py
index 9578fb937..1dd2299b8 100644
--- a/test/transformers/test_rms_norm.py
+++ b/test/transformers/test_rms_norm.py
@@ -74,14 +74,8 @@ def forward(self, x):
     "bs, sl, hd",
     [
         (2, 128, 512),
-        (4, 256, 1024),
-        (8, 512, 2048),
-        (8, 1024, 4096),
-        # # # weird shapes
-        (3, 423, 213),
+        # weird shapes
         (5, 123, 123),
-        (7, 341, 234),
-        (9, 236, 345),
     ],
 )
 @pytest.mark.parametrize(
@@ -96,7 +90,6 @@ def forward(self, x):
                 not supports_bfloat16(), reason="bfloat16 not supported on this GPU"
             ),
         ),
-        (torch.float16, 2e-1, 2e-2),
     ],
 )
 @pytest.mark.parametrize(
@@ -108,9 +101,6 @@ def forward(self, x):
     ],
 )
 def test_correctness(bs, sl, hd, dtype, atol, rtol, reference, offset, casting_mode):
-    if reference == BaseRMSNorm and dtype == torch.bfloat16:
-        pytest.skip("bfloat16 has larger errors for BaseRMSNorm")
-
     _tensor = torch.randn(bs, sl, hd, device="cuda", dtype=dtype)
 
     h1 = _tensor.clone().requires_grad_(True)
@@ -146,7 +136,7 @@ def test_correctness(bs, sl, hd, dtype, atol, rtol, reference, offset, casting_m
     "bs, sl, hd",
     [
         (2, 2, 8),
-        # # weird shapes
+        # weird shapes
         (9, 7, 41),
     ],
 )
@@ -155,7 +145,6 @@ def test_correctness(bs, sl, hd, dtype, atol, rtol, reference, offset, casting_m
     [
         (torch.float32, 1e-4, 1e-6),
         (torch.bfloat16, 2e-1, 2e-2),
-        (torch.float16, 2e-1, 2e-2),
     ],
 )
 @pytest.mark.parametrize(
diff --git a/test/transformers/test_swiglu.py b/test/transformers/test_swiglu.py
index ccb395c98..be7aaef42 100644
--- a/test/transformers/test_swiglu.py
+++ b/test/transformers/test_swiglu.py
@@ -27,11 +27,9 @@
 @pytest.mark.parametrize(
     "bsz, seq_len, hidden_size, intermediate_size",
     [
-        (2, 2048, 4096, 11008),
-        (2, 2048, 2048, 4096),
+        (2, 256, 256, 512),
         # weird shapes
-        (9, 41, 341, 4231),
-        (6, 42, 256, 2048),
+        (6, 42, 123, 431),
     ],
 )
 @pytest.mark.parametrize(
@@ -109,11 +107,9 @@ def test_correctness_llamamlp(
 @pytest.mark.parametrize(
     "bsz, seq_len, hidden_size, intermediate_size",
     [
-        (2, 2048, 4096, 11008),
-        (2, 2048, 2048, 4096),
+        (2, 256, 256, 512),
         # weird shapes
-        (9, 41, 341, 4231),
-        (6, 42, 256, 2048),
+        (6, 42, 123, 431),
     ],
 )
 @pytest.mark.parametrize(