diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index f16ee0091..0210e1b55 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -59,28 +59,28 @@ jobs:
         run: |
           modal run dev.modal.unit_tests
 
-  convergence-tests:
-    runs-on: ubuntu-latest
-    needs: [checkstyle]
-
-    env:
-      MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
-      MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v3
-
-      - name: Set up Python
-        uses: actions/setup-python@v3
-        with:
-          python-version: '3.10'
-
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install modal
-
-      - name: Run convergence tests
-        run: |
-          modal run dev.modal.conv_tests
\ No newline at end of file
+  # convergence-tests:
+  #   runs-on: ubuntu-latest
+  #   needs: [checkstyle]
+
+  #   env:
+  #     MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
+  #     MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
+
+  #   steps:
+  #     - name: Checkout code
+  #       uses: actions/checkout@v3
+
+  #     - name: Set up Python
+  #       uses: actions/setup-python@v3
+  #       with:
+  #         python-version: '3.10'
+
+  #     - name: Install dependencies
+  #       run: |
+  #         python -m pip install --upgrade pip
+  #         pip install modal
+
+  #     - name: Run convergence tests
+  #       run: |
+  #         modal run dev.modal.conv_tests
\ No newline at end of file
diff --git a/dev/modal/unit_tests.py b/dev/modal/unit_tests.py
index dc3fb5369..9a2fef4e5 100644
--- a/dev/modal/unit_tests.py
+++ b/dev/modal/unit_tests.py
@@ -14,7 +14,7 @@
 repo = modal.Mount.from_local_dir(ROOT_PATH, remote_path="/root/liger-kernel")
 
 
-@app.function(gpu="A10G", mounts=[repo], timeout=60 * 20)
+@app.function(gpu="A10G", mounts=[repo], timeout=60 * 5)
 def liger_unit_test():
     import subprocess
 
diff --git a/test/transformers/test_cross_entropy.py b/test/transformers/test_cross_entropy.py
index 1a970573e..43a904a50 100644
--- a/test/transformers/test_cross_entropy.py
+++ b/test/transformers/test_cross_entropy.py
@@ -170,26 +170,14 @@ def _test_correctness_functional(B, T, V, scalar, dtype, atol, rtol):
 @pytest.mark.parametrize(
     "B, T, V",
     [
-        (2, 4096, 32000),  # llama2, mistral
-        (2, 4096, 32000),  # llama2, mistral
-        (1, 4096, 128256),  # llama3
-        # # weird shapes
-        (3, 423, 32000),
+        (2, 4096, 32000),  # llama
+        (3, 423, 32000),  # weird shapes
     ],
 )
 @pytest.mark.parametrize("reduction", ["sum", "mean"])
 @pytest.mark.parametrize(
     "scalar, dtype, atol, rtol",
     [
-        pytest.param(
-            0.1,
-            torch.bfloat16,
-            1e-8,
-            5e-2,
-            marks=pytest.mark.skipif(
-                not supports_bfloat16(), reason="bfloat16 not supported on this GPU"
-            ),
-        ),
         pytest.param(
             1.0,
             torch.bfloat16,
@@ -199,24 +187,9 @@ def _test_correctness_functional(B, T, V, scalar, dtype, atol, rtol):
                 not supports_bfloat16(), reason="bfloat16 not supported on this GPU"
             ),
         ),
-        pytest.param(
-            10.0,
-            torch.bfloat16,
-            1e-7,
-            5e-2,
-            marks=pytest.mark.skipif(
-                not supports_bfloat16(), reason="bfloat16 not supported on this GPU"
-            ),
-        ),
-        (0.1, torch.float32, 1e-8, 1e-6),
         (1.0, torch.float32, 1e-8, 1e-6),
-        (10.0, torch.float32, 1e-8, 1e-6),
     ],
 )
-@pytest.mark.skipif(
-    torch.cuda.get_device_properties(0).total_memory < 16 * 1000 * 1000 * 1000,
-    reason="Needs 16GB+ GPU memory.",
-)
 def test_correctness(B, T, V, scalar, dtype, reduction, atol, rtol):
     liger_ce = LigerCrossEntropyLoss(reduction=reduction)
     _test_correctness_once(liger_ce, B, T, V, reduction, scalar, dtype, atol, rtol)
@@ -233,12 +206,8 @@ def test_correctness(B, T, V, scalar, dtype, reduction, atol, rtol):
 @pytest.mark.parametrize(
     "scalar, dtype, atol, rtol",
     [
-        (0.1, torch.bfloat16, 1e-8, 5e-2),
         (1.0, torch.bfloat16, 1e-8, 5e-2),
-        (10.0, torch.bfloat16, 1e-7, 5e-2),
-        (0.1, torch.float32, 1e-8, 1e-6),
         (1.0, torch.float32, 1e-8, 1e-6),
-        (10.0, torch.float32, 1e-8, 1e-6),
     ],
 )
 def test_correctness_functional(B, T, V, scalar, dtype, atol, rtol):
@@ -248,9 +217,7 @@ def test_correctness_functional(B, T, V, scalar, dtype, atol, rtol):
 @pytest.mark.parametrize(
     "B, T, V, ignore_index",
     [
-        (2, 4096, 32000, -100),  # llama2, mistral
-        (2, 4096, 32000, 2),  # llama2, mistral
-        (1, 4096, 128256, -300),  # llama3
+        (2, 4096, 32000, 2),
         # weird shapes
         (3, 423, 32000, -123),
     ],
@@ -259,15 +226,6 @@ def test_correctness_functional(B, T, V, scalar, dtype, atol, rtol):
 @pytest.mark.parametrize(
     "scalar, dtype, atol, rtol",
     [
-        pytest.param(
-            0.1,
-            torch.bfloat16,
-            1e-8,
-            5e-2,
-            marks=pytest.mark.skipif(
-                not supports_bfloat16(), reason="bfloat16 not supported on this GPU"
-            ),
-        ),
         pytest.param(
             1.0,
             torch.bfloat16,
@@ -277,24 +235,9 @@ def test_correctness_functional(B, T, V, scalar, dtype, atol, rtol):
                 not supports_bfloat16(), reason="bfloat16 not supported on this GPU"
             ),
         ),
-        pytest.param(
-            10.0,
-            torch.bfloat16,
-            1e-8,
-            5e-2,
-            marks=pytest.mark.skipif(
-                not supports_bfloat16(), reason="bfloat16 not supported on this GPU"
-            ),
-        ),
-        (0.1, torch.float32, 1e-8, 1e-6),
         (1.0, torch.float32, 1e-8, 1e-6),
-        (10.0, torch.float32, 1e-8, 1e-6),
     ],
 )
-@pytest.mark.skipif(
-    torch.cuda.get_device_properties(0).total_memory < 16 * 1000 * 1000 * 1000,
-    reason="Needs 16GB+ GPU memory.",
-)
 def test_correctness_with_ignore_index(
     B, T, V, ignore_index, reduction, scalar, dtype, atol, rtol
 ):
@@ -307,9 +250,7 @@ def test_correctness_with_ignore_index(
 @pytest.mark.parametrize(
     "B, T, V, label_smoothing",
     [
-        (2, 4096, 32000, 0.1),  # llama2, mistral
-        (2, 4096, 32000, 0.1),  # llama2, mistral
-        (1, 4096, 128256, 0.1),  # llama3
+        (2, 4096, 32000, 0.1),
         # weird shapes
         (3, 423, 32000, 0.1),
     ],
@@ -317,15 +258,6 @@ def test_correctness_with_ignore_index(
 @pytest.mark.parametrize(
     "scalar, dtype, atol, rtol",
     [
-        pytest.param(
-            0.1,
-            torch.bfloat16,
-            1e-8,
-            5e-2,
-            marks=pytest.mark.skipif(
-                not supports_bfloat16(), reason="bfloat16 not supported on this GPU"
-            ),
-        ),
         pytest.param(
             1.0,
             torch.bfloat16,
@@ -335,24 +267,9 @@ def test_correctness_with_ignore_index(
                 not supports_bfloat16(), reason="bfloat16 not supported on this GPU"
             ),
         ),
-        pytest.param(
-            10.0,
-            torch.bfloat16,
-            1e-8,
-            5e-2,
-            marks=pytest.mark.skipif(
-                not supports_bfloat16(), reason="bfloat16 not supported on this GPU"
-            ),
-        ),
-        (0.1, torch.float32, 1e-8, 1e-6),
         (1.0, torch.float32, 1e-8, 1e-6),
-        (10.0, torch.float32, 1e-8, 1e-6),
     ],
 )
-@pytest.mark.skipif(
-    torch.cuda.get_device_properties(0).total_memory < 16 * 1000 * 1000 * 1000,
-    reason="Needs 16GB+ GPU memory.",
-)
 def test_correctness_with_label_smoothing_once(
     B, T, V, label_smoothing, scalar, dtype, atol, rtol
 ):
@@ -365,9 +282,7 @@ def test_correctness_with_label_smoothing_once(
 @pytest.mark.parametrize(
     "B, T, V, ignore_index, label_smoothing",
     [
-        (2, 4096, 32000, 1, 0.1),  # llama2, mistral
-        (2, 4096, 32000, -100, 0.2),  # llama2, mistral
-        (1, 4096, 128256, 2, 0.1),  # llama3
+        (2, 4096, 32000, 1, 0.1),
         # weird shapes
         (3, 423, 32000, -300, 0.2),
     ],
@@ -375,15 +290,6 @@ def test_correctness_with_label_smoothing_once(
 @pytest.mark.parametrize(
     "scalar, dtype, atol, rtol",
     [
-        pytest.param(
-            0.1,
-            torch.bfloat16,
-            1e-8,
-            5e-2,
-            marks=pytest.mark.skipif(
-                not supports_bfloat16(), reason="bfloat16 not supported on this GPU"
-            ),
-        ),
         pytest.param(
             1.0,
             torch.bfloat16,
@@ -393,24 +299,9 @@ def test_correctness_with_label_smoothing_once(
                 not supports_bfloat16(), reason="bfloat16 not supported on this GPU"
             ),
         ),
-        pytest.param(
-            10.0,
-            torch.bfloat16,
-            1e-6,
-            5e-2,
-            marks=pytest.mark.skipif(
-                not supports_bfloat16(), reason="bfloat16 not supported on this GPU"
-            ),
-        ),
-        (0.1, torch.float32, 1e-8, 1e-6),
         (1.0, torch.float32, 1e-8, 1e-6),
-        (10.0, torch.float32, 1e-8, 1e-6),
     ],
 )
-@pytest.mark.skipif(
-    torch.cuda.get_device_properties(0).total_memory < 16 * 1000 * 1000 * 1000,
-    reason="Needs 16GB+ GPU memory.",
-)
 def test_correctness_with_label_smoothing_with_ignore_index_once(
     B, T, V, ignore_index, label_smoothing, scalar, dtype, atol, rtol
 ):
@@ -427,8 +318,6 @@ def test_correctness_with_label_smoothing_with_ignore_index_once(
     "B, T, V",
     [
         (2, 4096, 32000),  # llama2, mistral
-        (2, 4096, 32000),  # llama2, mistral
-        (1, 4096, 128256),  # llama3
         # # weird shapes
         (3, 423, 32000),
     ],
@@ -449,52 +338,8 @@ def test_correctness_with_label_smoothing_with_ignore_index_once(
         (1.0, torch.float32, 1e-8, 1e-6),
     ],
 )
-@pytest.mark.skipif(
-    torch.cuda.get_device_properties(0).total_memory < 16 * 1000 * 1000 * 1000,
-    reason="Needs 16GB+ GPU memory.",
-)
 def test_correctness_not_last_layer(B, T, V, reduction, scalar, dtype, atol, rtol):
     liger_ce = LigerCrossEntropyLoss(reduction=reduction)
     _test_correctness_not_last_layer_once(
         liger_ce, B, T, V, reduction, scalar, dtype, atol, rtol
     )
-
-
-#############################################################################
-# Test full pass of the liger cross entropy loss to ensure it doesn't crash
-#############################################################################
-
-
-def _full_pass_once(B, T, V, reduction):
-
-    liger_ce = LigerCrossEntropyLoss(reduction=reduction)
-
-    _input = torch.randn(
-        B * T, V, requires_grad=True, device="cuda", dtype=torch.bfloat16
-    )
-    target = torch.randint(V, (B * T, 1), device="cuda").squeeze(1)
-
-    output = liger_ce(_input, target)
-    output.backward()
-
-
-@pytest.mark.parametrize(
-    "B, T, V",
-    [
-        (
-            8,
-            8192,
-            128256,
-        ),  # _input = 16GB, total = ~32GB, 8405385216 > 2,147,483,647, so we need int64
-        (8, 16384, 128256),  # _input = 32GB, total = ~64GB
-    ],
-)
-@pytest.mark.parametrize("reduction", ["sum", "mean"])
-@pytest.mark.skipif(
-    torch.cuda.get_device_properties(0).total_memory < 64 * 1000 * 1000 * 1000,
-    reason="Needs 64GB+ GPU memory.",
-)
-def test_large_no_exception(B, T, V, reduction):
-    # The large inputs were hitting cuda illegal memory access because of
-    # https://github.com/triton-lang/triton/issues/1058
-    _full_pass_once(B, T, V, reduction)
diff --git a/test/transformers/test_embedding.py b/test/transformers/test_embedding.py
index b192835e3..998a544c5 100644
--- a/test/transformers/test_embedding.py
+++ b/test/transformers/test_embedding.py
@@ -7,6 +7,7 @@
 SLEEP_SECONDS = 0.1
 
 
+@pytest.mark.skip(reason="LigerEmbedding is under experimentation")
 @pytest.mark.parametrize(
     "num_embeddings, embedding_dim, padding_idx",
     [
diff --git a/test/transformers/test_fused_linear_cross_entropy.py b/test/transformers/test_fused_linear_cross_entropy.py
index 1711e5ee6..c93488667 100644
--- a/test/transformers/test_fused_linear_cross_entropy.py
+++ b/test/transformers/test_fused_linear_cross_entropy.py
@@ -86,12 +86,8 @@ def forward(self, x, y):
 @pytest.mark.parametrize(
     "B, T, H, V",
     [
-        # (2, 4, 512, 512),  # The test does not work on some CI GPUs. Issue #160
-        (8, 2048, 4096, 32000),  # llama2, mistral
-        # Comment out to speed up testing
-        # (4, 2048, 4096, 128256),  # llama3 8B
-        # (4, 1024, 8192, 128256),  # llama3 70B
-        (4, 423, 8192, 32000),  # random shape
+        (8, 128, 1024, 4096),
+        (4, 47, 31, 123),  # random shape
     ],
 )
 @pytest.mark.parametrize(
@@ -233,12 +229,8 @@ def test_correctness_functional(B, T, H, V, scalar, dtype, bias, atol, rtol):
 @pytest.mark.parametrize(
     "B, T, H, V",
     [
-        (2, 4, 512, 512),  # The test does not work on some CI GPUs. Issue #160
-        (8, 2048, 4096, 32000),  # llama2, mistral
-        # Comment out to speed up testing
-        (4, 2048, 4096, 128256),  # llama3 8B
-        (4, 1024, 8192, 128256),  # llama3 70B
-        (4, 423, 8192, 32000),  # random shape
+        (8, 128, 1024, 4096),
+        (4, 47, 31, 123),  # random shape
     ],
 )
 @pytest.mark.parametrize(
diff --git a/test/transformers/test_fused_linear_jsd.py b/test/transformers/test_fused_linear_jsd.py
index cd6d24ef1..31a3ea103 100644
--- a/test/transformers/test_fused_linear_jsd.py
+++ b/test/transformers/test_fused_linear_jsd.py
@@ -89,11 +89,7 @@ def forward(self, student_input, teacher_input, label=None):
 @pytest.mark.parametrize(
     "B, T, H, V",
     [
-        (2, 2, 512, 1600),
-        (2, 4, 1024, 1600),
-        # Comment out to speed up testing
-        # (4, 2048, 4096, 128256),  # llama3 8B
-        # (4, 1024, 8192, 128256),  # llama3 70B
+        (8, 128, 1024, 4096),
         (4, 423, 167, 1423),  # random shape
     ],
 )
@@ -166,12 +162,8 @@ def test_correctness(B, T, H, V, scalar, dtype, beta, temperature, atol, rtol):
 @pytest.mark.parametrize(
     "B, T, H, V",
     [
-        (2, 4, 2048, 3200),
-        (2, 2048, 4096, 32000),  # llama2, mistral
-        # Comment out to speed up testing
-        # (4, 2048, 4096, 128256),  # llama3 8B
-        # (4, 1024, 8192, 128256),  # llama3 70B
-        (4, 423, 8192, 32000),  # random shape
+        (8, 128, 1024, 4096),
+        (4, 423, 167, 1423),  # random shape
     ],
 )
 @pytest.mark.parametrize(
@@ -257,12 +249,9 @@ def test_correctness_with_ignore_index(
 @pytest.mark.parametrize(
     "B, T, H, V",
     [
-        (2, 4, 2048, 3200),
-        (2, 2048, 4096, 32000),  # llama2, mistral
-        # Comment out to speed up testing
-        # (4, 2048, 4096, 128256),  # llama3 8B
-        # (4, 1024, 8192, 128256),  # llama3 70B
-        (4, 423, 8192, 32000),  # random shape
+        (2, 2, 8, 8),
+        # weird shapes
+        (9, 7, 41, 41),
     ],
 )
 @pytest.mark.parametrize(
@@ -336,7 +325,8 @@ def test_correctness_functional(
 @pytest.mark.parametrize(
     "B, T, H, V",
     [
-        (2, 4, 2048, 3200),
+        (8, 128, 1024, 4096),
+        (4, 423, 167, 1423),  # random shape
     ],
 )
 @pytest.mark.parametrize(
diff --git a/test/transformers/test_geglu.py b/test/transformers/test_geglu.py
index 4fa744656..cf7c5a3c5 100644
--- a/test/transformers/test_geglu.py
+++ b/test/transformers/test_geglu.py
@@ -20,11 +20,9 @@
 @pytest.mark.parametrize(
     "bsz, seq_len, hidden_size, intermediate_size",
     [
-        (2, 2048, 4096, 11008),
         (2, 2048, 2048, 4096),
         # weird shapes
         (9, 41, 341, 4231),
-        (6, 42, 256, 2048),
     ],
 )
 @pytest.mark.parametrize(
diff --git a/test/transformers/test_jsd.py b/test/transformers/test_jsd.py
index 220e87271..388b3a5c3 100644
--- a/test/transformers/test_jsd.py
+++ b/test/transformers/test_jsd.py
@@ -52,21 +52,9 @@ def forward(
 
 _SHAPE_PARAMS = (
     "B, T, V",
     [
-        (2, 1024, 3200),
         (2, 1024, 3200),
         # weird shape
         (41, 401, 1271),
-        pytest.param(
-            1,
-            4096,
-            128256,
-            marks=pytest.mark.skipif(
-                torch.cuda.get_device_properties(0).total_memory
-                < 36 * 1000 * 1000 * 1000,
-                reason="This test requires a GPU with at least 36GB of memory",
-            ),
-        ),
-        (3, 423, 1600),
     ],
 )
diff --git a/test/transformers/test_kl_div.py b/test/transformers/test_kl_div.py
index a624d5f0c..5cc3eba6a 100644
--- a/test/transformers/test_kl_div.py
+++ b/test/transformers/test_kl_div.py
@@ -10,20 +10,8 @@
     "B, T, V",
     [
         (1, 4096, 32000),
-        (32, 4096, 1024),
         # weird shape
         (41, 401, 1271),
-        pytest.param(
-            1,
-            4096,
-            128256,
-            marks=pytest.mark.skipif(
-                torch.cuda.get_device_properties(0).total_memory
-                < 36 * 1000 * 1000 * 1000,
-                reason="This test requires a GPU with at least 36GB of memory",
-            ),
-        ),
-        (3, 423, 32000),
     ],
 )
diff --git a/test/transformers/test_layer_norm.py b/test/transformers/test_layer_norm.py
index ae2412c72..69aa1b252 100644
--- a/test/transformers/test_layer_norm.py
+++ b/test/transformers/test_layer_norm.py
@@ -7,20 +7,10 @@
 
 
 @pytest.mark.parametrize(
-    "hidden_size",
+    "batch_size, seq_len, hidden_size",
     [
-        64,
-        128,
-        256,
-        512,
-    ],
-)
-@pytest.mark.parametrize(
-    "batch_size, seq_len",
-    [
-        (2, 8),
-        (4, 16),
-        (8, 32),
+        (2, 8, 64),
+        (4, 16, 128),
     ],
 )
 @pytest.mark.parametrize(
@@ -59,14 +49,10 @@ def test_liger_layer_norm(batch_size, seq_len, hidden_size, dtype, atol, rtol):
 
 
 @pytest.mark.parametrize(
-    "hidden_size",
-    [8, 41],
-)
-@pytest.mark.parametrize(
-    "batch_size, seq_len",
+    "batch_size, seq_len, hidden_size",
     [
-        (2, 2),
-        (9, 7),
+        (2, 8, 64),
+        (4, 16, 128),
     ],
 )
 @pytest.mark.parametrize(
diff --git a/test/transformers/test_mm_int8int2.py b/test/transformers/test_mm_int8int2.py
index d9de0780e..d7d13a958 100644
--- a/test/transformers/test_mm_int8int2.py
+++ b/test/transformers/test_mm_int8int2.py
@@ -9,6 +9,7 @@
 # input_features = size*4 when the weight matrix is unpacked
 
 
+@pytest.mark.skip(reason="mm_int8int2 is under experimentation")
 @pytest.mark.parametrize(
     "size",
     [
@@ -73,6 +74,7 @@ def test_kernel_correctness(
     ), "Results differ"
 
 
+@pytest.mark.skip(reason="mm_int8int2 is under experimentation")
 @pytest.mark.parametrize(
     "size",
     [
diff --git a/test/transformers/test_rms_norm.py b/test/transformers/test_rms_norm.py
index 9578fb937..1dd2299b8 100644
--- a/test/transformers/test_rms_norm.py
+++ b/test/transformers/test_rms_norm.py
@@ -74,14 +74,8 @@ def forward(self, x):
     "bs, sl, hd",
     [
         (2, 128, 512),
-        (4, 256, 1024),
-        (8, 512, 2048),
-        (8, 1024, 4096),
-        # # # weird shapes
-        (3, 423, 213),
+        # weird shapes
         (5, 123, 123),
-        (7, 341, 234),
-        (9, 236, 345),
     ],
 )
 @pytest.mark.parametrize(
@@ -96,7 +90,6 @@ def forward(self, x):
                 not supports_bfloat16(), reason="bfloat16 not supported on this GPU"
             ),
         ),
-        (torch.float16, 2e-1, 2e-2),
     ],
 )
 @pytest.mark.parametrize(
@@ -108,9 +101,6 @@ def forward(self, x):
     ],
 )
 def test_correctness(bs, sl, hd, dtype, atol, rtol, reference, offset, casting_mode):
-    if reference == BaseRMSNorm and dtype == torch.bfloat16:
-        pytest.skip("bfloat16 has larger errors for BaseRMSNorm")
-
     _tensor = torch.randn(bs, sl, hd, device="cuda", dtype=dtype)
 
     h1 = _tensor.clone().requires_grad_(True)
@@ -146,7 +136,7 @@ def test_correctness(bs, sl, hd, dtype, atol, rtol, reference, offset, casting_m
     "bs, sl, hd",
     [
         (2, 2, 8),
-        # # weird shapes
+        # weird shapes
         (9, 7, 41),
     ],
 )
@@ -155,7 +145,6 @@ def test_correctness(bs, sl, hd, dtype, atol, rtol, reference, offset, casting_m
     [
         (torch.float32, 1e-4, 1e-6),
         (torch.bfloat16, 2e-1, 2e-2),
-        (torch.float16, 2e-1, 2e-2),
     ],
 )
 @pytest.mark.parametrize(
diff --git a/test/transformers/test_swiglu.py b/test/transformers/test_swiglu.py
index ccb395c98..be7aaef42 100644
--- a/test/transformers/test_swiglu.py
+++ b/test/transformers/test_swiglu.py
@@ -27,11 +27,9 @@
 @pytest.mark.parametrize(
     "bsz, seq_len, hidden_size, intermediate_size",
     [
-        (2, 2048, 4096, 11008),
-        (2, 2048, 2048, 4096),
+        (2, 256, 256, 512),
         # weird shapes
-        (9, 41, 341, 4231),
-        (6, 42, 256, 2048),
+        (6, 42, 123, 431),
     ],
 )
 @pytest.mark.parametrize(
@@ -109,11 +107,9 @@ def test_correctness_llamamlp(
 @pytest.mark.parametrize(
     "bsz, seq_len, hidden_size, intermediate_size",
     [
-        (2, 2048, 4096, 11008),
-        (2, 2048, 2048, 4096),
+        (2, 256, 256, 512),
         # weird shapes
-        (9, 41, 341, 4231),
-        (6, 42, 256, 2048),
+        (6, 42, 123, 431),
     ],
 )
 @pytest.mark.parametrize(