Switch amd-ci to use MI300X runner. (#428)

This commit switches the amd-ci workflow to use the MI300X GPU provided by AMD for test coverage.

---------

Co-authored-by: TJian <[email protected]>
Co-authored-by: tjtanaa <[email protected]>
linkedin · Dec 7, 2024 · 189c411 · 189c411
1 parent 7a71725
commit 189c411
Show file tree

Hide file tree

Showing 3 changed files with 55 additions and 26 deletions.
diff --git a/.github/workflows/amd-ci.yml b/.github/workflows/amd-ci.yml
@@ -1,23 +1,23 @@
 name: GitHub Actions CI (AMD)
 
-# on:
-#   push:
-#     branches:
-#       - main
-#     paths:
-#       - "src/**"
-#       - "test/**"
-#   pull_request:
-#     branches:
-#       - main
-#     paths:
-#       - "src/**"
-#       - "test/**"
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - "src/**"
+      - "test/**"
+  pull_request:
+    branches:
+      - main
+    # paths:
+    #   - "src/**"
+    #   - "test/**"
 
-# concurrency:
-#   # This causes it to cancel previous in-progress actions on the same PR / branch,
-#   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
-#   cancel-in-progress: true
+concurrency:
+  # This causes it to cancel previous in-progress actions on the same PR / branch,
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
 
 jobs:
   checkstyle:
@@ -36,12 +36,11 @@ jobs:
       run: |
         python -m pip install --upgrade pip
         pip install flake8 isort black
-
     - name: Run checkstyle
       run: make checkstyle
 
   tests:
-    runs-on: ubuntu-latest
+    runs-on: linux-mi300-gpu-1
     needs: [checkstyle]
 
     steps:
@@ -53,12 +52,40 @@ jobs:
       with:
         python-version: '3.10'
 
-    - name: Install dependencies
+    - name: Check Docker Version
+      run: docker version
+
+    - name: Check Ubuntu version
+      run: lsb_release -a
+
+    - name: Check Hardware Specs
+      run: lscpu
+
+    - name: ROCM-SMI Output
       run: |
-        python -m pip install --upgrade pip
-        pip install -e ".[dev]"
+        rocm-smi
+        rocm-smi --showproductname
 
-    - name: Run tests
+    - name: Setup Dependencies
+      run: |
+        cp -r /opt/rocm/share/amd_smi ./
+        cd amd_smi
+        python -m pip install -e .
+        cd ..
+        python -m pip install pytest pytest-xdist pytest-rerunfailures pytest-flakefinder pytest-cpp
+        python -m pip uninstall -y torch torchvision
+        python -m pip install --pre \
+                torch==2.6.0.dev20241113+rocm6.2 \
+                'setuptools-scm>=8' \
+                torchvision==0.20.0.dev20241113+rocm6.2 \
+                --extra-index-url https://download.pytorch.org/whl/nightly/rocm6.2
+        python -m pip install triton==3.1.0 transformers==4.46.3
+        python -m pip install -e .[dev]
+    
+    - name: List Python Environments
+      run: python -m pip list
+
+    - name: Run Unit Tests
       run: |
         make test
-        make test-convergence
+        make test-convergence
diff --git a/test/transformers/test_cross_entropy.py b/test/transformers/test_cross_entropy.py
@@ -12,6 +12,7 @@
 from liger_kernel.transformers.cross_entropy import LigerCrossEntropyLoss
 from liger_kernel.transformers.functional import liger_cross_entropy
 from liger_kernel.utils import infer_device
+from liger_kernel.ops.utils import is_hip
 
 device = infer_device()
 set_seed(42)
@@ -763,7 +764,7 @@ def test_float32_internal():
         RETURN_Z_LOSS=0,  # False
         HAS_SOFTCAPPING=False,
         BLOCK_SIZE=BLOCK_SIZE,
-        num_warps=32,
+        num_warps=32 if not is_hip() else 16,
     )
 
     # Run kernel for float32
@@ -787,7 +788,7 @@ def test_float32_internal():
         RETURN_Z_LOSS=0,  # False
         HAS_SOFTCAPPING=False,
         BLOCK_SIZE=BLOCK_SIZE,
-        num_warps=32,
+        num_warps=32 if not is_hip() else 16,
     )
 
     torch.allclose(X_bf16, X_fp32.bfloat16())

diff --git a/test/transformers/test_rms_norm.py b/test/transformers/test_rms_norm.py
@@ -74,6 +74,7 @@ def forward(self, x):
         return output.type_as(x)
 
 
+@pytest.mark.flaky(reruns=3, reruns_delay=2)
 @pytest.mark.parametrize(
     "bs, sl, hd",
     [