From 189c411e0f3d1a92f4bfee865560e33887b91a9f Mon Sep 17 00:00:00 2001
From: saienduri <77521230+saienduri@users.noreply.github.com>
Date: Sat, 7 Dec 2024 11:52:27 -0800
Subject: [PATCH] Switch amd-ci to use MI300X runner. (#428)

This commit switches the amd-ci workflow to run on the MI300X GPU runner
provided by AMD for test coverage.

---------

Co-authored-by: TJian <tunjian1996@gmail.com>
Co-authored-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 .github/workflows/amd-ci.yml            | 75 +++++++++++++++++--------
 test/transformers/test_cross_entropy.py |  5 +-
 test/transformers/test_rms_norm.py      |  1 +
 3 files changed, 55 insertions(+), 26 deletions(-)

diff --git a/.github/workflows/amd-ci.yml b/.github/workflows/amd-ci.yml
index 41be18346..4a74521d2 100644
--- a/.github/workflows/amd-ci.yml
+++ b/.github/workflows/amd-ci.yml
@@ -1,23 +1,23 @@
 name: GitHub Actions CI (AMD)
 
-# on:
-#   push:
-#     branches:
-#       - main
-#     paths:
-#       - "src/**"
-#       - "test/**"
-#   pull_request:
-#     branches:
-#       - main
-#     paths:
-#       - "src/**"
-#       - "test/**"
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - "src/**"
+      - "test/**"
+  pull_request:
+    branches:
+      - main
+    # paths:
+    #   - "src/**"
+    #   - "test/**"
 
-# concurrency:
-#   # This causes it to cancel previous in-progress actions on the same PR / branch,
-#   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
-#   cancel-in-progress: true
+concurrency:
+  # Cancel any previous in-progress run of this workflow on the same PR / branch.
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
 
 jobs:
   checkstyle:
@@ -36,12 +36,11 @@ jobs:
       run: |
         python -m pip install --upgrade pip
         pip install flake8 isort black
-
     - name: Run checkstyle
       run: make checkstyle
 
   tests:
-    runs-on: ubuntu-latest
+    runs-on: linux-mi300-gpu-1
     needs: [checkstyle]
 
     steps:
@@ -53,12 +52,40 @@ jobs:
       with:
         python-version: '3.10'
 
-    - name: Install dependencies
+    - name: Check Docker Version
+      run: docker version
+
+    - name: Check Ubuntu version
+      run: lsb_release -a
+
+    - name: Check Hardware Specs
+      run: lscpu
+    
+    - name: ROCM-SMI Output
       run: |
-        python -m pip install --upgrade pip
-        pip install -e ".[dev]"
+        rocm-smi
+        rocm-smi --showproductname
 
-    - name: Run tests
+    - name: Setup Dependencies
+      run: |
+        cp -r /opt/rocm/share/amd_smi ./
+        cd amd_smi
+        python -m pip install -e .
+        cd ..
+        python -m pip install pytest pytest-xdist pytest-rerunfailures pytest-flakefinder pytest-cpp
+        python -m pip uninstall -y torch torchvision
+        python -m pip install --pre \
+                torch==2.6.0.dev20241113+rocm6.2 \
+                'setuptools-scm>=8' \
+                torchvision==0.20.0.dev20241113+rocm6.2 \
+                --extra-index-url https://download.pytorch.org/whl/nightly/rocm6.2
+        python -m pip install triton==3.1.0 transformers==4.46.3
+        python -m pip install -e .[dev]
+    
+    - name: List Python Environments
+      run: python -m pip list
+    
+    - name: Run Unit Tests
       run: |
         make test
-        make test-convergence
\ No newline at end of file
+        make test-convergence
diff --git a/test/transformers/test_cross_entropy.py b/test/transformers/test_cross_entropy.py
index 791ce93b3..f2bf0d62f 100644
--- a/test/transformers/test_cross_entropy.py
+++ b/test/transformers/test_cross_entropy.py
@@ -12,6 +12,7 @@
 from liger_kernel.transformers.cross_entropy import LigerCrossEntropyLoss
 from liger_kernel.transformers.functional import liger_cross_entropy
 from liger_kernel.utils import infer_device
+from liger_kernel.ops.utils import is_hip
 
 device = infer_device()
 set_seed(42)
@@ -763,7 +764,7 @@ def test_float32_internal():
         RETURN_Z_LOSS=0,  # False
         HAS_SOFTCAPPING=False,
         BLOCK_SIZE=BLOCK_SIZE,
-        num_warps=32,
+        num_warps=32 if not is_hip() else 16,
     )
 
     # Run kernel for float32
@@ -787,7 +788,7 @@ def test_float32_internal():
         RETURN_Z_LOSS=0,  # False
         HAS_SOFTCAPPING=False,
         BLOCK_SIZE=BLOCK_SIZE,
-        num_warps=32,
+        num_warps=32 if not is_hip() else 16,
     )
 
     torch.allclose(X_bf16, X_fp32.bfloat16())
diff --git a/test/transformers/test_rms_norm.py b/test/transformers/test_rms_norm.py
index dc0c78643..5831b1ec2 100644
--- a/test/transformers/test_rms_norm.py
+++ b/test/transformers/test_rms_norm.py
@@ -74,6 +74,7 @@ def forward(self, x):
         return output.type_as(x)
 
 
+@pytest.mark.flaky(reruns=3, reruns_delay=2)
 @pytest.mark.parametrize(
     "bs, sl, hd",
     [