From 189c411e0f3d1a92f4bfee865560e33887b91a9f Mon Sep 17 00:00:00 2001 From: saienduri <77521230+saienduri@users.noreply.github.com> Date: Sat, 7 Dec 2024 11:52:27 -0800 Subject: [PATCH] Switch amd-ci to use MI300X runner. (#428) This commit switches the amd-ci workflow to use the MI300X GPU provided by AMD for test coverage. --------- Co-authored-by: TJian Co-authored-by: tjtanaa --- .github/workflows/amd-ci.yml | 75 +++++++++++++++++-------- test/transformers/test_cross_entropy.py | 5 +- test/transformers/test_rms_norm.py | 1 + 3 files changed, 55 insertions(+), 26 deletions(-) diff --git a/.github/workflows/amd-ci.yml b/.github/workflows/amd-ci.yml index 41be18346..4a74521d2 100644 --- a/.github/workflows/amd-ci.yml +++ b/.github/workflows/amd-ci.yml @@ -1,23 +1,23 @@ name: GitHub Actions CI (AMD) -# on: -# push: -# branches: -# - main -# paths: -# - "src/**" -# - "test/**" -# pull_request: -# branches: -# - main -# paths: -# - "src/**" -# - "test/**" +on: + push: + branches: + - main + paths: + - "src/**" + - "test/**" + pull_request: + branches: + - main + # paths: + # - "src/**" + # - "test/**" -# concurrency: -# # This causes it to cancel previous in-progress actions on the same PR / branch, -# group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} -# cancel-in-progress: true +concurrency: + # This causes it to cancel previous in-progress actions on the same PR / branch, + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: checkstyle: @@ -36,12 +36,11 @@ jobs: run: | python -m pip install --upgrade pip pip install flake8 isort black - - name: Run checkstyle run: make checkstyle tests: - runs-on: ubuntu-latest + runs-on: linux-mi300-gpu-1 needs: [checkstyle] steps: @@ -53,12 +52,40 @@ jobs: with: python-version: '3.10' - - name: Install dependencies + - name: Check Docker Version + run: docker version + + - name: Check Ubuntu version + run: lsb_release -a + + - name: 
Check Hardware Specs + run: lscpu + + - name: ROCM-SMI Output run: | - python -m pip install --upgrade pip - pip install -e ".[dev]" + rocm-smi + rocm-smi --showproductname - - name: Run tests + - name: Setup Dependencies + run: | + cp -r /opt/rocm/share/amd_smi ./ + cd amd_smi + python -m pip install -e . + cd .. + python -m pip install pytest pytest-xdist pytest-rerunfailures pytest-flakefinder pytest-cpp + python -m pip uninstall -y torch torchvision + python -m pip install --pre \ + torch==2.6.0.dev20241113+rocm6.2 \ + 'setuptools-scm>=8' \ + torchvision==0.20.0.dev20241113+rocm6.2 \ + --extra-index-url https://download.pytorch.org/whl/nightly/rocm6.2 + python -m pip install triton==3.1.0 transformers==4.46.3 + python -m pip install -e .[dev] + + - name: List Python Environments + run: python -m pip list + + - name: Run Unit Tests run: | make test - make test-convergence \ No newline at end of file + make test-convergence diff --git a/test/transformers/test_cross_entropy.py b/test/transformers/test_cross_entropy.py index 791ce93b3..f2bf0d62f 100644 --- a/test/transformers/test_cross_entropy.py +++ b/test/transformers/test_cross_entropy.py @@ -12,6 +12,7 @@ from liger_kernel.transformers.cross_entropy import LigerCrossEntropyLoss from liger_kernel.transformers.functional import liger_cross_entropy from liger_kernel.utils import infer_device +from liger_kernel.ops.utils import is_hip device = infer_device() set_seed(42) @@ -763,7 +764,7 @@ def test_float32_internal(): RETURN_Z_LOSS=0, # False HAS_SOFTCAPPING=False, BLOCK_SIZE=BLOCK_SIZE, - num_warps=32, + num_warps=32 if not is_hip() else 16, ) # Run kernel for float32 @@ -787,7 +788,7 @@ def test_float32_internal(): RETURN_Z_LOSS=0, # False HAS_SOFTCAPPING=False, BLOCK_SIZE=BLOCK_SIZE, - num_warps=32, + num_warps=32 if not is_hip() else 16, ) torch.allclose(X_bf16, X_fp32.bfloat16()) diff --git a/test/transformers/test_rms_norm.py b/test/transformers/test_rms_norm.py index dc0c78643..5831b1ec2 100644 --- 
a/test/transformers/test_rms_norm.py +++ b/test/transformers/test_rms_norm.py @@ -74,6 +74,7 @@ def forward(self, x): return output.type_as(x) +@pytest.mark.flaky(reruns=3, reruns_delay=2) @pytest.mark.parametrize( "bs, sl, hd", [