Skip to content

Commit

Permalink
Switch amd-ci to use MI300X runner. (#428)
Browse files Browse the repository at this point in the history
This commit switches the amd-ci workflow to run on the MI300X GPU runner
provided by AMD, extending test coverage to AMD hardware.

---------

Co-authored-by: TJian <[email protected]>
Co-authored-by: tjtanaa <[email protected]>
  • Loading branch information
3 people authored Dec 7, 2024
1 parent 7a71725 commit 189c411
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 26 deletions.
75 changes: 51 additions & 24 deletions .github/workflows/amd-ci.yml
Original file line number Diff line number Diff line change
@@ -1,23 +1,23 @@
name: GitHub Actions CI (AMD)

# on:
# push:
# branches:
# - main
# paths:
# - "src/**"
# - "test/**"
# pull_request:
# branches:
# - main
# paths:
# - "src/**"
# - "test/**"
on:
push:
branches:
- main
paths:
- "src/**"
- "test/**"
pull_request:
branches:
- main
# paths:
# - "src/**"
# - "test/**"

# concurrency:
# # This causes it to cancel previous in-progress actions on the same PR / branch,
# group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
# cancel-in-progress: true
concurrency:
# This causes it to cancel previous in-progress actions on the same PR / branch,
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true

jobs:
checkstyle:
Expand All @@ -36,12 +36,11 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install flake8 isort black
- name: Run checkstyle
run: make checkstyle

tests:
runs-on: ubuntu-latest
runs-on: linux-mi300-gpu-1
needs: [checkstyle]

steps:
Expand All @@ -53,12 +52,40 @@ jobs:
with:
python-version: '3.10'

- name: Install dependencies
- name: Check Docker Version
run: docker version

- name: Check Ubuntu version
run: lsb_release -a

- name: Check Hardware Specs
run: lscpu

- name: ROCM-SMI Output
run: |
python -m pip install --upgrade pip
pip install -e ".[dev]"
rocm-smi
rocm-smi --showproductname
- name: Run tests
- name: Setup Dependencies
run: |
cp -r /opt/rocm/share/amd_smi ./
cd amd_smi
python -m pip install -e .
cd ..
python -m pip install pytest pytest-xdist pytest-rerunfailures pytest-flakefinder pytest-cpp
python -m pip uninstall -y torch torchvision
python -m pip install --pre \
torch==2.6.0.dev20241113+rocm6.2 \
'setuptools-scm>=8' \
torchvision==0.20.0.dev20241113+rocm6.2 \
--extra-index-url https://download.pytorch.org/whl/nightly/rocm6.2
python -m pip install triton==3.1.0 transformers==4.46.3
python -m pip install -e .[dev]
- name: List Python Environments
run: python -m pip list

- name: Run Unit Tests
run: |
make test
make test-convergence
make test-convergence
5 changes: 3 additions & 2 deletions test/transformers/test_cross_entropy.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from liger_kernel.transformers.cross_entropy import LigerCrossEntropyLoss
from liger_kernel.transformers.functional import liger_cross_entropy
from liger_kernel.utils import infer_device
from liger_kernel.ops.utils import is_hip

device = infer_device()
set_seed(42)
Expand Down Expand Up @@ -763,7 +764,7 @@ def test_float32_internal():
RETURN_Z_LOSS=0, # False
HAS_SOFTCAPPING=False,
BLOCK_SIZE=BLOCK_SIZE,
num_warps=32,
num_warps=32 if not is_hip() else 16,
)

# Run kernel for float32
Expand All @@ -787,7 +788,7 @@ def test_float32_internal():
RETURN_Z_LOSS=0, # False
HAS_SOFTCAPPING=False,
BLOCK_SIZE=BLOCK_SIZE,
num_warps=32,
num_warps=32 if not is_hip() else 16,
)

torch.allclose(X_bf16, X_fp32.bfloat16())
Expand Down
1 change: 1 addition & 0 deletions test/transformers/test_rms_norm.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ def forward(self, x):
return output.type_as(x)


@pytest.mark.flaky(reruns=3, reruns_delay=2)
@pytest.mark.parametrize(
"bs, sl, hd",
[
Expand Down

0 comments on commit 189c411

Please sign in to comment.