# Support writing optimizer checkpoint only on rank0 and make UT pass on A100 #534
# Workflow file for this run
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# CI workflow: installs MSCCL and msamp, then runs lint and unit tests inside
# NVIDIA PyTorch containers, once per torch version listed in the matrix.
name: Unit Tests

# Triggered by pull requests whose base branch is main or matches release/*.
on:
  pull_request:
    branches:
      - main
      - release/*

jobs:
  unit-tests:
    name: Build and test with torch-${{ matrix.torch }}
    runs-on: [self-hosted, linux, x64, gpu]
    strategy:
      matrix:
        # Each entry pairs a torch label with the NGC PyTorch image tag (nvcr)
        # and a per-entry checkout directory (dir). Values stay quoted so
        # "2.1" and "1.14" are strings, not floats.
        include:
          # 1.14.0a0+410ce96
          - torch: "1.14"
            nvcr: 22.12-py3
            dir: torch1
          # 2.1.0a0+32f93b1
          - torch: "2.1"
            nvcr: 23.10-py3
            dir: torch2
    container:
      image: nvcr.io/nvidia/pytorch:${{ matrix.nvcr }}
      # Privileged container with host IPC and all GPUs exposed.
      options: --privileged --ipc=host --gpus=all
    steps:
      # Check out the repository (with submodules) into the matrix-specific
      # directory; presumably this keeps matrix jobs from colliding on a
      # shared self-hosted workspace - TODO confirm.
      # NOTE(review): actions/checkout@v2 targets a deprecated Node runtime;
      # consider a newer major once the runner version is confirmed.
      - name: Checkout msamp
        uses: actions/checkout@v2
        with:
          submodules: true
          path: ${{ matrix.dir }}
      # Build MSCCL from the third_party submodule for compute capabilities
      # 7.0, 8.0 and 9.0, then install it.
      - name: Install MSCCL
        run: |
          cd ${{ matrix.dir }}/third_party/msccl
          make -j src.build NVCC_GENCODE="\
            -gencode=arch=compute_70,code=sm_70 \
            -gencode=arch=compute_80,code=sm_80 \
            -gencode=arch=compute_90,code=sm_90"
          make install
      # Install pip/apt prerequisites and the package with its [test] extra.
      # DEBIAN_FRONTEND=noninteractive keeps apt-get from prompting.
      - name: Install dependencies
        run: |
          export LD_LIBRARY_PATH="/usr/local/lib:$LD_LIBRARY_PATH"
          export DEBIAN_FRONTEND=noninteractive
          python3 -m pip install --upgrade pip
          apt-get update && apt-get install -y python3-mpi4py
          cd ${{ matrix.dir }}/
          python3 -m pip install .[test]
          make postinstall
      - name: Run code lint
        run: |
          cd ${{ matrix.dir }}/
          python3 setup.py lint
      # Tests run with libmsamp_dist.so and libnccl.so from /usr/local/lib
      # preloaded ahead of any existing LD_PRELOAD entries.
      - name: Run unit tests
        run: |
          export LD_LIBRARY_PATH="/usr/local/lib:$LD_LIBRARY_PATH"
          export LD_PRELOAD="/usr/local/lib/libmsamp_dist.so:/usr/local/lib/libnccl.so:${LD_PRELOAD}"
          cd ${{ matrix.dir }}/
          python3 setup.py test
      # always() makes the cleanup run even when an earlier step failed, so
      # the checkout directory is removed from the runner either way.
      - name: Clean repository
        if: always()
        run: |
          rm -rf ${{ matrix.dir }}/
      # Coverage upload is currently disabled.
      # - name: Report coverage results
      #   run: |
      #     bash <(curl -s https://codecov.io/bash)