Skip to content

Support writing optimizer checkpoint only on rank0 and make UT pass on A100 #534

Support writing optimizer checkpoint only on rank0 and make UT pass on A100

Support writing optimizer checkpoint only on rank0 and make UT pass on A100 #534

Workflow file for this run

# Unit-test workflow: for each entry in the torch/container matrix, check out
# the repository, build and install MSCCL, install the package with its test
# extras, then run lint and the unit tests inside an NVIDIA PyTorch container.
name: Unit Tests

# Triggered by pull requests that target main or any release/* branch.
on:
  pull_request:
    branches:
      - main
      - release/*

jobs:
  unit-tests:
    name: Build and test with torch-${{ matrix.torch }}
    runs-on: [self-hosted, linux, x64, gpu]
    strategy:
      matrix:
        # Each entry pairs a torch label with the container tag to run in and
        # the directory the repository is checked out into.
        include:
          # 1.14.0a0+410ce96
          - torch: "1.14"
            nvcr: 22.12-py3
            dir: torch1
          # 2.1.0a0+32f93b1
          - torch: "2.1"
            nvcr: 23.10-py3
            dir: torch2
    container:
      image: nvcr.io/nvidia/pytorch:${{ matrix.nvcr }}
      options: --privileged --ipc=host --gpus=all
    steps:
      # NOTE(review): checkout@v2 targets a retired Node.js runtime; consider a
      # newer major once the self-hosted runner version is confirmed.
      - name: Checkout msamp
        uses: actions/checkout@v2
        with:
          submodules: true
          path: ${{ matrix.dir }}

      # Builds the msccl submodule for the sm_70, sm_80 and sm_90 GPU
      # architectures and installs it.
      - name: Install MSCCL
        run: |
          cd ${{ matrix.dir }}/third_party/msccl
          make -j src.build NVCC_GENCODE="\
          -gencode=arch=compute_70,code=sm_70 \
          -gencode=arch=compute_80,code=sm_80 \
          -gencode=arch=compute_90,code=sm_90"
          make install

      # Installs python3-mpi4py from apt, then the package with its [test]
      # extras, then runs the repository's `make postinstall` target.
      - name: Install dependencies
        run: |
          export LD_LIBRARY_PATH="/usr/local/lib:$LD_LIBRARY_PATH"
          export DEBIAN_FRONTEND=noninteractive
          python3 -m pip install --upgrade pip
          apt-get update && apt-get install -y python3-mpi4py
          cd ${{ matrix.dir }}/
          python3 -m pip install .[test]
          make postinstall

      - name: Run code lint
        run: |
          cd ${{ matrix.dir }}/
          python3 setup.py lint

      # Preloads libmsamp_dist.so and libnccl.so from /usr/local/lib before
      # running the tests (presumably the libraries installed by the steps
      # above -- confirm against the Makefiles).
      - name: Run unit tests
        run: |
          export LD_LIBRARY_PATH="/usr/local/lib:$LD_LIBRARY_PATH"
          export LD_PRELOAD="/usr/local/lib/libmsamp_dist.so:/usr/local/lib/libnccl.so:${LD_PRELOAD}"
          cd ${{ matrix.dir }}/
          python3 setup.py test

      # always() makes this run even when an earlier step failed, so the
      # checkout directory is removed after every run.
      - name: Clean repository
        if: always()
        run: |
          rm -rf ${{ matrix.dir }}/

      # - name: Report coverage results
      #   run: |
      #     bash <(curl -s https://codecov.io/bash)