# Skip to content
#
# Support writing optimizer checkpoint only on rank0 and make UT pass on A100 #354
#
# Support writing optimizer checkpoint only on rank0 and make UT pass on A100
#
# Support writing optimizer checkpoint only on rank0 and make UT pass on A100 #354
#
# Workflow file for this run

# Workflow that builds the project's Docker images.
name: Build Image

# Triggers: pushes and pull requests that target main, plus published releases.
on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
  release:
    types:
      - published
jobs:
  # Builds one Docker image per matrix entry. Images are pushed to ghcr.io
  # for every event except pull requests, where the image is only built.
  docker:
    name: Docker build ${{ matrix.name }}
    runs-on: [self-hosted, linux, x64, gpu]
    timeout-minutes: 600
    permissions:
      contents: read
      packages: write
    strategy:
      matrix:
        # `name` selects dockerfile/<name>.dockerfile; `tags` is a
        # comma-separated list of image references for that build.
        include:
          - name: torch1.14-cuda11.8
            tags: ghcr.io/azure/msamp:main-cuda11.8
          - name: torch2.1-cuda12.2
            tags: ghcr.io/azure/msamp:main-cuda12.2,ghcr.io/azure/msamp:latest
    steps:
      # NOTE(review): the actions below are pinned to old major versions
      # (checkout@v2, docker/*@v1, build-push-action@v2) -- confirm they
      # still run on the current runner before relying on them.
      - name: Checkout
        uses: actions/checkout@v2
        with:
          submodules: true
      - name: Free disk space
        run: |
          # Empty each directory by syncing an empty directory over it.
          mkdir -p /tmp/emptydir
          for dir in /usr/share/swift /usr/share/dotnet /usr/local/share/powershell /usr/local/share/chromium /usr/local/lib/android /opt/ghc; do
            sudo rsync -a --delete /tmp/emptydir/ "${dir}"
          done
          sudo apt-get clean
          # Check if Docker images exist before trying to remove them
          if sudo docker images -q --filter=reference="node" --filter=reference="buildpack-deps" | grep -q .; then
            sudo docker rmi $(sudo docker images --format "{{.Repository}}:{{.Tag}}" --filter=reference="node" --filter=reference="buildpack-deps")
          else
            echo "No Docker images found with the specified references."
          fi
          df -h
      - name: Prepare metadata
        id: metadata
        run: |
          TAGS="${{ matrix.tags }}"
          if [[ "${{ github.event_name }}" == "release" ]]; then
            # On a release, replace every "main" in the tag list with the
            # last path component of GITHUB_REF.
            TAGS=$(sed "s/main/${GITHUB_REF##*/}/g" <<< "${TAGS}")
          fi
          DOCKERFILE="dockerfile/${{ matrix.name }}.dockerfile"
          # Use the first tag in the list as the registry cache source.
          CACHE_FROM="type=registry,ref=$(cut -d, -f1 <<< "${TAGS}")"
          # Pull requests export no cache; every other event uses inline cache.
          # NOTE(review): BuildKit's inline exporter may only support
          # mode=min -- confirm whether mode=max has any effect here.
          CACHE_TO=""
          if [[ "${{ github.event_name }}" != "pull_request" ]]; then
            CACHE_TO="type=inline,mode=max"
          fi
          # BUILD_ARGS is never assigned in this script; default it to empty
          # while keeping any value inherited from the runner environment.
          BUILD_ARGS="${BUILD_ARGS:-}"
          # Publish step outputs through the GITHUB_OUTPUT file; the
          # `::set-output` workflow command is deprecated.
          {
            echo "dockerfile=${DOCKERFILE}"
            echo "build_args=${BUILD_ARGS}"
            echo "tags=${TAGS}"
            echo "cache_from=${CACHE_FROM}"
            echo "cache_to=${CACHE_TO}"
          } >> "${GITHUB_OUTPUT}"
      - name: Echo build args
        run: echo "${{ steps.metadata.outputs.build_args }}"
      - name: Echo image tag
        run: echo "${{ steps.metadata.outputs.tags }}"
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v1
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v1
      # Registry login is skipped for pull requests, which never push.
      - name: Login to the GitHub Container Registry
        uses: docker/login-action@v1
        if: ${{ github.event_name != 'pull_request' }}
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Build and push
        id: docker_build
        uses: docker/build-push-action@v2
        with:
          platforms: linux/amd64
          context: .
          file: ${{ steps.metadata.outputs.dockerfile }}
          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.metadata.outputs.tags }}
          cache-from: ${{ steps.metadata.outputs.cache_from }}
          cache-to: ${{ steps.metadata.outputs.cache_to }}
          build-args: |
            ${{ steps.metadata.outputs.build_args }}
          labels: |
            org.opencontainers.image.source=${{ github.event.repository.html_url }}
            org.opencontainers.image.created=${{ github.event.repository.pushed_at }}
            org.opencontainers.image.revision=${{ github.sha }}
      - name: Echo image digest
        run: echo ${{ steps.docker_build.outputs.digest }}