From dec879e68a40c5f89c93f1b24d885dfdf93e7122 Mon Sep 17 00:00:00 2001 From: Kevin DeShawn <126115026+KevDevSha@users.noreply.github.com> Date: Wed, 21 Aug 2024 13:53:35 -0500 Subject: [PATCH] add ghcr and update build matrix generator (#3465) Co-authored-by: Mihir Patel --- .../docker-configure-build-push.yaml | 30 +++++++++++------- .github/workflows/pr-docker.yaml | 7 +++-- .github/workflows/release-docker.yaml | 10 ++++-- .github/workflows/release.yaml | 2 ++ docker/README.md | 26 ++++++++-------- docker/build_matrix.yaml | 16 ++++++++++ docker/generate_build_matrix.py | 31 +++++++++++++------ 7 files changed, 83 insertions(+), 39 deletions(-) diff --git a/.github/workflows/docker-configure-build-push.yaml b/.github/workflows/docker-configure-build-push.yaml index a668e75217..8d6884a96c 100644 --- a/.github/workflows/docker-configure-build-push.yaml +++ b/.github/workflows/docker-configure-build-push.yaml @@ -1,4 +1,4 @@ -name: Docker Image Configure-Build-Push +name: Docker/GHCR Image Configure-Build-Push on: workflow_call: inputs: @@ -23,6 +23,9 @@ on: staging-repo: required: false type: string + ghcr-staging-repo: + required: false + type: string tags: required: true type: string @@ -34,18 +37,14 @@ on: required: true password: required: true + ghcr_username: + required: true + ghcr_password: + required: true jobs: configure-build-push: runs-on: mosaic-4wide steps: - - name: Maximize Build Space on Worker - uses: easimon/maximize-build-space@v4 - with: - overprovision-lvm: true - remove-dotnet: true - remove-android: true - remove-haskell: true - - name: Checkout uses: actions/checkout@v3 @@ -60,7 +59,12 @@ jobs: with: username: ${{ secrets.username }} password: ${{ secrets.password }} - + - name: Login to GHCR + uses: docker/login-action@v3 + with: + username: ${{ secrets.ghcr_username }} + password: ${{ secrets.ghcr_password }} + registry: ghcr.io - name: Calculate Docker Image Variables run: | set -euo pipefail @@ -70,7 +74,8 @@ jobs: ################### if [ "${{ inputs.staging }}" = "true" ]; then STAGING_REPO=${{ inputs.staging-repo }} - IMAGE_TAG=${STAGING_REPO}:${{ inputs.image-uuid }} + GHCR_STAGING_REPO=${{ inputs.ghcr-staging-repo }} + IMAGE_TAG=${STAGING_REPO}:${{ inputs.image-uuid }},${GHCR_STAGING_REPO}:${{ inputs.image-uuid }} IMAGE_CACHE="${STAGING_REPO}:${{ inputs.image-name }}-buildcache" else IMAGE_TAG=${{ inputs.tags }} @@ -81,7 +86,8 @@ jobs: echo "IMAGE_CACHE=${IMAGE_CACHE}" >> ${GITHUB_ENV} - name: IMAGE_TAG = ${{ env.IMAGE_TAG }} - run: echo ${{ env.IMAGE_TAG }} + run: | + echo ${{ env.IMAGE_TAG }} - name: Build and Push the Docker Image uses: docker/build-push-action@v3 diff --git a/.github/workflows/pr-docker.yaml b/.github/workflows/pr-docker.yaml index 352eab881b..f46a934019 100644 --- a/.github/workflows/pr-docker.yaml +++ b/.github/workflows/pr-docker.yaml @@ -1,4 +1,4 @@ -name: PR Docker +name: PR Docker/GHCR on: pull_request: branches: @@ -16,7 +16,7 @@ defaults: jobs: build-image-matrix: if: github.repository_owner == 'mosaicml' - runs-on: ubuntu-latest + runs-on: linux-ubuntu-latest timeout-minutes: 2 outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} @@ -64,8 +64,11 @@ jobs: push: true staging: true staging-repo: mosaicml/ci-staging + ghcr-staging-repo: ghcr.io/databricks-mosaic/ci-staging tags: ${{ matrix.TAGS }} target: ${{ matrix.TARGET }} secrets: username: ${{ secrets.DOCKER_HUB_USERNAME }} password: ${{ secrets.DOCKER_HUB_PASSWORD }} + ghcr_username: ${{ secrets.GHCR_USERNAME }} + ghcr_password: ${{ secrets.GHCR_TOKEN }} diff --git a/.github/workflows/release-docker.yaml b/.github/workflows/release-docker.yaml index e992663994..9d173a97c3 100644 --- a/.github/workflows/release-docker.yaml +++ b/.github/workflows/release-docker.yaml @@ -1,4 +1,4 @@ -name: Release Docker Images +name: Release Docker/GHCR Images on: workflow_dispatch: @@ -8,6 +8,10 @@ on: required: true DOCKER_HUB_PASSWORD: required: true + GHCR_USERNAME: + required: true + GHCR_TOKEN: + required: true defaults: run: @@ -16,7 +20,7 @@ defaults: jobs: build-image-matrix: if: github.repository_owner == 'mosaicml' - runs-on: ubuntu-latest + runs-on: linux-ubuntu-latest timeout-minutes: 2 outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} @@ -64,3 +68,5 @@ jobs: secrets: username: ${{ secrets.DOCKER_HUB_USERNAME }} password: ${{ secrets.DOCKER_HUB_PASSWORD }} + ghcr_username: ${{ secrets.GHCR_USERNAME }} + ghcr_password: ${{ secrets.GHCR_TOKEN }} diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index c841e6c150..2bc86572f9 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -111,3 +111,5 @@ jobs: secrets: DOCKER_HUB_USERNAME: ${{ secrets.DOCKER_HUB_USERNAME }} DOCKER_HUB_PASSWORD: ${{ secrets.DOCKER_HUB_PASSWORD }} + GHCR_USERNAME: ${{ secrets.GHCR_USERNAME }} + GHCR_TOKEN: ${{ secrets.GHCR_TOKEN }} diff --git a/docker/README.md b/docker/README.md index 09dd2591f5..a8ebfa63e4 100644 --- a/docker/README.md +++ b/docker/README.md @@ -13,9 +13,9 @@ all dependencies for both NLP and Vision models. They are built on top of the `mosaicml/composer:latest` or `mosaicml/composer:latest_cpu`, which will always be up to date. -| Composer Version | CUDA Support | Docker Tag | -|--------------------|----------------|----------------------------------------------------------------| -| 0.23.5 | Yes | `mosaicml/composer:latest`, `mosaicml/composer:0.23.5` | +| Composer Version | CUDA Support | Docker Tag | +|--------------------|----------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| 0.23.5 | Yes | `mosaicml/composer:latest`, `mosaicml/composer:0.23.5` | | 0.23.5 | No | `mosaicml/composer:latest_cpu`, `mosaicml/composer:0.23.5_cpu` | @@ -28,17 +28,17 @@ The [`mosaicml/pytorch`](https://hub.docker.com/r/mosaicml/pytorch) images conta To install composer, once inside the image, run `pip install mosaicml`. -| Linux Distro | Flavor | PyTorch Version | CUDA Version | Python Version | Docker Tags | -|----------------|----------|-------------------|---------------------|------------------|------------------------------------------------------------------------------------------| -| Ubuntu 20.04 | Base | 2.4.0 | 12.4.1 (Infiniband) | 3.11 | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04` | +| Linux Distro | Flavor | PyTorch Version | CUDA Version | Python Version | Docker Tags | +|----------------|----------|-------------------|---------------------|------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Ubuntu 20.04 | Base | 2.4.0 | 12.4.1 (Infiniband) | 3.11 | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04` | | Ubuntu 20.04 | Base | 2.4.0 | 12.4.1 (EFA) | 3.11 | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws` | -| Ubuntu 20.04 | Base | 2.4.0 | cpu | 3.11 | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.3.1 | 12.1.1 (Infiniband) | 3.11 | `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.3.1 | 12.1.1 (EFA) | 3.11 | `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws` | -| Ubuntu 20.04 | Base | 2.3.1 | cpu | 3.11 | `mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.2.2 | 12.1.1 (Infiniband) | 3.11 | `mosaicml/pytorch:2.2.2_cu121-python3.11-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.2.2 | 12.1.1 (EFA) | 3.11 | `mosaicml/pytorch:2.2.2_cu121-python3.11-ubuntu20.04-aws` | -| Ubuntu 20.04 | Base | 2.2.2 | cpu | 3.11 | `mosaicml/pytorch:2.2.2_cpu-python3.11-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.4.0 | cpu | 3.11 | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.3.1 | 12.1.1 (Infiniband) | 3.11 | `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.3.1 | 12.1.1 (EFA) | 3.11 | `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws` | +| Ubuntu 20.04 | Base | 2.3.1 | cpu | 3.11 | `mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.2.2 | 12.1.1 (Infiniband) | 3.11 | `mosaicml/pytorch:2.2.2_cu121-python3.11-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.2.2 | 12.1.1 (EFA) | 3.11 | `mosaicml/pytorch:2.2.2_cu121-python3.11-ubuntu20.04-aws` | +| Ubuntu 20.04 | Base | 2.2.2 | cpu | 3.11 | `mosaicml/pytorch:2.2.2_cpu-python3.11-ubuntu20.04` | **Note**: The `mosaicml/pytorch:latest`, `mosaicml/pytorch:latest_cpu`, and `mosaicml/pytorch:latest-aws` diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml index 2fb084a78b..8fb23a2b80 100644 --- a/docker/build_matrix.yaml +++ b/docker/build_matrix.yaml @@ -11,7 +11,9 @@ PYTORCH_VERSION: 2.4.0 TAGS: - mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04 + - ghcr.io/databricks-mosaic/pytorch:2.4.0_cu124-python3.11-ubuntu20.04 - mosaicml/pytorch:latest + - ghcr.io/databricks-mosaic/pytorch:latest TARGET: pytorch_stage TORCHVISION_VERSION: 0.19.0 - AWS_OFI_NCCL_VERSION: v1.9.1-aws @@ -26,7 +28,9 @@ PYTORCH_VERSION: 2.4.0 TAGS: - mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws + - ghcr.io/databricks-mosaic/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws - mosaicml/pytorch:latest-aws + - ghcr.io/databricks-mosaic/pytorch:latest-aws TARGET: pytorch_stage TORCHVISION_VERSION: 0.19.0 - AWS_OFI_NCCL_VERSION: '' @@ -41,7 +45,9 @@ PYTORCH_VERSION: 2.4.0 TAGS: - mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 + - ghcr.io/databricks-mosaic/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 - mosaicml/pytorch:latest_cpu + - ghcr.io/databricks-mosaic/pytorch:latest_cpu TARGET: pytorch_stage TORCHVISION_VERSION: 0.19.0 - AWS_OFI_NCCL_VERSION: '' @@ -69,6 +75,7 @@ PYTORCH_VERSION: 2.3.1 TAGS: - mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04 + - ghcr.io/databricks-mosaic/pytorch:2.3.1_cu121-python3.11-ubuntu20.04 TARGET: pytorch_stage TORCHVISION_VERSION: 0.18.1 - AWS_OFI_NCCL_VERSION: v1.9.1-aws @@ -96,6 +103,7 @@ PYTORCH_VERSION: 2.3.1 TAGS: - mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws + - ghcr.io/databricks-mosaic/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws TARGET: pytorch_stage TORCHVISION_VERSION: 0.18.1 - AWS_OFI_NCCL_VERSION: '' @@ -110,6 +118,7 @@ PYTORCH_VERSION: 2.3.1 TAGS: - mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04 + - ghcr.io/databricks-mosaic/pytorch:2.3.1_cpu-python3.11-ubuntu20.04 TARGET: pytorch_stage TORCHVISION_VERSION: 0.18.1 - AWS_OFI_NCCL_VERSION: '' @@ -137,6 +146,7 @@ PYTORCH_VERSION: 2.2.2 TAGS: - mosaicml/pytorch:2.2.2_cu121-python3.11-ubuntu20.04 + - ghcr.io/databricks-mosaic/pytorch:2.2.2_cu121-python3.11-ubuntu20.04 TARGET: pytorch_stage TORCHVISION_VERSION: 0.17.2 - AWS_OFI_NCCL_VERSION: v1.9.1-aws @@ -164,6 +174,7 @@ PYTORCH_VERSION: 2.2.2 TAGS: - mosaicml/pytorch:2.2.2_cu121-python3.11-ubuntu20.04-aws + - ghcr.io/databricks-mosaic/pytorch:2.2.2_cu121-python3.11-ubuntu20.04-aws TARGET: pytorch_stage TORCHVISION_VERSION: 0.17.2 - AWS_OFI_NCCL_VERSION: '' @@ -178,6 +189,7 @@ PYTORCH_VERSION: 2.2.2 TAGS: - mosaicml/pytorch:2.2.2_cpu-python3.11-ubuntu20.04 + - ghcr.io/databricks-mosaic/pytorch:2.2.2_cpu-python3.11-ubuntu20.04 TARGET: pytorch_stage TORCHVISION_VERSION: 0.17.2 - AWS_OFI_NCCL_VERSION: '' @@ -193,7 +205,9 @@ PYTORCH_VERSION: 2.4.0 TAGS: - mosaicml/composer:0.23.5 + - ghcr.io/databricks-mosaic/composer:0.23.5 - mosaicml/composer:latest + - ghcr.io/databricks-mosaic/composer:latest TARGET: composer_stage TORCHVISION_VERSION: 0.19.0 - AWS_OFI_NCCL_VERSION: '' @@ -209,6 +223,8 @@ PYTORCH_VERSION: 2.4.0 TAGS: - mosaicml/composer:0.23.5_cpu + - ghcr.io/databricks-mosaic/composer:0.23.5_cpu - mosaicml/composer:latest_cpu + - ghcr.io/databricks-mosaic/composer:latest_cpu TARGET: composer_stage TORCHVISION_VERSION: 0.19.0 diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py index a1cf5bca3b..773a20f6db 100644 --- a/docker/generate_build_matrix.py +++ b/docker/generate_build_matrix.py @@ -12,6 +12,7 @@ import itertools import os +import re import sys import packaging.version @@ -105,34 +106,39 @@ def _get_cuda_override(cuda_version: str): def _get_pytorch_tags(python_version: str, pytorch_version: str, cuda_version: str, stage: str, interconnect: str): if stage == 'pytorch_stage': base_image_name = 'mosaicml/pytorch' + ghcr_base_image_name = 'ghcr.io/databricks-mosaic/pytorch' else: raise ValueError(f'Invalid stage: {stage}') + tags = [] cuda_version_tag = _get_cuda_version_tag(cuda_version) - tags = [f'{base_image_name}:{pytorch_version}_{cuda_version_tag}-python{python_version}-ubuntu20.04'] + tags += [ + f'{base_image_name}:{pytorch_version}_{cuda_version_tag}-python{python_version}-ubuntu20.04', + f'{ghcr_base_image_name}:{pytorch_version}_{cuda_version_tag}-python{python_version}-ubuntu20.04', + ] if python_version == PRODUCTION_PYTHON_VERSION and pytorch_version == PRODUCTION_PYTORCH_VERSION: if not cuda_version: - tags.append(f'{base_image_name}:latest_cpu') + tags += [f'{base_image_name}:latest_cpu', f'{ghcr_base_image_name}:latest_cpu'] else: - tags.append(f'{base_image_name}:latest') + tags += [f'{base_image_name}:latest', f'{ghcr_base_image_name}:latest'] if interconnect == 'EFA': tags = [f'{tag}-aws' for tag in tags] - return tags def _get_composer_tags(composer_version: str, use_cuda: bool): base_image_name = 'mosaicml/composer' + ghcr_base_image_name = 'ghcr.io/databricks-mosaic/composer' tags = [] if not use_cuda: - tags.append(f'{base_image_name}:{composer_version}_cpu') - tags.append(f'{base_image_name}:latest_cpu') + tags += [f'{base_image_name}:{composer_version}_cpu', f'{ghcr_base_image_name}:{composer_version}_cpu'] + tags += [f'{base_image_name}:latest_cpu', f'{ghcr_base_image_name}:latest_cpu'] else: - tags.append(f'{base_image_name}:{composer_version}') - tags.append(f'{base_image_name}:latest') - + tags += [f'{base_image_name}:{composer_version}', f'{ghcr_base_image_name}:{composer_version}'] + tags += [f'{base_image_name}:latest', f'{ghcr_base_image_name}:latest'] + print(tags) return tags @@ -161,8 +167,13 @@ def _write_table(table_tag: str, table_contents: str): end_table_tag = f'' pre = contents.split(begin_table_tag)[0] - post = contents.split(end_table_tag)[1] + if end_table_tag in contents: + post = contents.split(end_table_tag)[1] + else: + print(f"Warning: '{end_table_tag}' not found in contents.") + post = '' new_readme = f'{pre}{begin_table_tag}\n{table_contents}\n{end_table_tag}{post}' + new_readme = re.sub(r'`ghcr\.io\S*, ', '', new_readme) with open(os.path.join(os.path.dirname(__name__), 'README.md'), 'w') as f: f.write(new_readme)