Skip to content

Address incosistencies in DataScaling infrastructure #1138

Address incosistencies in DataScaling infrastructure

Address incosistencies in DataScaling infrastructure #1138

Workflow file for this run

name: CPU tests
on:
workflow_dispatch:
pull_request:
# Trigger on pull requests to master or develop that are
# marked as "ready for review" (non-draft PRs)
types:
- opened
- synchronize
- reopened
- ready_for_review
branches:
- master
- develop
push:
# Trigger on pushes to master or develop and for git tag pushes
branches:
- master
- develop
tags:
- v*
env:
IMAGE_NAME: mala_conda_cpu
IMAGE_REGISTRY: ghcr.io
DOCKER_CACHE_PATH: cache/docker
jobs:
build-docker-image-cpu:
# do not trigger on draft PRs
if: ${{ ! github.event.pull_request.draft }}
# Build and push temporary Docker image to GitHub's container registry
runs-on: ubuntu-22.04
steps:
- name: Check out repository
uses: actions/checkout@v4
with:
fetch-depth: '1'
- name: Set environment variables
run: |
# Change all uppercase letters to lowercase
IMAGE_REPO=$IMAGE_REGISTRY/$(echo $GITHUB_REPOSITORY_OWNER | tr '[A-Z]' '[a-z]')
# Create environment variable to which any subsequent steps inside this workflow's job have access
echo "IMAGE_REPO=$IMAGE_REPO" >> $GITHUB_ENV
echo "IMAGE_REPO=$IMAGE_REPO"
- name: Restore cache
uses: actions/cache@v4
id: cache-docker
with:
path: ${{ env.DOCKER_CACHE_PATH }}
key: ${{ github.run_id }}
- name: Check existence of Docker tar archive
id: check_file
run: |
if [[ -f "$DOCKER_CACHE_PATH/docker-image.tar.gz" ]]; then
echo "FILE_EXISTS=true" >> $GITHUB_OUTPUT
else
echo "FILE_EXISTS=false" >> $GITHUB_OUTPUT
fi
- name: Pull latest image from container registry
run: docker pull $IMAGE_REPO/$IMAGE_NAME --quiet || true
- name: Build temporary Docker image
run: |
if [[ "${{ steps.check_file.outputs.FILE_EXISTS }}" == 'true' ]]
then
docker load -i $DOCKER_CACHE_PATH/docker-image.tar.gz
CACHE=$IMAGE_NAME:$GITHUB_RUN_ID
else
CACHE=$IMAGE_REPO/$IMAGE_NAME:latest
fi
DOCKER_BUILDKIT=0 docker build . --file Dockerfile --tag $IMAGE_NAME:local --cache-from=$CACHE --build-arg DEVICE=cpu
# Show images
docker images --filter=reference=$IMAGE_NAME --filter=reference=$IMAGE_REPO/$IMAGE_NAME
# Get image IDs (hash of the local image JSON configuration) and check if images are equal
IMAGE_ID_OLD=$(docker images --format "{{.ID}}" --no-trunc --filter=reference=$IMAGE_REPO/$IMAGE_NAME:latest)
IMAGE_ID_NEW=$(docker images --format "{{.ID}}" --no-trunc --filter=reference=$IMAGE_NAME:local)
echo "IMAGE_ID_OLD=$IMAGE_ID_OLD" >> $GITHUB_ENV
echo "IMAGE_ID_NEW=$IMAGE_ID_NEW" >> $GITHUB_ENV
if [[ "$IMAGE_ID_OLD" == "$IMAGE_ID_NEW" ]]
then
echo "Image IDs are equal"; DOCKER_TAG=latest
else
echo "Image IDs are different"; DOCKER_TAG=$GITHUB_RUN_ID
fi
# Environment variable DOCKER_TAG references the image in subsequent jobs
echo "DOCKER_TAG=$DOCKER_TAG" >> $GITHUB_ENV
- name: Tag and save temporary built Docker image
# If temporary image is different from latest on ghcr.io
if: env.IMAGE_ID_OLD != env.IMAGE_ID_NEW
run: |
mkdir -p $DOCKER_CACHE_PATH
# Use GITHUB_RUN_ID, a unique number for each workflow run within a repository as a temporary Docker tag
docker tag $IMAGE_NAME:local $IMAGE_NAME:$GITHUB_RUN_ID
# Save Docker image locally
docker save $IMAGE_NAME:$GITHUB_RUN_ID -o $DOCKER_CACHE_PATH/docker-image.tar
pigz -f -v --fast $DOCKER_CACHE_PATH/docker-image.tar
outputs:
# Make variables available to all downstream jobs that depend on this job
image-repo: ${{ env.IMAGE_REPO }}
docker-tag: ${{ env.DOCKER_TAG }}
cpu-tests:
needs: build-docker-image-cpu
runs-on: ubuntu-20.04
env:
IMAGE_REPO: ${{ needs.build-docker-image-cpu.outputs.image-repo }}
DOCKER_TAG: ${{ needs.build-docker-image-cpu.outputs.docker-tag }}
steps:
- name: "Prepare environment: Restore cache"
if: env.DOCKER_TAG != 'latest'
uses: actions/cache@v4
id: cache-docker
with:
path: ${{ env.DOCKER_CACHE_PATH }}
key: ${{ github.run_id }}
- name: "Prepare environment: Load Docker image from cache"
if: env.DOCKER_TAG != 'latest'
run: docker load -i $DOCKER_CACHE_PATH/docker-image.tar.gz --quiet
- name: "Prepare environment: Pull latest image from container registry"
if: env.DOCKER_TAG == 'latest'
run: |
docker pull $IMAGE_REPO/$IMAGE_NAME:latest --quiet
docker image tag $IMAGE_REPO/$IMAGE_NAME:latest $IMAGE_NAME:latest
- name: "Prepare environment: Run Docker container"
run: |
# Make the github workspace (i.e. checked out mala repository) available inside Docker container by binding it to /home via -v
docker run -i -d --rm --name mala-cpu -v ${{ github.workspace }}:/home -w /home $IMAGE_NAME:$DOCKER_TAG /bin/bash &
# Wait for Docker container to come online
wait
# Check running docker container
docker ps
# Check if docker container is running
[[ $(docker inspect --format '{{json .State.Running}}' mala-cpu) == 'true' ]]
- name: Check out repository (mala)
uses: actions/checkout@v4
with:
fetch-depth: '1'
- name: Install mala package
# Exec all commands inside the mala-cpu container
shell: 'bash -c "docker exec -i mala-cpu bash < {0}"'
run: |
# export Docker image Conda environment for a later comparison
conda env export -n mala-cpu > env_before.yml
# install mala package
pip --no-cache-dir install -e .[opt,test] --no-build-isolation
- name: Check if Conda environment meets the specified requirements
shell: 'bash -c "docker exec -i mala-cpu bash < {0}"'
run: |
# export Conda environment _with_ mala package installed in it (and extra dependencies)
conda env export -n mala-cpu > env_after.yml
# if comparison fails, `install/mala_cpu_[base]_environment.yml` needs to be aligned with
# `requirements.txt` and/or extra dependencies are missing in the Docker Conda environment
if diff --brief env_before.yml env_after.yml
then
echo "Files env_before.yml and env_after.yml do not differ."
else
diff --side-by-side --color-always env_before.yml env_after.yml
fi
- name: Download test data repository from RODARE
shell: 'bash -c "docker exec -i mala-cpu python < {0}"'
run: |
import requests, shutil, zipfile
# This DOI represents all versions, and will always resolve to the latest one
DOI = "https://doi.org/10.14278/rodare.2900"
# Resolve DOI and get record ID and the associated API URL
response = requests.get(DOI)
*_, record_id = response.url.split("/")
api_url = f"https://rodare.hzdr.de/api/records/{record_id}"
# Download record from API and get the first file
response = requests.get(api_url)
record = response.json()
size = record["files"][0]["size"]
download_link = record["files"][0]["links"]["self"]
print(size, "bytes", "--", download_link)
# TODO: implement some sort of auto retry for failed HTTP requests
response = requests.get(download_link)
# Saving downloaded content to a file
with open("test-data.zip", mode="wb") as file:
file.write(response.content)
# Get top level directory name
dir_name = zipfile.ZipFile("test-data.zip").namelist()[0]
shutil.unpack_archive("test-data.zip", ".")
print(f"Rename {dir_name} to mala_data")
shutil.move(dir_name, "mala_data")
- name: Test mala
shell: 'bash -c "docker exec -i mala-cpu bash < {0}"'
run: MALA_DATA_REPO=$(pwd)/mala_data pytest --cov=mala --cov-fail-under=60 -m "not examples" --disable-warnings
retag-docker-image-cpu:
needs: [cpu-tests, build-docker-image-cpu]
runs-on: ubuntu-22.04
permissions:
packages: write
env:
IMAGE_REPO: ${{ needs.build-docker-image-cpu.outputs.image-repo }}
DOCKER_TAG: ${{ needs.build-docker-image-cpu.outputs.docker-tag }}
# Trigger on pushes to master or develop (this includes _merged_ PRs) only if Docker image has changed and on git tag pushes
# NOTE: The `env` context is not available for workflow key `jobs.<job_id>.if`.
if: |
((contains(github.ref_name, 'develop') || contains(github.ref_name, 'master')) && needs.build-docker-image-cpu.outputs.docker-tag != 'latest')
|| startsWith(github.ref, 'refs/tags/')
steps:
- name: "Prepare environment: Restore cache"
if: env.DOCKER_TAG != 'latest'
uses: actions/cache@v4
id: cache-docker
with:
path: ${{ env.DOCKER_CACHE_PATH }}
key: ${{ github.run_id }}
- name: "Prepare environment: Load Docker image from cache"
if: env.DOCKER_TAG != 'latest'
run: docker load -i $DOCKER_CACHE_PATH/docker-image.tar.gz --quiet
- name: "Prepare environment: Pull latest image from container registry"
if: env.DOCKER_TAG == 'latest'
run: docker pull $IMAGE_REPO/$IMAGE_NAME:latest --quiet
- name: Tag Docker image
run: |
# Execute on change of Docker image
if [[ "$DOCKER_TAG" != 'latest' ]]; then
docker tag $IMAGE_NAME:$GITHUB_RUN_ID $IMAGE_REPO/$IMAGE_NAME:latest
docker tag $IMAGE_NAME:$GITHUB_RUN_ID $IMAGE_REPO/$IMAGE_NAME:${GITHUB_REF_NAME}-${GITHUB_SHA:0:7}
fi
# Execute on push of git tag
if ${{ startsWith(github.ref, 'refs/tags/') }}; then
GIT_TAG=$(echo "${{ github.ref_name }}" | sed -e 's/^v//')
echo "GIT_TAG=$GIT_TAG"
docker tag $IMAGE_REPO/$IMAGE_NAME:latest $IMAGE_REPO/$IMAGE_NAME:$GIT_TAG
fi
- name: Log into container registry
# Authentication required for pushing images
run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u ${{ github.actor }} --password-stdin
- name: Push Docker image
run: docker push $IMAGE_REPO/$IMAGE_NAME --all-tags | grep -v -E 'Waiting|Layer already|Preparing|Pushed'