Skip to content

use more shards (#2938) #3588

use more shards (#2938)

use more shards (#2938) #3588

Workflow file for this run

on:
push:
branches:
- main
paths-ignore:
- README.md
- withdrawn-images.txt
- withdrawn-repos.txt
- reinstated-images.txt
schedule:
- cron: "0 0 * * *"
workflow_dispatch:
inputs:
only:
description: "Specific image name to build"
type: string
required: false
default: ""
concurrency: release
env:
TOTAL_SHARDS: 256
TF_VAR_target_repository: cgr.dev/chainguard
permissions:
contents: read
jobs:
shard:
runs-on: ubuntu-latest
steps:
- uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
with:
egress-policy: audit
- uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
- id: shard
name: Shard
shell: bash # bash math foo required
run: |
images=($(find ./images -maxdepth 1 -type d -not -path "./images/TEMPLATE" | awk -F'/' '{print $3}' | sort -u))
# n buckets to shard into
n=${{ env.TOTAL_SHARDS }}
total=${#images[@]}
base_size=$((total / n))
remainder=$((total % n))
declare -a bins
# Sequentially fill up each bin, and append any remainders to the last bin
for ((i = 0; i < total; i++)); do
idx=$((i < (total - remainder) ? i / base_size : n - 1))
bins[$idx]+="${images[$i]} "
done
matrix=$(printf "%s\n" "${bins[@]}" | jq -cRnjr '[inputs] | [ range(0; length) as $i | { "index": $i | tostring, "images": .[$i] } ]')
echo "matrix=${matrix}" >> $GITHUB_OUTPUT
# Overwrite the output above if workflow_dispatch'd with `only`
if [ -n "${{ inputs.only }}" ]; then
shard='[{"index": 0, "images": "${{ inputs.only }}"}]'
echo "matrix=${shard}" >> $GITHUB_OUTPUT
fi
- name: Shard Results
run: echo '${{ steps.shard.outputs.matrix }}'
outputs:
# This is of the format [{"index": 0, "images": "a b c"}, {"index": 1, "images": "d e f"}, ...]
matrix: "${{steps.shard.outputs.matrix}}"
build:
runs-on: ubuntu-latest-32-cores
needs: shard
strategy:
fail-fast: false
matrix:
shard: ${{ fromJson(needs.shard.outputs.matrix) }}
permissions:
id-token: write
packages: write
contents: read
actions: read
steps:
- uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
with:
egress-policy: audit
- uses: hashicorp/setup-terraform@b9cd54a3c349d3f38e8881555d616ced269862dd # v3
with:
terraform_version: "1.8.*"
terraform_wrapper: false
- uses: chainguard-dev/setup-chainctl@598499528905f95b94e62e4831cf42035e768933 # v0.2.3
with:
# This allows chainguard-images/images-private to publish images to cgr.dev/chainguard-private
# We maintain this identity here:
# https://github.com/chainguard-dev/mono/blob/main/env/chainguard-images/iac/images-pusher.tf
identity: 720909c9f5279097d847ad02a2f24ba8f59de36a/b6461e99e132298f
- uses: chainguard-dev/actions/setup-k3d@ba1a9c9ffe799736883d58f31caff18d85b2800e # main
with:
k3s-image: cgr.dev/chainguard/k3s:latest@sha256:78fe2d04d80c000306a075677b180e3ef857e3116b423daa119c0e63f02d7912
# Make cosign/crane CLI available to the tests
- uses: sigstore/cosign-installer@dc72c7d5c4d10cd6bcb8cf6e3fd625a9e5e537da # v3.7.0
- uses: imjasonh/setup-crane@31b88efe9de28ae0ffa220711af4b60be9435f6e # v0.4
- uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
- name: Terraform apply
timeout-minutes: 60
run: |
set -exo pipefail
env | grep '^TF_VAR_'
make init-upgrade
targets=""
for image in ${{ matrix.shard.images }}; do
targets+=' -target='module."${image}"''
done
terraform plan ${targets} -out mega-module.tfplan
terraform apply ${targets} -auto-approve --parallelism=$(nproc) -json mega-module.tfplan | tee /tmp/mega-module.tf.json | jq -r '.["@message"]'
- name: Collect TF diagnostics
if: ${{ always() }}
id: tf-diag
uses: chainguard-dev/actions/terraform-diag@ba1a9c9ffe799736883d58f31caff18d85b2800e # main
with:
json-file: /tmp/mega-module.tf.json
- name: Collect K8s diagnostics and upload
if: ${{ failure() }}
run: |
echo 'This step is deprecated. Please use the imagetest provider and reference the imagetest logs'
- name: Upload terraform logs
if: always()
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v3
with:
name: "mega-module-${{ matrix.shard.index }}.tf.json"
path: /tmp/mega-module.tf.json
- name: Upload imagetest logs
if: always()
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v3
with:
name: "mega-module-${{ matrix.shard.index }}-imagetest-logs"
path: imagetest-logs
- uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907 # v2.3.0
if: ${{ failure() && github.event_name == 'schedule' }}
env:
SLACK_ICON: http://github.com/chainguard-dev.png?size=48
SLACK_USERNAME: guardian
SLACK_WEBHOOK: ${{ secrets.DISTROLESS_SLACK_WEBHOOK }}
SLACK_MSG_AUTHOR: chainguardian
SLACK_CHANNEL: chainguard-images-alerts
SLACK_COLOR: "#8E1600"
MSG_MINIMAL: "true"
SLACK_TITLE: "[images] release failed (shard ${{ matrix.shard.index }} of ${{ env.TOTAL_SHARDS }})"
SLACK_MESSAGE: |
https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
${{ steps.tf-diag.outputs.errors }}
summary:
name: "Build Summary"
runs-on: ubuntu-latest
if: ${{ always() }}
needs: build
steps:
- uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
with:
egress-policy: audit
- name: "Download shard logs"
uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
with:
path: /tmp/shard-logs
pattern: mega-module-*
# Cat all the files into one, while maintaining their shard
- run: |
find '/tmp/shard-logs' -name 'mega-module.tf.json' | while read file; do
shard_index=$(echo "$file" | sed -E 's/.*mega-module-([0-9]+)\.tf\.json.*/\1/')
echo $shard_index
jq -cr --arg shard_index "$shard_index" '. + {"shard_index":$shard_index}' $file >> logs.tf.json
done
# process the logs
- run: |
# Create a file just for errors
jq -r 'select(.["@level"]=="error")' logs.tf.json > errors.tf.json
# Build the markdown table
- run: |
echo "| Status | Shard | Image | Summary | Address |" >> $GITHUB_STEP_SUMMARY
echo "| :-: | ----- | ----- | ------- | ------- |" >> $GITHUB_STEP_SUMMARY
# append the rows to the table
export rows="$(jq -r '"| ❌ | " + .shard_index + " | " + (.diagnostic.address | split(".")[1]) + " | ```" + .diagnostic.summary + "``` | ```" + .diagnostic.address + "``` |"' errors.tf.json)"
echo "${rows}"
cat >> $GITHUB_STEP_SUMMARY <<EOR
${rows}
EOR
- name: Error Details
run: |
# Print the errors as expandable groups
jq -r '"::group:: shard: " + .shard_index + " | " + (.diagnostic.address | split(".")[1]) + "\nresource: " + .diagnostic.address + "\n\nsummary: " + .diagnostic.summary + "\n\ndetails:\n\n" + .diagnostic.detail + "\n::endgroup::"' errors.tf.json || true