Skip to content

Added CI to create PRs to bump to latest Ark timestamps #1893

Added CI to create PRs to bump to latest Ark timestamps

Added CI to create PRs to bump to latest Ark timestamps #1893

Workflow file for this run

name: Test deployment and reimage on OpenStack
on:
workflow_dispatch:
workflow_call:
inputs:
target_branch:
type: string
default: ${{ github.ref }}
push:
branches:
- main
paths:
- '**'
- '!dev/**'
- 'dev/setup-env.sh'
- '!docs/**'
- '!README.md'
- '!.gitignore'
- '!.github/workflows/'
- '.github/workflows/stackhpc'
pull_request:
paths:
- '**'
- '!dev/**'
- 'dev/setup-env.sh'
- '!docs/**'
- '!README.md'
- '!.gitignore'
- '!.github/workflows/'
- '.github/workflows/stackhpc'
jobs:
openstack:
name: openstack-ci
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }} # to branch/PR + OS
cancel-in-progress: true
runs-on: ubuntu-22.04
strategy:
fail-fast: false # allow other matrix jobs to continue even if one fails
matrix:
os_version:
- RL8
- RL9
env:
ANSIBLE_FORCE_COLOR: True
OS_CLOUD: openstack
TF_VAR_cluster_name: slurmci-${{ matrix.os_version }}-${{ github.run_number }}
CI_CLOUD: ${{ vars.CI_CLOUD }} # default from repo settings
TF_VAR_os_version: ${{ matrix.os_version }}
steps:
- uses: stackhpc/ansible-slurm-appliance/.github/actions/is_callee@feat/auto-bump-timestamps # todo: change to main once merges
id: callee_check
with:
current_workflow_file: stackhpc.yml
- uses: actions/checkout@v3
with:
ref: ${{ steps.callee_check.outputs.is_callee == 'true' && inputs.target_branch || github.ref }}
- name: Override CI_CLOUD if PR label is present
if: ${{ github.event_name == 'pull_request' }}
run: |
# Iterate over the labels
labels=$(echo '${{ toJSON(github.event.pull_request.labels) }}' | jq -r '.[].name')
echo $labels
for label in $labels; do
if [[ $label == CI_CLOUD=* ]]; then
# Extract the value after 'CI_CLOUD='
CI_CLOUD_OVERRIDE=${label#CI_CLOUD=}
echo "CI_CLOUD=${CI_CLOUD_OVERRIDE}" >> $GITHUB_ENV
fi
done
- name: Record settings for CI cloud
run: |
echo CI_CLOUD: ${{ env.CI_CLOUD }}
- name: Setup ssh
run: |
set -x
mkdir ~/.ssh
echo "${{ secrets[format('{0}_SSH_KEY', env.CI_CLOUD)] }}" > ~/.ssh/id_rsa
chmod 0600 ~/.ssh/id_rsa
shell: bash
- name: Add bastion's ssh key to known_hosts
run: cat environments/.stackhpc/bastion_fingerprints >> ~/.ssh/known_hosts
shell: bash
- name: Install ansible etc
run: dev/setup-env.sh
- name: Install OpenTofu
uses: opentofu/setup-opentofu@v1
with:
tofu_version: 1.6.2
- name: Initialise terraform
run: terraform init
working-directory: ${{ github.workspace }}/environments/.stackhpc/terraform
- name: Write clouds.yaml
run: |
mkdir -p ~/.config/openstack/
echo "${{ secrets[format('{0}_CLOUDS_YAML', env.CI_CLOUD)] }}" > ~/.config/openstack/clouds.yaml
shell: bash
- name: Setup environment-specific inventory/terraform inputs
run: |
. venv/bin/activate
. environments/.stackhpc/activate
ansible-playbook ansible/adhoc/generate-passwords.yml
echo vault_testuser_password: "$TESTUSER_PASSWORD" > $APPLIANCES_ENVIRONMENT_ROOT/inventory/group_vars/all/test_user.yml
env:
TESTUSER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
- name: Provision nodes using fat image
id: provision_servers
run: |
. venv/bin/activate
. environments/.stackhpc/activate
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
terraform apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars"
- name: Delete infrastructure if provisioning failed
run: |
. venv/bin/activate
. environments/.stackhpc/activate
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
terraform destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars"
if: failure() && steps.provision_servers.outcome == 'failure'
- name: Configure cluster
run: |
. venv/bin/activate
. environments/.stackhpc/activate
ansible all -m wait_for_connection
ansible-playbook -v ansible/site.yml
ansible-playbook -v ansible/ci/check_slurm.yml
- name: Run MPI-based tests
run: |
. venv/bin/activate
. environments/.stackhpc/activate
ansible-playbook -vv ansible/adhoc/hpctests.yml
# - name: Run EESSI tests
# run: |
# . venv/bin/activate
# . environments/.stackhpc/activate
# ansible-playbook -vv ansible/ci/check_eessi.yml
- name: Confirm Open Ondemand is up (via SOCKS proxy)
run: |
. venv/bin/activate
. environments/.stackhpc/activate
# load ansible variables into shell:
ansible-playbook ansible/ci/output_vars.yml \
-e output_vars_hosts=openondemand \
-e output_vars_path=$APPLIANCES_ENVIRONMENT_ROOT/vars.txt \
-e output_vars_items=bastion_ip,bastion_user,openondemand_servername
source $APPLIANCES_ENVIRONMENT_ROOT/vars.txt
# setup ssh proxying:
sudo apt-get --yes install proxychains
echo proxychains installed
ssh -v -fN -D 9050 ${bastion_user}@${bastion_ip}
echo port 9050 forwarded
# check OOD server returns 200:
statuscode=$(proxychains wget \
--quiet \
--spider \
--server-response \
--no-check-certificate \
--http-user=testuser \
--http-password=${TESTUSER_PASSWORD} https://${openondemand_servername} \
2>&1)
(echo $statuscode | grep "200 OK") || (echo $statuscode && exit 1)
env:
TESTUSER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
# - name: Build environment-specific compute image
# id: packer_build
# run: |
# . venv/bin/activate
# . environments/.stackhpc/activate
# cd packer/
# packer init
# PACKER_LOG=1 packer build -except openstack.fatimage -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl
# ../dev/output_manifest.py packer-manifest.json # Sets NEW_COMPUTE_IMAGE_ID outputs
# - name: Test reimage of compute nodes to new environment-specific image (via slurm)
# run: |
# . venv/bin/activate
# . environments/.stackhpc/activate
# ansible login -v -a "sudo scontrol reboot ASAP nextstate=RESUME reason='rebuild image:${{ steps.packer_build.outputs.NEW_COMPUTE_IMAGE_ID }}' ${TF_VAR_cluster_name}-compute-[0-3]"
# ansible compute -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down
# ansible-playbook -v ansible/ci/check_slurm.yml
- name: Test reimage of login and control nodes (via rebuild adhoc)
run: |
. venv/bin/activate
. environments/.stackhpc/activate
ansible-playbook -v --limit control,login ansible/adhoc/rebuild.yml
ansible all -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down
ansible-playbook -v ansible/site.yml
ansible-playbook -v ansible/ci/check_slurm.yml
- name: Check sacct state survived reimage
run: |
. venv/bin/activate
. environments/.stackhpc/activate
ansible-playbook -vv ansible/ci/check_sacct_hpctests.yml
- name: Check MPI-based tests are shown in Grafana
run: |
. venv/bin/activate
. environments/.stackhpc/activate
ansible-playbook -vv ansible/ci/check_grafana.yml
- name: Delete infrastructure
run: |
. venv/bin/activate
. environments/.stackhpc/activate
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
terraform destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars"
if: ${{ success() || cancelled() }}
# - name: Delete images
# run: |
# . venv/bin/activate
# . environments/.stackhpc/activate
# ansible-playbook -vv ansible/ci/delete_images.yml