Support and test "re-imageable" compute nodes via compute node metadata #1901
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Test deployment and reimage on OpenStack | |
on: | |
workflow_dispatch: | |
push: | |
branches: | |
- main | |
paths: | |
- '**' | |
- '!dev/**' | |
- 'dev/setup-env.sh' | |
- '!docs/**' | |
- '!README.md' | |
- '!.gitignore' | |
- '!.github/workflows/' | |
- '.github/workflows/stackhpc' | |
pull_request: | |
paths: | |
- '**' | |
- '!dev/**' | |
- 'dev/setup-env.sh' | |
- '!docs/**' | |
- '!README.md' | |
- '!.gitignore' | |
- '!.github/workflows/' | |
- '.github/workflows/stackhpc' | |
jobs: | |
openstack: | |
name: openstack-ci | |
concurrency: | |
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }} # to branch/PR + OS | |
cancel-in-progress: true | |
runs-on: ubuntu-22.04 | |
strategy: | |
fail-fast: false # allow other matrix jobs to continue even if one fails | |
matrix: | |
os_version: | |
- RL8 | |
- RL9 | |
env: | |
ANSIBLE_FORCE_COLOR: True | |
OS_CLOUD: openstack | |
TF_VAR_cluster_name: slurmci-${{ matrix.os_version }}-${{ github.run_number }} | |
CI_CLOUD: ${{ vars.CI_CLOUD }} # default from repo settings | |
TF_VAR_os_version: ${{ matrix.os_version }} | |
steps: | |
- uses: actions/checkout@v2 | |
- name: Override CI_CLOUD if PR label is present | |
if: ${{ github.event_name == 'pull_request' }} | |
run: | | |
# Iterate over the labels | |
labels=$(echo '${{ toJSON(github.event.pull_request.labels) }}' | jq -r '.[].name') | |
echo $labels | |
for label in $labels; do | |
if [[ $label == CI_CLOUD=* ]]; then | |
# Extract the value after 'CI_CLOUD=' | |
CI_CLOUD_OVERRIDE=${label#CI_CLOUD=} | |
echo "CI_CLOUD=${CI_CLOUD_OVERRIDE}" >> $GITHUB_ENV | |
fi | |
done | |
- name: Record settings for CI cloud | |
run: | | |
echo CI_CLOUD: ${{ env.CI_CLOUD }} | |
- name: Setup ssh | |
run: | | |
set -x | |
mkdir ~/.ssh | |
echo "${{ secrets[format('{0}_SSH_KEY', env.CI_CLOUD)] }}" > ~/.ssh/id_rsa | |
chmod 0600 ~/.ssh/id_rsa | |
shell: bash | |
- name: Add bastion's ssh key to known_hosts | |
run: cat environments/.stackhpc/bastion_fingerprints >> ~/.ssh/known_hosts | |
shell: bash | |
- name: Install ansible etc | |
run: dev/setup-env.sh | |
- name: Install OpenTofu | |
uses: opentofu/setup-opentofu@v1 | |
with: | |
tofu_version: 1.6.2 | |
- name: Initialise terraform | |
run: terraform init | |
working-directory: ${{ github.workspace }}/environments/.stackhpc/terraform | |
- name: Write clouds.yaml | |
run: | | |
mkdir -p ~/.config/openstack/ | |
echo "${{ secrets[format('{0}_CLOUDS_YAML', env.CI_CLOUD)] }}" > ~/.config/openstack/clouds.yaml | |
shell: bash | |
- name: Setup environment-specific inventory/terraform inputs | |
run: | | |
. venv/bin/activate | |
. environments/.stackhpc/activate | |
ansible-playbook ansible/adhoc/generate-passwords.yml | |
echo vault_demo_user_password: "$DEMO_USER_PASSWORD" > $APPLIANCES_ENVIRONMENT_ROOT/inventory/group_vars/all/test_user.yml | |
env: | |
DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} | |
- name: Provision nodes using fat image | |
id: provision_servers | |
run: | | |
. venv/bin/activate | |
. environments/.stackhpc/activate | |
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform | |
terraform apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" | |
- name: Delete infrastructure if provisioning failed | |
run: | | |
. venv/bin/activate | |
. environments/.stackhpc/activate | |
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform | |
terraform destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" | |
if: failure() && steps.provision_servers.outcome == 'failure' | |
- name: Configure cluster | |
run: | | |
. venv/bin/activate | |
. environments/.stackhpc/activate | |
ansible all -m wait_for_connection | |
ansible-playbook -v ansible/site.yml | |
ansible-playbook -v ansible/ci/check_slurm.yml | |
- name: Run MPI-based tests | |
run: | | |
. venv/bin/activate | |
. environments/.stackhpc/activate | |
ansible-playbook -vv ansible/adhoc/hpctests.yml | |
# - name: Run EESSI tests | |
# run: | | |
# . venv/bin/activate | |
# . environments/.stackhpc/activate | |
# ansible-playbook -vv ansible/ci/check_eessi.yml | |
- name: Confirm Open Ondemand is up (via SOCKS proxy) | |
run: | | |
. venv/bin/activate | |
. environments/.stackhpc/activate | |
# load ansible variables into shell: | |
ansible-playbook ansible/ci/output_vars.yml \ | |
-e output_vars_hosts=openondemand \ | |
-e output_vars_path=$APPLIANCES_ENVIRONMENT_ROOT/vars.txt \ | |
-e output_vars_items=bastion_ip,bastion_user,openondemand_servername | |
source $APPLIANCES_ENVIRONMENT_ROOT/vars.txt | |
# setup ssh proxying: | |
sudo apt-get --yes install proxychains | |
echo proxychains installed | |
ssh -v -fN -D 9050 ${bastion_user}@${bastion_ip} | |
echo port 9050 forwarded | |
# check OOD server returns 200: | |
statuscode=$(proxychains wget \ | |
--quiet \ | |
--spider \ | |
--server-response \ | |
--no-check-certificate \ | |
--http-user=demo_user \ | |
--http-password=${DEMO_USER_PASSWORD} https://${openondemand_servername} \ | |
2>&1) | |
(echo $statuscode | grep "200 OK") || (echo $statuscode && exit 1) | |
env: | |
DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} | |
- name: Test reimage of login and control nodes (via rebuild adhoc) | |
run: | | |
. venv/bin/activate | |
. environments/.stackhpc/activate | |
ansible-playbook -v --limit control,login ansible/adhoc/rebuild.yml | |
ansible-playbook -v ansible/site.yml | |
ansible-playbook -v ansible/ci/check_slurm.yml | |
- name: Test reimage of compute nodes and compute-init (via rebuild adhoc) | |
run: | | |
. venv/bin/activate | |
. environments/.stackhpc/activate | |
ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml | |
ansible-playbook -v ansible/ci/check_slurm.yml | |
- name: Pause for debugging | |
run: sleep 1800 | |
- name: Check sacct state survived reimage | |
run: | | |
. venv/bin/activate | |
. environments/.stackhpc/activate | |
ansible-playbook -vv ansible/ci/check_sacct_hpctests.yml | |
- name: Check MPI-based tests are shown in Grafana | |
run: | | |
. venv/bin/activate | |
. environments/.stackhpc/activate | |
ansible-playbook -vv ansible/ci/check_grafana.yml | |
- name: Delete infrastructure | |
run: | | |
. venv/bin/activate | |
. environments/.stackhpc/activate | |
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform | |
terraform destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" | |
if: ${{ success() || cancelled() }} | |
# - name: Delete images | |
# run: | | |
# . venv/bin/activate | |
# . environments/.stackhpc/activate | |
# ansible-playbook -vv ansible/ci/delete_images.yml |