diff --git a/ansible/aws/README.md b/ansible/aws/README.md
index df8c50fb..dbdd03c9 100644
--- a/ansible/aws/README.md
+++ b/ansible/aws/README.md
@@ -57,6 +57,8 @@ Create and configure the instance:
 
 If you run more than one at a time, set a custom host name with `-e hostname=cockpit-aws-tasks-2` or similar, so that GitHub test statuses remain useful to identify where a test runs.
 
+There is also an "elastic" mode where the tasks bots keep running until the AMQP queue runs low. Use that for situations where AWS instances act as extra high-demand capacity instead of being the primary runners. Enable that mode with `-e idle_poweroff=1`.
+
 Webhook setup
 -------------
 AWS runs our primary webhook. Deploy or update it with:
diff --git a/ansible/roles/tasks-systemd/tasks/main.yml b/ansible/roles/tasks-systemd/tasks/main.yml
index 855c2fbc..d05afda7 100644
--- a/ansible/roles/tasks-systemd/tasks/main.yml
+++ b/ansible/roles/tasks-systemd/tasks/main.yml
@@ -146,4 +146,5 @@
     export INSTANCES={{ instances | default(1) }}
     export TEST_NOTIFICATION_MX={{ notification_mx | default('') }}
     export TEST_NOTIFICATION_TO={{ notification_to | default('') }}
+    export IDLE_POWEROFF={{ idle_poweroff | default('') }}
     /run/install-service
diff --git a/tasks/container/cockpit-tasks b/tasks/container/cockpit-tasks
index af3c22db..4b64393b 100755
--- a/tasks/container/cockpit-tasks
+++ b/tasks/container/cockpit-tasks
@@ -20,7 +20,17 @@ function update_bots() {
 }
 
 # wait between 1 and 10 minutes, with an override to speed up tests
+# in IDLE_POWEROFF mode, also check queue size
 function slumber() {
+    if [ -n "${IDLE_POWEROFF:-}" ]; then
+        # only consider job-runner entries, not statistics or webhook
+        NUM_JOBS=$(./inspect-queue | grep --count '"job":')
+        if [ "$NUM_JOBS" -lt 10 ]; then
+            echo "Job queue running low, exiting"
+            exit 100
+        fi
+    fi
+
     if [ -n "${SLUMBER:-}" ]; then
         sleep "$SLUMBER"
     else
diff --git a/tasks/install-service b/tasks/install-service
index 5de050d5..56200ab9 100755
--- a/tasks/install-service
+++ b/tasks/install-service
@@ -39,6 +39,9 @@ After=podman.socket
 [Service]
 Slice=cockpittasks.slice
 Restart=always
+# cockpit-tasks exits with 100 in IDLE_POWEROFF mode when queue is running low
+SuccessExitStatus=100
+RestartPreventExitStatus=100
 RestartSec=60
 # give image pull enough time
 TimeoutStartSec=10min
@@ -63,6 +66,7 @@ ExecStart=/usr/bin/podman run --name=cockpit-tasks-%i --hostname=${CONTAINER_HOS
     --env=GIT_AUTHOR_EMAIL=cockpituous@cockpit-project.org \
     --env=TEST_NOTIFICATION_MX=${TEST_NOTIFICATION_MX} \
     --env=TEST_NOTIFICATION_TO=${TEST_NOTIFICATION_TO} \
+    --env=IDLE_POWEROFF=${IDLE_POWEROFF:-} \
     ghcr.io/cockpit-project/tasks cockpit-tasks --verbose
 ExecStop=/usr/bin/podman rm -f cockpit-tasks-%i
 
@@ -70,6 +74,16 @@ ExecStop=/usr/bin/podman rm -f cockpit-tasks-%i
 WantedBy=multi-user.target
 EOF
 
+# mode for elastic cloud runners
+if [ -n "${IDLE_POWEROFF:-}" ]; then
+    mkdir -p /etc/systemd/system/cockpittasks.slice.d
+    cat <<EOF > /etc/systemd/system/cockpittasks.slice.d/poweroff.conf
+    [Unit]
+    StopWhenUnneeded=yes
+    SuccessAction=poweroff-immediate
+EOF
+fi
+
 systemctl daemon-reload
 
 for i in `seq $INSTANCES`; do systemctl enable --now cockpit-tasks@$i; done