From 76d02f7bae22185ca1e6481157d78ff503039f2a Mon Sep 17 00:00:00 2001
From: Johan Bloemberg
Date: Mon, 21 Oct 2024 16:35:59 +0200
Subject: [PATCH] Refactor worker restart to prevent issues with periodic tests

---
Review notes (this section is ignored by `git am`):
- restart_nassl_worker / restart_worker: use "$worker" instead of "$$worker".
  In sh, `$$` expands to the PID of the shell, so the previous form passed
  "<pid>worker" to docker and aborted on the first `docker stop` (set -e).
- restart_slow_worker: old containers are now stopped with `docker stop`
  (TERM first) and then removed, one argument per container, instead of a
  single quoted argument passed to `docker rm --force`.
- restart_slow_worker: old containers are listed with
  `docker compose ps --quiet <service>` instead of grepping container names,
  which failed under `set -e` when nothing matched.
- restart_slow_worker: WORKER_SLOW_REPLICAS must be set; the script now fails
  with a clear message instead of an arithmetic syntax error.
- NOTE(review): the `index` blob hashes were not recomputed after these
  changes, and the indentation / blank context lines of the
  docker-compose.yml hunks were reconstructed; verify against the target
  tree before applying.

 .../periodic/15min/restart_nassl_worker       |  8 +++++--
 .../periodic/daily/restart_slow_worker        | 31 +++++++++++++++++++
 .../periodic/hourly/restart_worker            |  8 +++++--
 docker/docker-compose.yml                     |  5 ++++
 4 files changed, 48 insertions(+), 4 deletions(-)
 create mode 100755 docker/cron-docker/periodic/daily/restart_slow_worker

diff --git a/docker/cron-docker/periodic/15min/restart_nassl_worker b/docker/cron-docker/periodic/15min/restart_nassl_worker
index 1293befa5..66e5ee227 100755
--- a/docker/cron-docker/periodic/15min/restart_nassl_worker
+++ b/docker/cron-docker/periodic/15min/restart_nassl_worker
@@ -1,4 +1,8 @@
 #!/bin/sh
 set -e
-# find nassl worker and restart the container(s)
-docker ps --filter label=com.docker.compose.service=worker-nassl --quiet | xargs --no-run-if-empty docker restart
+# stop and start worker one at a time to ensure (batch) tasks are still being picked up
+# workers are sent a TERM signal, followed by a KILL if still running after the 10 minute grace period
+for worker in $(docker ps --filter label=com.docker.compose.service=worker-nassl --quiet); do
+  docker stop "$worker"
+  docker start "$worker"
+done
diff --git a/docker/cron-docker/periodic/daily/restart_slow_worker b/docker/cron-docker/periodic/daily/restart_slow_worker
new file mode 100755
index 000000000..a289a6656
--- /dev/null
+++ b/docker/cron-docker/periodic/daily/restart_slow_worker
@@ -0,0 +1,31 @@
+#!/bin/sh
+
+# restart slow worker every day to prevent slow memory leaks
+# as the slow worker can run very long tasks (eg: report generation)
+# we first start a new container before stopping the previous one
+
+set -e
+
+cd /opt/Internet.nl
+
+SERVICE=worker-slow
+REPLICAS=${WORKER_SLOW_REPLICAS:?WORKER_SLOW_REPLICAS must be set}
+COMPOSE_CMD="docker compose --env-file=docker/defaults.env --env-file=docker/host.env --env-file=docker/local.env"
+
+# IDs of the currently running containers of this service (empty if none)
+OLD_CONTAINERS=$($COMPOSE_CMD ps --quiet "$SERVICE")
+
+# bring up new containers, wait until healthy
+$COMPOSE_CMD up --no-deps --no-recreate --wait --scale="$SERVICE=$((REPLICAS * 2))" "$SERVICE"
+
+# graceful shutdown (TERM first, KILL only after the stop timeout) and remove old containers
+if [ -n "$OLD_CONTAINERS" ]; then
+  # unquoted on purpose: each container ID must be passed as a separate argument
+  # shellcheck disable=SC2086
+  docker stop $OLD_CONTAINERS
+  # shellcheck disable=SC2086
+  docker rm $OLD_CONTAINERS
+fi
+
+# restore replica number to original
+$COMPOSE_CMD scale "$SERVICE=$REPLICAS"
diff --git a/docker/cron-docker/periodic/hourly/restart_worker b/docker/cron-docker/periodic/hourly/restart_worker
index 580e1b50e..2707185cf 100755
--- a/docker/cron-docker/periodic/hourly/restart_worker
+++ b/docker/cron-docker/periodic/hourly/restart_worker
@@ -1,4 +1,8 @@
 #!/bin/sh
 set -e
-# find worker and restart the container(s)
-docker ps --filter label=com.docker.compose.service=worker --quiet | xargs --no-run-if-empty docker restart
+# stop and start worker one at a time to ensure (batch) tasks are still being picked up
+# workers are sent a TERM signal, followed by a KILL if still running after the 10 minute grace period
+for worker in $(docker ps --filter label=com.docker.compose.service=worker --quiet); do
+  docker stop "$worker"
+  docker start "$worker"
+done
diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml
index 69c0ef70b..2c9b5641c 100644
--- a/docker/docker-compose.yml
+++ b/docker/docker-compose.yml
@@ -249,6 +249,8 @@ services:
     # time after which a SIGKILL is sent to celery after a SIGTERM (warm shutdown), default 10s
     # insufficient short grace period causes issues on batch when tasks are killed during the hourly worker restart
     stop_grace_period: 10m
+    # SIGTERM is default, but make it explicit
+    stop_signal: SIGTERM

     depends_on:
       db-migrate:
@@ -735,6 +737,9 @@ services:
     command: crond -f -d7 -c /etc/crontabs-docker
     environment:
       - AUTO_UPDATE_TO
+      - WORKER_SLOW_REPLICAS
+      - WORKER_REPLICAS
+      - RELEASE

     restart: unless-stopped
     logging: