Skip to content

Commit ae584bc

Browse files
committed
feat: add Docker healthcheck and auto-restart for Celery workers
Implements healthcheck scripts, worker protections (max-tasks-per-child, max-memory-per-child), and autoheal container for automatic recovery of stuck workers.
1 parent e3b9711 commit ae584bc

9 files changed

Lines changed: 183 additions & 4 deletions

File tree

compose/local/django/Dockerfile

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@ RUN apt-get update && apt-get install --no-install-recommends -y \
4141
libpq-dev \
4242
# Translations dependencies
4343
gettext \
44+
# healthcheck dependencies
45+
procps \
4446
# cleaning up unused files
4547
&& apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false \
4648
&& rm -rf /var/lib/apt/lists/*
@@ -74,6 +76,10 @@ COPY ./compose/local/django/celery/flower/start /start-flower
7476
RUN sed -i 's/\r$//g' /start-flower
7577
RUN chmod +x /start-flower
7678

79+
# Copy celery scripts directory for healthcheck
80+
COPY ./compose/local/django/celery /celery
81+
# Strip Windows line endings first (as is done for /start-flower) so the script's shebang stays valid
RUN sed -i 's/\r$//g' /celery/healthcheck.sh
RUN chmod +x /celery/healthcheck.sh
82+
7783

7884
# copy application code to WORKDIR
7985
COPY . ${APP_HOME}
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
#!/bin/bash
2+
#
3+
# Celery Worker Healthcheck Script
4+
#
5+
# This script checks that a Celery worker process exists and that the Redis broker is reachable; it does not ping the worker itself.
6+
# It uses two checks:
7+
# 1. Process check - is celery worker process running?
8+
# 2. Redis connectivity - can we connect to the broker?
9+
#
10+
# When used with the autoheal container, unhealthy workers will be
11+
# automatically restarted.
12+
13+
set -e
14+
15+
# Check 1: Is the celery worker process running?
16+
if ! pgrep -f "celery.*worker" > /dev/null 2>&1; then
17+
echo "ERROR: Celery worker process not found" >&2
18+
exit 1
19+
fi
20+
21+
# Check 2: Can we connect to Redis (the broker)?
22+
# Use redis-cli if available, otherwise skip
23+
if command -v redis-cli > /dev/null 2>&1; then
24+
# CELERY_BROKER_URL is expected to be a full URL (e.g. redis://redis:6379/0), so pass it via -u instead of as a -h hostname
if ! redis-cli -u "${CELERY_BROKER_URL:-redis://redis:6379/0}" ping > /dev/null 2>&1; then
25+
echo "ERROR: Cannot connect to Redis broker" >&2
26+
exit 1
27+
fi
28+
fi
29+
30+
# All checks passed
31+
exit 0

compose/local/django/celery/worker/start

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,26 @@
33
set -o errexit
44
set -o nounset
55

6+
# Local development with auto-reload and optional debugging
7+
#
8+
# CELERY_DEBUG=1 - Enable debugpy for remote debugging on port 5678
9+
# CELERY_NO_RELOAD=1 - Disable watchfiles auto-reload
10+
#
11+
# Worker protections (same as production):
12+
# --max-tasks-per-child=50 - Restart after 50 tasks (prevents memory leaks)
13+
# --max-memory-per-child=4000000 - Restart if memory exceeds 4GB
614

7-
exec watchfiles --filter python celery.__main__.main --args '-A config.celery_app worker -l INFO'
15+
# Check if debugging is enabled
16+
if [ "${CELERY_DEBUG:-0}" = "1" ]; then
17+
echo "Starting Celery worker with debugpy on port 5678..."
18+
exec python -m debugpy --listen 0.0.0.0:5678 -m celery -A config.celery_app worker -l INFO --max-tasks-per-child=50 --max-memory-per-child=4000000
19+
fi
20+
21+
# Check if auto-reload should be disabled
22+
if [ "${CELERY_NO_RELOAD:-0}" = "1" ]; then
23+
echo "Starting Celery worker without auto-reload..."
24+
exec celery -A config.celery_app worker -l INFO --max-tasks-per-child=50 --max-memory-per-child=4000000
25+
else
26+
echo "Starting Celery worker with watchfiles auto-reload..."
27+
exec watchfiles --filter python celery.__main__.main --args '-A config.celery_app worker -l INFO --max-tasks-per-child=50 --max-memory-per-child=4000000'
28+
fi

compose/production/django/Dockerfile

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@ RUN apt-get update && apt-get install --no-install-recommends -y \
4545
libpq-dev \
4646
# Translations dependencies
4747
gettext \
48+
# healthcheck dependencies
49+
procps \
4850
# cleaning up unused files
4951
&& apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false \
5052
&& rm -rf /var/lib/apt/lists/*
@@ -81,6 +83,11 @@ RUN sed -i 's/\r$//g' /start-flower
8183
RUN chmod +x /start-flower
8284

8385

86+
# Copy celery scripts directory for healthcheck
87+
COPY --chown=django:django ./compose/production/django/celery /celery
88+
# Strip Windows line endings first (as is done for /start-flower) so the script's shebang stays valid
RUN sed -i 's/\r$//g' /celery/healthcheck.sh
RUN chmod +x /celery/healthcheck.sh
89+
90+
8491
# copy application code to WORKDIR
8592
COPY --chown=django:django . ${APP_HOME}
8693

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
#!/bin/bash
2+
#
3+
# Celery Worker Healthcheck Script (Production)
4+
#
5+
# This script checks that a Celery worker process exists and that the Redis broker is reachable; it does not ping the worker itself.
6+
# It uses two checks:
7+
# 1. Process check - is celery worker process running?
8+
# 2. Redis connectivity - can we connect to the broker?
9+
#
10+
# When used with the autoheal container, unhealthy workers will be
11+
# automatically restarted.
12+
13+
set -e
14+
15+
# Check 1: Is the celery worker process running?
16+
if ! pgrep -f "celery.*worker" > /dev/null 2>&1; then
17+
echo "ERROR: Celery worker process not found" >&2
18+
exit 1
19+
fi
20+
21+
# Check 2: Can we connect to Redis (the broker)?
22+
# Use redis-cli if available, otherwise skip
23+
if command -v redis-cli > /dev/null 2>&1; then
24+
# CELERY_BROKER_URL is expected to be a full URL (e.g. redis://redis:6379/0), so pass it via -u instead of as a -h hostname
if ! redis-cli -u "${CELERY_BROKER_URL:-redis://redis:6379/0}" ping > /dev/null 2>&1; then
25+
echo "ERROR: Cannot connect to Redis broker" >&2
26+
exit 1
27+
fi
28+
fi
29+
30+
# All checks passed
31+
exit 0

compose/production/django/celery/worker/start

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,23 @@ set -o errexit
44
set -o pipefail
55
set -o nounset
66

7-
exec newrelic-admin run-program celery -A config.celery_app worker -l INFO
7+
# Celery worker with built-in protections against stuck/leaking workers:
8+
#
9+
# --max-tasks-per-child=50
10+
# Restart worker process after 50 tasks to prevent memory leaks
11+
# Conservative value since ML tasks can be memory-intensive
12+
#
13+
# --max-memory-per-child=4000000
14+
# Restart worker if memory exceeds 4GB (4,000,000 KB)
15+
# Prevents runaway memory consumption from large images/models
16+
#
17+
# These options work in conjunction with the Docker healthcheck:
18+
# - Healthcheck detects MISSING worker processes and an unreachable broker (it does not ping the worker itself)
19+
# - These options prevent RESOURCE LEAKS (memory/task buildup)
20+
# - Autoheal restarts UNHEALTHY containers
21+
# - restart:always brings containers back after any exit
22+
23+
exec newrelic-admin run-program celery -A config.celery_app worker \
24+
-l INFO \
25+
--max-tasks-per-child=50 \
26+
--max-memory-per-child=4000000

docker-compose.production.yml

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,12 +29,28 @@ services:
2929
ports: []
3030
command: /start-celeryworker
3131
restart: always
32+
healthcheck:
33+
test: ["CMD-SHELL", "/celery/healthcheck.sh"]
34+
interval: 30s # Check every 30 seconds
35+
timeout: 15s # A single healthcheck run counts as failed if the script takes longer than 15s
36+
retries: 3 # Mark unhealthy after 3 consecutive failures (90s total)
37+
start_period: 60s # Grace period during container startup
38+
labels:
39+
- "autoheal=true" # Enable autoheal to restart this container when unhealthy
3240

3341
celerybeat:
3442
<<: *django
3543
ports: []
3644
command: /start-celerybeat
3745
restart: always
46+
healthcheck:
47+
# Bracket the first letter so the pattern cannot match the `sh -c` wrapper whose command line contains this very string
test: ["CMD-SHELL", "pgrep -f '[c]elery.*beat' > /dev/null || exit 1"]
48+
interval: 60s # Beat is less critical, check every minute
49+
timeout: 10s
50+
retries: 3
51+
start_period: 30s
52+
labels:
53+
- "autoheal=true"
3854

3955
flower:
4056
<<: *django
@@ -44,6 +60,25 @@ services:
4460
restart: always
4561
volumes:
4662
- ./data/flower/:/data/
63+
healthcheck:
64+
test: ["CMD-SHELL", "curl -f http://localhost:5555/ || exit 1"]
65+
interval: 30s
66+
timeout: 5s
67+
retries: 3
68+
start_period: 30s
69+
labels:
70+
- "autoheal=true"
71+
72+
autoheal:
73+
image: willfarrell/autoheal:latest
74+
container_name: ami_production_autoheal
75+
restart: always
76+
environment:
77+
- AUTOHEAL_CONTAINER_LABEL=autoheal
78+
- AUTOHEAL_INTERVAL=10 # Check container health every 10 seconds
79+
- AUTOHEAL_START_PERIOD=60 # NOTE(review): appears to delay autoheal's own first check by 60s after autoheal starts, not a per-container grace period; confirm against the autoheal README
80+
volumes:
81+
- /var/run/docker.sock:/var/run/docker.sock
4782

4883
awscli:
4984
build:

docker-compose.worker.yml

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,3 +25,22 @@ services:
2525
ports: []
2626
command: /start-celeryworker
2727
restart: always
28+
healthcheck:
29+
test: ["CMD-SHELL", "/celery/healthcheck.sh"]
30+
interval: 30s # Check every 30 seconds
31+
timeout: 15s # A single healthcheck run counts as failed if the script takes longer than 15s
32+
retries: 3 # Mark unhealthy after 3 consecutive failures (90s total)
33+
start_period: 60s # Grace period during container startup
34+
labels:
35+
- "autoheal=true" # Enable autoheal to restart this container when unhealthy
36+
37+
autoheal:
38+
image: willfarrell/autoheal:latest
39+
container_name: ami_worker_autoheal
40+
restart: always
41+
environment:
42+
- AUTOHEAL_CONTAINER_LABEL=autoheal
43+
- AUTOHEAL_INTERVAL=10 # Check container health every 10 seconds
44+
- AUTOHEAL_START_PERIOD=60 # NOTE(review): appears to delay autoheal's own first check by 60s after autoheal starts, not a per-container grace period; confirm against the autoheal README
45+
volumes:
46+
- /var/run/docker.sock:/var/run/docker.sock

docker-compose.yml

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,11 +90,21 @@ services:
9090
<<: *django
9191
image: ami_local_celeryworker
9292
scale: 1
93-
# For remote debugging with debugpy, should get overridden for production
93+
# For remote debugging with debugpy, set CELERY_DEBUG=1 in environment
94+
# To disable watchfiles auto-reload, set CELERY_NO_RELOAD=1
9495
# Also make sure to install debugpy in your requirements/local.txt
9596
ports:
9697
- "5678:5678"
97-
command: python -m debugpy --listen 0.0.0.0:5678 -m celery -A config.celery_app worker -l INFO
98+
# environment:
99+
# - CELERY_DEBUG=1
100+
# - CELERY_NO_RELOAD=1
101+
command: /start-celeryworker
102+
healthcheck:
103+
test: ["CMD-SHELL", "/celery/healthcheck.sh"]
104+
interval: 30s # Check every 30 seconds
105+
timeout: 15s # A single healthcheck run counts as failed if the script takes longer than 15s
106+
retries: 3 # Mark unhealthy after 3 consecutive failures (90s total)
107+
start_period: 60s # Grace period during container startup
98108

99109
celerybeat:
100110
<<: *django

0 commit comments

Comments
 (0)