File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -41,6 +41,8 @@ RUN apt-get update && apt-get install --no-install-recommends -y \
4141 libpq-dev \
4242 # Translations dependencies
4343 gettext \
44+ # healthcheck dependencies
45+ procps \
4446 # cleaning up unused files
4547 && apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false \
4648 && rm -rf /var/lib/apt/lists/*
@@ -74,6 +76,10 @@ COPY ./compose/local/django/celery/flower/start /start-flower
7476RUN sed -i 's/\r $//g' /start-flower
7577RUN chmod +x /start-flower
7678
# Copy celery scripts directory for healthcheck
COPY ./compose/local/django/celery /celery
# Strip Windows (CRLF) line endings, as is done for the other scripts in this
# image. A trailing \r on the shebang line would make the healthcheck fail to
# execute, so the container would be reported unhealthy forever.
RUN sed -i 's/\r$//g' /celery/healthcheck.sh
RUN chmod +x /celery/healthcheck.sh
82+
7783
7884# copy application code to WORKDIR
7985COPY . ${APP_HOME}
Original file line number Diff line number Diff line change 1+ #! /bin/bash
#
# Celery Worker Healthcheck Script
#
# Exits 0 when the worker looks healthy and 1 otherwise. Two checks are made:
#   1. Process check - does a process whose command line matches
#      "celery.*worker" exist?
#   2. Broker connectivity - if redis-cli is installed and the broker is
#      Redis, does the broker answer PING?
#
# Neither check proves the worker is actually consuming tasks; they only
# detect a missing worker process or an unreachable broker.
#
# When used with the autoheal container, unhealthy workers will be
# automatically restarted.
#
# Environment:
#   CELERY_BROKER_URL  Broker URL (redis:// or rediss://) or a bare hostname.
#                      Falls back to the host "redis" when unset or empty.

set -e

# Check 1: Is the celery worker process running?
if ! pgrep -f "celery.*worker" >/dev/null 2>&1; then
  echo "ERROR: Celery worker process not found" >&2
  exit 1
fi

# Check 2: Can we connect to Redis (the broker)?
# Use redis-cli if available, otherwise skip.
if command -v redis-cli >/dev/null 2>&1; then
  broker="${CELERY_BROKER_URL:-redis}"

  case "${broker}" in
    redis://* | rediss://*)
      # redis-cli -h expects a bare hostname; a full URL must be passed via -u.
      redis_args=(-u "${broker}")
      ;;
    *://*)
      # The broker is not Redis (e.g. amqp://), so a Redis PING is meaningless.
      redis_args=()
      ;;
    *)
      redis_args=(-h "${broker}")
      ;;
  esac

  if ((${#redis_args[@]})) && ! redis-cli "${redis_args[@]}" ping >/dev/null 2>&1; then
    echo "ERROR: Cannot connect to Redis broker" >&2
    exit 1
  fi
fi

# All checks passed
exit 0
Original file line number Diff line number Diff line change 33set -o errexit
44set -o nounset
55
# Local development with auto-reload and optional debugging
#
#   CELERY_DEBUG=1     - Enable debugpy for remote debugging on port 5678.
#                        Takes precedence over CELERY_NO_RELOAD.
#   CELERY_NO_RELOAD=1 - Disable watchfiles auto-reload
#
# Worker protections (same as production):
#   --max-tasks-per-child=50        - Restart after 50 tasks (prevents memory leaks)
#   --max-memory-per-child=4000000  - Restart if memory exceeds 4GB

# Single definition of the worker command line, shared by every launch mode so
# the modes cannot drift apart. Positional parameters are used (rather than a
# bash array) so this works under any POSIX shell.
set -- \
  -A config.celery_app worker \
  -l INFO \
  --max-tasks-per-child=50 \
  --max-memory-per-child=4000000

# Check if debugging is enabled
if [ "${CELERY_DEBUG:-0}" = "1" ]; then
  echo "Starting Celery worker with debugpy on port 5678..."
  exec python -m debugpy --listen 0.0.0.0:5678 -m celery "$@"
fi

# Check if auto-reload should be disabled
if [ "${CELERY_NO_RELOAD:-0}" = "1" ]; then
  echo "Starting Celery worker without auto-reload..."
  exec celery "$@"
fi

echo "Starting Celery worker with watchfiles auto-reload..."
# watchfiles receives the target's arguments as one space-separated string,
# which is exactly what "$*" produces with the default IFS.
exec watchfiles --filter python celery.__main__.main --args "$*"
Original file line number Diff line number Diff line change @@ -45,6 +45,8 @@ RUN apt-get update && apt-get install --no-install-recommends -y \
4545 libpq-dev \
4646 # Translations dependencies
4747 gettext \
48+ # healthcheck dependencies
49+ procps \
4850 # cleaning up unused files
4951 && apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false \
5052 && rm -rf /var/lib/apt/lists/*
@@ -81,6 +83,11 @@ RUN sed -i 's/\r$//g' /start-flower
8183RUN chmod +x /start-flower
8284
8385
# Copy celery scripts directory for healthcheck
COPY --chown=django:django ./compose/production/django/celery /celery
# Strip Windows (CRLF) line endings, as is done for the other scripts in this
# image. A trailing \r on the shebang line would make the healthcheck fail to
# execute, so the container would be reported unhealthy forever.
RUN sed -i 's/\r$//g' /celery/healthcheck.sh
RUN chmod +x /celery/healthcheck.sh
89+
90+
8491# copy application code to WORKDIR
8592COPY --chown=django:django . ${APP_HOME}
8693
Original file line number Diff line number Diff line change 1+ #! /bin/bash
#
# Celery Worker Healthcheck Script (Production)
#
# Exits 0 when the worker looks healthy and 1 otherwise. Two checks are made:
#   1. Process check - does a process whose command line matches
#      "celery.*worker" exist?
#   2. Broker connectivity - if redis-cli is installed and the broker is
#      Redis, does the broker answer PING?
#
# Neither check proves the worker is actually consuming tasks; they only
# detect a missing worker process or an unreachable broker.
#
# When used with the autoheal container, unhealthy workers will be
# automatically restarted.
#
# Environment:
#   CELERY_BROKER_URL  Broker URL (redis:// or rediss://) or a bare hostname.
#                      Falls back to the host "redis" when unset or empty.

set -e

# Check 1: Is the celery worker process running?
if ! pgrep -f "celery.*worker" >/dev/null 2>&1; then
  echo "ERROR: Celery worker process not found" >&2
  exit 1
fi

# Check 2: Can we connect to Redis (the broker)?
# Use redis-cli if available, otherwise skip.
if command -v redis-cli >/dev/null 2>&1; then
  broker="${CELERY_BROKER_URL:-redis}"

  case "${broker}" in
    redis://* | rediss://*)
      # redis-cli -h expects a bare hostname; a full URL must be passed via -u.
      redis_args=(-u "${broker}")
      ;;
    *://*)
      # The broker is not Redis (e.g. amqp://), so a Redis PING is meaningless.
      redis_args=()
      ;;
    *)
      redis_args=(-h "${broker}")
      ;;
  esac

  if ((${#redis_args[@]})) && ! redis-cli "${redis_args[@]}" ping >/dev/null 2>&1; then
    echo "ERROR: Cannot connect to Redis broker" >&2
    exit 1
  fi
fi

# All checks passed
exit 0
Original file line number Diff line number Diff line change @@ -4,4 +4,23 @@ set -o errexit
44set -o pipefail
55set -o nounset
66
7- exec newrelic-admin run-program celery -A config.celery_app worker -l INFO
7+ # Celery worker with built-in protections against stuck/leaking workers:
8+ #
9+ # --max-tasks-per-child=50
10+ # Restart worker process after 50 tasks to prevent memory leaks
11+ # Conservative value since ML tasks can be memory-intensive
12+ #
13+ # --max-memory-per-child=4000000
14+ # Restart worker if memory exceeds 4GB (4,000,000 KB)
15+ # Prevents runaway memory consumption from large images/models
16+ #
17+ # These options work in conjunction with the Docker healthcheck:
18+ # - Healthcheck detects a MISSING worker process or an unreachable broker
19+ # - These options prevent RESOURCE LEAKS (memory/task buildup)
20+ # - Autoheal restarts UNHEALTHY containers
21+ # - restart:always brings containers back after any exit
22+
# Collect the worker command line as positional parameters, one flag per line,
# then hand the whole list to celery under the New Relic wrapper.
set -- \
  -A config.celery_app worker \
  -l INFO \
  --max-tasks-per-child=50 \
  --max-memory-per-child=4000000

# exec replaces this shell, so celery receives the container's signals directly.
exec newrelic-admin run-program celery "$@"
Original file line number Diff line number Diff line change @@ -29,12 +29,28 @@ services:
2929 ports : []
3030 command : /start-celeryworker
3131 restart : always
32+ healthcheck :
33+ test : ["CMD-SHELL", "/celery/healthcheck.sh"]
34+ interval : 30s # Check every 30 seconds
35+ timeout : 15s # Healthcheck script (pgrep + optional redis-cli ping) must complete within 15s
36+ retries : 3 # Mark unhealthy after 3 consecutive failures (90s total)
37+ start_period : 60s # Grace period during container startup
38+ labels :
39+ - " autoheal=true" # Enable autoheal to restart this container when unhealthy
3240
celerybeat:
  <<: *django
  ports: []
  command: /start-celerybeat
  restart: always
  healthcheck:
    # CMD-SHELL runs this under `sh -c`, so the shell's own command line
    # contains the pattern text. A plain 'celery.*beat' would match that shell
    # and the check could never fail. The bracket form '[c]elery' matches the
    # same processes but not its own literal text.
    test: ["CMD-SHELL", "pgrep -f '[c]elery.*beat' > /dev/null || exit 1"]
    interval: 60s # Beat is less critical, check every minute
    timeout: 10s
    retries: 3
    start_period: 30s
  labels:
    - "autoheal=true"
3854
3955 flower :
4056 << : *django
@@ -44,6 +60,25 @@ services:
4460 restart : always
4561 volumes :
4662 - ./data/flower/:/data/
63+ healthcheck :
64+ test : ["CMD-SHELL", "curl -f http://localhost:5555/ || exit 1"]
65+ interval : 30s
66+ timeout : 5s
67+ retries : 3
68+ start_period : 30s
69+ labels :
70+ - " autoheal=true"
71+
autoheal:
  # Sidecar that restarts containers carrying the "autoheal" label once their
  # Docker healthcheck reports them unhealthy.
  image: willfarrell/autoheal:latest
  container_name: ami_production_autoheal
  restart: always
  environment:
    # Label that opts a container in to monitoring
    AUTOHEAL_CONTAINER_LABEL: "autoheal"
    # Seconds between health polls
    AUTOHEAL_INTERVAL: "10"
    # Seconds to leave a freshly started container alone
    AUTOHEAL_START_PERIOD: "60"
  volumes:
    # Docker socket access is required to inspect and restart containers
    - /var/run/docker.sock:/var/run/docker.sock
4782
4883 awscli :
4984 build :
Original file line number Diff line number Diff line change @@ -25,3 +25,22 @@ services:
2525 ports : []
2626 command : /start-celeryworker
2727 restart : always
28+ healthcheck :
29+ test : ["CMD-SHELL", "/celery/healthcheck.sh"]
30+ interval : 30s # Check every 30 seconds
31+ timeout : 15s # Healthcheck script (pgrep + optional redis-cli ping) must complete within 15s
32+ retries : 3 # Mark unhealthy after 3 consecutive failures (90s total)
33+ start_period : 60s # Grace period during container startup
34+ labels :
35+ - " autoheal=true" # Enable autoheal to restart this container when unhealthy
36+
autoheal:
  # Sidecar that restarts containers carrying the "autoheal" label once their
  # Docker healthcheck reports them unhealthy.
  image: willfarrell/autoheal:latest
  container_name: ami_worker_autoheal
  restart: always
  environment:
    # Label that opts a container in to monitoring
    AUTOHEAL_CONTAINER_LABEL: "autoheal"
    # Seconds between health polls
    AUTOHEAL_INTERVAL: "10"
    # Seconds to leave a freshly started container alone
    AUTOHEAL_START_PERIOD: "60"
  volumes:
    # Docker socket access is required to inspect and restart containers
    - /var/run/docker.sock:/var/run/docker.sock
Original file line number Diff line number Diff line change @@ -90,11 +90,21 @@ services:
9090 << : *django
9191 image : ami_local_celeryworker
9292 scale : 1
93- # For remote debugging with debugpy, should get overridden for production
93+ # For remote debugging with debugpy, set CELERY_DEBUG=1 in environment
94+ # To disable watchfiles auto-reload, set CELERY_NO_RELOAD=1
9495 # Also make sure to install debugpy in your requirements/local.txt
9596 ports :
9697 - " 5678:5678"
97- command : python -m debugpy --listen 0.0.0.0:5678 -m celery -A config.celery_app worker -l INFO
98+ # environment:
99+ # - CELERY_DEBUG=1
100+ # - CELERY_NO_RELOAD=1
101+ command : /start-celeryworker
102+ healthcheck :
103+ test : ["CMD-SHELL", "/celery/healthcheck.sh"]
104+ interval : 30s # Check every 30 seconds
105+ timeout : 15s # Healthcheck script (pgrep + optional redis-cli ping) must complete within 15s
106+ retries : 3 # Mark unhealthy after 3 consecutive failures (90s total)
107+ start_period : 60s # Grace period during container startup
98108
99109 celerybeat :
100110 << : *django
You can’t perform that action at this time.
0 commit comments