---
# One-command deployment for the whole stack.
#
# cp deploy/.env.example deploy/.env # set HF_TOKEN / GPUs
# docker compose -f deploy/docker-compose.yaml up -d --build (or: make up)
#
# Topology:
# backend [GPU] — dashboard API (:5000); spawns vLLM subprocesses on :800x.
# router — OpenAI-compatible router (:8887); shares the backend's
# network namespace so it can reach those localhost vLLM ports.
# frontend — nginx serving the SPA + reverse-proxying /api and /v1.
#
# Build context is the repo root (..) so the Dockerfiles can COPY apps/ + packages/.
services:
  # Dashboard API on :5000, started with uvicorn and given NVIDIA GPU access.
  # The router service joins this container's network namespace, so the
  # router's port (8887) has to be published from here as well.
  backend:
    build:
      # Repo root, so the Dockerfile can COPY apps/ and packages/.
      context: ..
      dockerfile: deploy/engine.Dockerfile
    image: llmops-engine:latest
    container_name: llmops-backend
    working_dir: /app/apps/backend
    # Use the uvicorn console script (not `python -m`): the vllm base image
    # only ships `python3`, not a `python` symlink.
    command: uvicorn main:app --host 0.0.0.0 --port 5000
    env_file: .env
    environment:
      # The backend reads LLM_ROUTER_SERVER_CONFIG_PATH; the router reads
      # CONFIG_PATH. Both point at the same bind-mounted config file.
      - LLM_ROUTER_SERVER_CONFIG_PATH=/app/packages/config-schema/config.yaml
      - LLMOPS_DB_PATH=/app/data/llmops.db
      - LLMOPS_OVERLAY_PATH=/app/data/dynamic_models.json
      - HF_HOME=/hf
      # Defaults to every GPU; override in .env to restrict the visible set.
      - NVIDIA_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES:-all}
    volumes:
      # Edit models without rebuilding: the canonical config is bind-mounted.
      - ../packages/config-schema/config.yaml:/app/packages/config-schema/config.yaml
      # Shared SQLite database + dynamic-model overlay.
      - llmops-data:/app/data
      # HF cache bind-mounted to a host dir so weights are browsable locally
      # and shared with a host-side HF cache. HF_HOME=/hf -> /hf/hub == <dir>/hub.
      - ${HF_CACHE_DIR:-${HOME}/.cache/huggingface}:/hf
      - /etc/localtime:/etc/localtime:ro
    ports:
      # Host ports are configurable to avoid collisions; container ports are
      # fixed (internal routing + nginx target them by name).
      # Dashboard backend API:
      - "${BACKEND_PORT:-5000}:5000"
      # Router (lives in this network namespace, so it is mapped here):
      - "${ROUTER_PORT:-8887}:8887"
    # vLLM/torch need a large /dev/shm.
    shm_size: "16gb"
    deploy:
      resources:
        reservations:
          devices:
            # Request GPU access through the NVIDIA driver.
            - driver: nvidia
              capabilities: [gpu]
    restart: unless-stopped

  # OpenAI-compatible router on :8887, served by gunicorn. It reuses the image
  # built by the backend service instead of declaring its own build.
  router:
    image: llmops-engine:latest
    container_name: llmops-router
    depends_on:
      - backend
    # Share the backend's network namespace -> localhost:800x reaches the vLLM
    # subprocesses the backend spawned. (Cannot declare its own ports; 8887 is
    # published on the backend service above.)
    network_mode: "service:backend"
    working_dir: /app/apps/router-server
    command: gunicorn src.llm_router.main:app -c configs/gunicorn.conf.py
    env_file: .env
    environment:
      - CONFIG_PATH=/app/packages/config-schema/config.yaml
      - LLMOPS_DB_PATH=/app/data/llmops.db
      - LLMOPS_OVERLAY_PATH=/app/data/dynamic_models.json
      - PYTHONPATH=/app/apps/router-server
    volumes:
      # Same config file and data volume as the backend.
      - ../packages/config-schema/config.yaml:/app/packages/config-schema/config.yaml
      - llmops-data:/app/data
      - /etc/localtime:/etc/localtime:ro
    restart: unless-stopped

  # nginx serving the SPA and reverse-proxying /api and /v1; container port 80
  # is published on a configurable host port.
  frontend:
    build:
      context: ..
      dockerfile: deploy/frontend.Dockerfile
    image: llmops-frontend:latest
    container_name: llmops-frontend
    depends_on:
      - backend
    ports:
      - "${FRONTEND_PORT:-8884}:80"
    restart: unless-stopped

# Named volume mounted at /app/data by both the backend and the router; it
# holds the SQLite database and the dynamic-model overlay they share.
volumes:
  llmops-data: {}