# Source: LLMSystems/LLM-Router-Server-Dashboard, deploy/docker-compose.yaml (main branch)
#
# One-command deployment for the whole stack.
#
#   cp deploy/.env.example deploy/.env   # set HF_TOKEN / GPUs
#   docker compose -f deploy/docker-compose.yaml up -d --build   (or: make up)
#
# Topology:
#   backend  [GPU] — dashboard API (:5000); spawns vLLM subprocesses on :800x.
#   router         — OpenAI-compatible router (:8887); shares the backend's
#                    network namespace so it can reach those localhost vLLM ports.
#   frontend       — nginx serving the SPA + reverse-proxying /api and /v1.
#
# Build context is the repo root (..) so the Dockerfiles can COPY apps/ + packages/.

services:
  # Dashboard API container: runs uvicorn on :5000 with a GPU reservation.
  # Per the topology notes at the top of this file, it is also the process
  # that spawns the vLLM subprocesses on :800x.
  backend:
    image: llmops-engine:latest
    container_name: llmops-backend
    build:
      # Context is the repo root (one level above deploy/).
      context: ..
      dockerfile: deploy/engine.Dockerfile
    working_dir: /app/apps/backend
    # Start via the `uvicorn` console script rather than `python -m`: the vllm
    # base image provides `python3` but no `python` symlink.
    command:
      - 'uvicorn'
      - 'main:app'
      - '--host'
      - '0.0.0.0'
      - '--port'
      - '5000'
    env_file:
      - .env
    environment:
      # The backend looks up LLM_ROUTER_SERVER_CONFIG_PATH, whereas the router
      # service uses CONFIG_PATH for the same file.
      LLM_ROUTER_SERVER_CONFIG_PATH: /app/packages/config-schema/config.yaml
      LLMOPS_DB_PATH: /app/data/llmops.db
      LLMOPS_OVERLAY_PATH: /app/data/dynamic_models.json
      HF_HOME: /hf
      # NOTE(review): the device reservation under `deploy` sets neither
      # `count` nor `device_ids`; confirm this variable still restricts GPU
      # visibility the way deploy/.env intends.
      NVIDIA_VISIBLE_DEVICES: '${NVIDIA_VISIBLE_DEVICES:-all}'
    volumes:
      # The canonical model config is bind-mounted, so models can be edited
      # without rebuilding the image.
      - '../packages/config-schema/config.yaml:/app/packages/config-schema/config.yaml'
      # Named volume holding the shared SQLite DB and dynamic-model overlay.
      - 'llmops-data:/app/data'
      # Hugging Face cache lives in a host directory, so weights are browsable
      # locally and shared with a host-side HF cache. With HF_HOME=/hf, the
      # container's /hf/hub is <host dir>/hub.
      - '${HF_CACHE_DIR:-${HOME}/.cache/huggingface}:/hf'
      - '/etc/localtime:/etc/localtime:ro'
    ports:
      # Only the host side is configurable (to dodge port collisions); the
      # container side stays fixed because internal routing and nginx target
      # those ports by name.
      # Dashboard backend API.
      - '${BACKEND_PORT:-5000}:5000'
      # Router: it runs inside this service's network namespace, so its port
      # has to be published here.
      - '${ROUTER_PORT:-8887}:8887'
    # vLLM/torch need a large /dev/shm.
    shm_size: '16gb'
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities:
                - gpu
    restart: unless-stopped

  # OpenAI-compatible router, served by gunicorn. It declares no build of its
  # own and runs the same `llmops-engine:latest` image the backend service
  # builds.
  router:
    image: llmops-engine:latest
    container_name: llmops-router
    # Joining the backend's network namespace makes localhost:800x reach the
    # vLLM subprocesses the backend spawned. A service in this mode cannot
    # declare its own ports, so 8887 is published on the backend service.
    network_mode: 'service:backend'
    depends_on:
      backend:
        condition: service_started
    working_dir: /app/apps/router-server
    command:
      - 'gunicorn'
      - 'src.llm_router.main:app'
      - '-c'
      - 'configs/gunicorn.conf.py'
    env_file:
      - .env
    environment:
      # Same config file the backend mounts; the router reads it via CONFIG_PATH.
      CONFIG_PATH: /app/packages/config-schema/config.yaml
      LLMOPS_DB_PATH: /app/data/llmops.db
      LLMOPS_OVERLAY_PATH: /app/data/dynamic_models.json
      PYTHONPATH: /app/apps/router-server
    volumes:
      - '../packages/config-schema/config.yaml:/app/packages/config-schema/config.yaml'
      # Same named volume as the backend, so both see one DB and overlay file.
      - 'llmops-data:/app/data'
      - '/etc/localtime:/etc/localtime:ro'
    restart: unless-stopped

  # Web front end. Per the topology notes at the top of this file: nginx
  # serving the SPA and reverse-proxying /api and /v1.
  frontend:
    image: llmops-frontend:latest
    container_name: llmops-frontend
    build:
      # Context is the repo root (one level above deploy/).
      context: ..
      dockerfile: deploy/frontend.Dockerfile
    depends_on:
      backend:
        condition: service_started
    ports:
      # Host port is configurable; the container always listens on 80.
      - '${FRONTEND_PORT:-8884}:80'
    restart: unless-stopped

# Named volumes. `llmops-data` is mounted at /app/data by both the backend and
# router services; the empty mapping means no volume options are set.
volumes:
  llmops-data: {}