feat(presidio): build & own combined analyzer+anonymizer image

TheodoreSpeaks · TheodoreSpeaks · commit 1001ca933448 · 2026-06-22T19:18:49.000-07:00
Replace the stock mcr.microsoft.com/presidio-* sidecar images with a single
image we build and push to ECR/GHCR. A thin FastAPI service constructs one
AnalyzerEngine + one AnonymizerEngine at startup and serves both on port 3000
(/health, /supportedentities, /analyze, /anonymize) so the app needs one
PRESIDIO_URL. English only; pinned presidio 2.2.362 + en_core_web_lg 3.8.0.

Bakes in the native check-digit VIN recognizer and registers 12 English
recognizers Presidio ships but does not load by default (UK_NINO, AU_*, IN_*,
SG_*), taking the supported English set from 19 to 32.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -88,6 +88,8 @@ jobs:
             ecr_repo_secret: ECR_MIGRATIONS
           - dockerfile: ./docker/realtime.Dockerfile
             ecr_repo_secret: ECR_REALTIME
+          - dockerfile: ./docker/presidio.Dockerfile
+            ecr_repo_secret: ECR_PRESIDIO
     steps:
       - name: Checkout code
         uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
@@ -115,7 +117,7 @@ jobs:
         id: ecr-repo
         run: echo "name=$ECR_REPO" >> $GITHUB_OUTPUT
         env:
-          ECR_REPO: ${{ matrix.ecr_repo_secret == 'ECR_APP' && secrets.ECR_APP || matrix.ecr_repo_secret == 'ECR_MIGRATIONS' && secrets.ECR_MIGRATIONS || matrix.ecr_repo_secret == 'ECR_REALTIME' && secrets.ECR_REALTIME || '' }}
+          ECR_REPO: ${{ matrix.ecr_repo_secret == 'ECR_APP' && secrets.ECR_APP || matrix.ecr_repo_secret == 'ECR_MIGRATIONS' && secrets.ECR_MIGRATIONS || matrix.ecr_repo_secret == 'ECR_REALTIME' && secrets.ECR_REALTIME || matrix.ecr_repo_secret == 'ECR_PRESIDIO' && secrets.ECR_PRESIDIO || '' }}
 
       - name: Build and push
         uses: useblacksmith/build-push-action@fb9e3e6a9299c78462bfadd0d93352c316adc9b8 # v2
@@ -153,6 +155,9 @@ jobs:
           - dockerfile: ./docker/realtime.Dockerfile
             ghcr_image: ghcr.io/simstudioai/realtime
             ecr_repo_secret: ECR_REALTIME
+          - dockerfile: ./docker/presidio.Dockerfile
+            ghcr_image: ghcr.io/simstudioai/presidio
+            ecr_repo_secret: ECR_PRESIDIO
     steps:
       - name: Checkout code
         uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6
@@ -188,7 +193,7 @@ jobs:
         id: ecr-repo
         run: echo "name=$ECR_REPO" >> $GITHUB_OUTPUT
         env:
-          ECR_REPO: ${{ matrix.ecr_repo_secret == 'ECR_APP' && secrets.ECR_APP || matrix.ecr_repo_secret == 'ECR_MIGRATIONS' && secrets.ECR_MIGRATIONS || matrix.ecr_repo_secret == 'ECR_REALTIME' && secrets.ECR_REALTIME || '' }}
+          ECR_REPO: ${{ matrix.ecr_repo_secret == 'ECR_APP' && secrets.ECR_APP || matrix.ecr_repo_secret == 'ECR_MIGRATIONS' && secrets.ECR_MIGRATIONS || matrix.ecr_repo_secret == 'ECR_REALTIME' && secrets.ECR_REALTIME || matrix.ecr_repo_secret == 'ECR_PRESIDIO' && secrets.ECR_PRESIDIO || '' }}
 
       - name: Generate tags
         id: meta
@@ -251,6 +256,8 @@ jobs:
             image: ghcr.io/simstudioai/migrations
           - dockerfile: ./docker/realtime.Dockerfile
             image: ghcr.io/simstudioai/realtime
+          - dockerfile: ./docker/presidio.Dockerfile
+            image: ghcr.io/simstudioai/presidio
 
     steps:
       - name: Checkout code
@@ -306,6 +313,7 @@ jobs:
           - image: ghcr.io/simstudioai/simstudio
           - image: ghcr.io/simstudioai/migrations
           - image: ghcr.io/simstudioai/realtime
+          - image: ghcr.io/simstudioai/presidio
 
     steps:
       - name: Login to GHCR
diff --git a/.github/workflows/images.yml b/.github/workflows/images.yml
@@ -26,6 +26,9 @@ jobs:
           - dockerfile: ./docker/realtime.Dockerfile
             ghcr_image: ghcr.io/simstudioai/realtime
             ecr_repo_secret: ECR_REALTIME
+          - dockerfile: ./docker/presidio.Dockerfile
+            ghcr_image: ghcr.io/simstudioai/presidio
+            ecr_repo_secret: ECR_PRESIDIO
     outputs:
       registry: ${{ steps.login-ecr.outputs.registry }}
 
@@ -114,6 +117,8 @@ jobs:
             image: ghcr.io/simstudioai/migrations
           - dockerfile: ./docker/realtime.Dockerfile
             image: ghcr.io/simstudioai/realtime
+          - dockerfile: ./docker/presidio.Dockerfile
+            image: ghcr.io/simstudioai/presidio
 
     steps:
       - name: Checkout code
@@ -157,6 +162,7 @@ jobs:
           - image: ghcr.io/simstudioai/simstudio
           - image: ghcr.io/simstudioai/migrations
           - image: ghcr.io/simstudioai/realtime
+          - image: ghcr.io/simstudioai/presidio
 
     steps:
       - name: Login to GHCR
diff --git a/docker/presidio.Dockerfile b/docker/presidio.Dockerfile
@@ -0,0 +1,44 @@
+# ========================================
+# Combined Presidio service (analyzer + anonymizer) on a single port (3000)
+# ========================================
+FROM python:3.12-slim-bookworm AS base
+
+WORKDIR /app
+
+# Debian-based images ship an apt hook (docker-clean) that deletes downloaded
+# packages after every install, which would leave the apt cache mounts below
+# empty. Disable it so the mounts actually persist between builds.
+RUN rm -f /etc/apt/apt.conf.d/docker-clean && \
+    echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' \
+      > /etc/apt/apt.conf.d/keep-cache
+
+# build-essential for any sdist that compiles native deps (e.g. blis/thinc);
+# curl is used for the model download and the HEALTHCHECK. The apt lists live
+# in the cache mount, not in the image layer, so nothing is removed afterwards.
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
+    --mount=type=cache,target=/var/lib/apt,sharing=locked \
+    apt-get update && apt-get install -y --no-install-recommends \
+    build-essential curl ca-certificates
+
+# Pinned Python deps. Separate layer so source edits don't reinstall them.
+# pip's cache lives in the BuildKit cache mount (never in the image layer), so
+# --no-cache-dir is deliberately omitted: passing it would bypass the mount.
+COPY docker/presidio/requirements.txt ./requirements.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -r requirements.txt
+
+# Pinned English spaCy model. Downloaded with retries/resume (the wheel is
+# ~400MB and truncates on flaky networks if pip fetches the URL directly).
+# It is installed from a local file, so no pip cache mount is needed here.
+ARG SPACY_MODEL_VERSION=3.8.0
+RUN MODEL_WHL="en_core_web_lg-${SPACY_MODEL_VERSION}-py3-none-any.whl" && \
+    curl -fL --retry 5 --retry-delay 5 --retry-all-errors -C - \
+      -o "/tmp/${MODEL_WHL}" \
+      "https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-${SPACY_MODEL_VERSION}/${MODEL_WHL}" && \
+    pip install --no-cache-dir "/tmp/${MODEL_WHL}" && \
+    rm "/tmp/${MODEL_WHL}"
+
+COPY docker/presidio/server.py ./server.py
+
+# Run as an unprivileged user that owns the application directory.
+RUN groupadd -g 1001 presidio && \
+    useradd -u 1001 -g presidio presidio && \
+    chown -R presidio:presidio /app
+USER presidio
+
+EXPOSE 3000
+
+# start-period gives the service time to build its engines before failed
+# probes start counting against the container.
+HEALTHCHECK --interval=30s --timeout=5s --start-period=40s --retries=3 \
+    CMD curl -fsS http://localhost:3000/health || exit 1
+
+CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "3000"]
diff --git a/docker/presidio/requirements.txt b/docker/presidio/requirements.txt
@@ -0,0 +1,10 @@
+# Pinned for reproducible image builds. Bump deliberately.
+presidio-analyzer==2.2.362
+presidio-anonymizer==2.2.362
+spacy==3.8.14
+fastapi==0.138.0
+uvicorn[standard]==0.49.0
+
+# The English spaCy model (en_core_web_lg, ~400MB) is fetched + pinned in the
+# Dockerfile via curl-with-retry rather than here — a direct pip wheel URL
+# truncates on flaky networks and fails wheel validation.
diff --git a/docker/presidio/server.py b/docker/presidio/server.py
@@ -0,0 +1,156 @@
+"""Combined Presidio REST service: analyzer + anonymizer on one port.
+
+Constructs one warm AnalyzerEngine (with a native check-digit VIN recognizer)
+and one AnonymizerEngine at startup, exposing stock-compatible endpoints so a
+single PRESIDIO_URL serves both. English only.
+"""
+
+from typing import Any
+
+from fastapi import Body, FastAPI, HTTPException
+from presidio_analyzer import AnalyzerEngine, Pattern, PatternRecognizer, RecognizerResult
+from presidio_analyzer.predefined_recognizers import (
+    AuAbnRecognizer,
+    AuAcnRecognizer,
+    AuMedicareRecognizer,
+    AuTfnRecognizer,
+    InAadhaarRecognizer,
+    InPanRecognizer,
+    InPassportRecognizer,
+    InVehicleRegistrationRecognizer,
+    InVoterRecognizer,
+    SgFinRecognizer,
+    SgUenRecognizer,
+    UkNinoRecognizer,
+)
+from presidio_anonymizer import AnonymizerEngine
+from presidio_anonymizer.entities import InvalidParamError, OperatorConfig
+
+# English-capable predefined recognizers Presidio ships but does NOT load by
+# default (UK_NINO, AU_*, IN_*, SG_*). es/it/pl/fi/th/ko recognizers are
+# language-locked and excluded — this image is English only.
+# Entries are recognizer classes, not instances: build_analyzer() calls each
+# one with no arguments and registers the result.
+EXTRA_RECOGNIZERS = [
+    UkNinoRecognizer,
+    AuAbnRecognizer,
+    AuAcnRecognizer,
+    AuTfnRecognizer,
+    AuMedicareRecognizer,
+    InPanRecognizer,
+    InAadhaarRecognizer,
+    InVehicleRegistrationRecognizer,
+    InVoterRecognizer,
+    InPassportRecognizer,
+    SgFinRecognizer,
+    SgUenRecognizer,
+]
+
+
+class VinRecognizer(PatternRecognizer):
+    """VIN (17 chars, A-Z/0-9 excluding I/O/Q) with ISO 3779 check-digit
+    validation (position 9). Validation makes accidental matches on arbitrary
+    17-char codes (request ids, SKUs, tokens) extremely unlikely. Some
+    non-North-American VINs omit the check digit and are skipped — an
+    intentional bias toward precision.
+    """
+
+    _TRANSLIT = {
+        **{str(d): d for d in range(10)},
+        "A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "H": 8,
+        "J": 1, "K": 2, "L": 3, "M": 4, "N": 5, "P": 7, "R": 9,
+        "S": 2, "T": 3, "U": 4, "V": 5, "W": 6, "X": 7, "Y": 8, "Z": 9,
+    }
+    _WEIGHTS = [8, 7, 6, 5, 4, 3, 2, 10, 0, 9, 8, 7, 6, 5, 4, 3, 2]
+
+    def validate_result(self, pattern_text: str):
+        vin = pattern_text.upper()
+        if len(vin) != 17:
+            return False
+        try:
+            total = sum(self._TRANSLIT[c] * w for c, w in zip(vin, self._WEIGHTS))
+        except KeyError:
+            return False
+        check = total % 11
+        expected = "X" if check == 10 else str(check)
+        return vin[8] == expected
+
+
+def build_analyzer() -> AnalyzerEngine:
+    analyzer = AnalyzerEngine()
+    vin_pattern = Pattern(name="vin", regex=r"\b[A-HJ-NPR-Z0-9]{17}\b", score=0.7)
+    analyzer.registry.add_recognizer(
+        VinRecognizer(
+            supported_entity="VIN",
+            patterns=[vin_pattern],
+            context=["vin", "vehicle", "chassis"],
+        )
+    )
+    for recognizer_cls in EXTRA_RECOGNIZERS:
+        analyzer.registry.add_recognizer(recognizer_cls())
+    return analyzer
+
+
+# Module-level singletons: built once when the module is imported and shared
+# by every request handler below.
+analyzer = build_analyzer()
+anonymizer = AnonymizerEngine()
+
+# docs_url/redoc_url are None, which turns off FastAPI's interactive docs pages.
+app = FastAPI(title="Sim Presidio", docs_url=None, redoc_url=None)
+
+
+@app.get("/health")
+def health() -> dict[str, str]:
+    return {"status": "ok"}
+
+
+@app.get("/supportedentities")
+def supported_entities(language: str = "en") -> list[str]:
+    return analyzer.get_supported_entities(language)
+
+
+@app.post("/analyze")
+def analyze(payload: dict[str, Any] = Body(...)) -> list[dict[str, Any]]:
+    entities = payload.get("entities") or None
+    results = analyzer.analyze(
+        text=payload["text"],
+        language=payload.get("language", "en"),
+        entities=entities,
+        score_threshold=payload.get("score_threshold"),
+        return_decision_process=payload.get("return_decision_process", False),
+    )
+    return [r.to_dict() for r in results]
+
+
+@app.post("/anonymize")
+def anonymize(payload: dict[str, Any] = Body(...)) -> dict[str, Any]:
+    analyzer_results = [
+        RecognizerResult(
+            entity_type=r["entity_type"],
+            start=r["start"],
+            end=r["end"],
+            score=r.get("score", 1.0),
+        )
+        for r in payload.get("analyzer_results", [])
+    ]
+    raw_operators = payload.get("anonymizers") or payload.get("operators")
+    operators = None
+    if raw_operators:
+        operators = {}
+        for entity, cfg in raw_operators.items():
+            cfg = dict(cfg)
+            operators[entity] = OperatorConfig(cfg.pop("type"), cfg)
+    result = anonymizer.anonymize(
+        text=payload["text"],
+        analyzer_results=analyzer_results,
+        operators=operators,
+    )
+    return {
+        "text": result.text,
+        "items": [
+            {
+                "operator": item.operator,
+                "entity_type": item.entity_type,
+                "start": item.start,
+                "end": item.end,
+                "text": item.text,
+            }
+            for item in result.items
+        ],
+    }