Skip to content

Commit 1001ca9

Browse files
feat(presidio): build & own combined analyzer+anonymizer image
Replace the stock mcr.microsoft.com/presidio-* sidecar images with a single image we build and push to ECR/GHCR. A thin FastAPI service constructs one AnalyzerEngine + one AnonymizerEngine at startup and serves both on port 3000 (/health, /supportedentities, /analyze, /anonymize) so the app needs one PRESIDIO_URL. English only; pinned presidio 2.2.362 + en_core_web_lg 3.8.0. Bakes in the native check-digit VIN recognizer and registers 12 English recognizers Presidio ships but does not load by default (UK_NINO, AU_*, IN_*, SG_*), taking the supported English set from 19 to 32.
1 parent 707c3cc commit 1001ca9

5 files changed

Lines changed: 226 additions & 2 deletions

File tree

.github/workflows/ci.yml

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,8 @@ jobs:
8888
ecr_repo_secret: ECR_MIGRATIONS
8989
- dockerfile: ./docker/realtime.Dockerfile
9090
ecr_repo_secret: ECR_REALTIME
91+
- dockerfile: ./docker/presidio.Dockerfile
92+
ecr_repo_secret: ECR_PRESIDIO
9193
steps:
9294
- name: Checkout code
9395
uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
@@ -115,7 +117,7 @@ jobs:
115117
id: ecr-repo
116118
run: echo "name=$ECR_REPO" >> $GITHUB_OUTPUT
117119
env:
118-
ECR_REPO: ${{ matrix.ecr_repo_secret == 'ECR_APP' && secrets.ECR_APP || matrix.ecr_repo_secret == 'ECR_MIGRATIONS' && secrets.ECR_MIGRATIONS || matrix.ecr_repo_secret == 'ECR_REALTIME' && secrets.ECR_REALTIME || '' }}
120+
ECR_REPO: ${{ matrix.ecr_repo_secret == 'ECR_APP' && secrets.ECR_APP || matrix.ecr_repo_secret == 'ECR_MIGRATIONS' && secrets.ECR_MIGRATIONS || matrix.ecr_repo_secret == 'ECR_REALTIME' && secrets.ECR_REALTIME || matrix.ecr_repo_secret == 'ECR_PRESIDIO' && secrets.ECR_PRESIDIO || '' }}
119121

120122
- name: Build and push
121123
uses: useblacksmith/build-push-action@fb9e3e6a9299c78462bfadd0d93352c316adc9b8 # v2
@@ -153,6 +155,9 @@ jobs:
153155
- dockerfile: ./docker/realtime.Dockerfile
154156
ghcr_image: ghcr.io/simstudioai/realtime
155157
ecr_repo_secret: ECR_REALTIME
158+
- dockerfile: ./docker/presidio.Dockerfile
159+
ghcr_image: ghcr.io/simstudioai/presidio
160+
ecr_repo_secret: ECR_PRESIDIO
156161
steps:
157162
- name: Checkout code
158163
uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6
@@ -188,7 +193,7 @@ jobs:
188193
id: ecr-repo
189194
run: echo "name=$ECR_REPO" >> $GITHUB_OUTPUT
190195
env:
191-
ECR_REPO: ${{ matrix.ecr_repo_secret == 'ECR_APP' && secrets.ECR_APP || matrix.ecr_repo_secret == 'ECR_MIGRATIONS' && secrets.ECR_MIGRATIONS || matrix.ecr_repo_secret == 'ECR_REALTIME' && secrets.ECR_REALTIME || '' }}
196+
ECR_REPO: ${{ matrix.ecr_repo_secret == 'ECR_APP' && secrets.ECR_APP || matrix.ecr_repo_secret == 'ECR_MIGRATIONS' && secrets.ECR_MIGRATIONS || matrix.ecr_repo_secret == 'ECR_REALTIME' && secrets.ECR_REALTIME || matrix.ecr_repo_secret == 'ECR_PRESIDIO' && secrets.ECR_PRESIDIO || '' }}
192197

193198
- name: Generate tags
194199
id: meta
@@ -251,6 +256,8 @@ jobs:
251256
image: ghcr.io/simstudioai/migrations
252257
- dockerfile: ./docker/realtime.Dockerfile
253258
image: ghcr.io/simstudioai/realtime
259+
- dockerfile: ./docker/presidio.Dockerfile
260+
image: ghcr.io/simstudioai/presidio
254261

255262
steps:
256263
- name: Checkout code
@@ -306,6 +313,7 @@ jobs:
306313
- image: ghcr.io/simstudioai/simstudio
307314
- image: ghcr.io/simstudioai/migrations
308315
- image: ghcr.io/simstudioai/realtime
316+
- image: ghcr.io/simstudioai/presidio
309317

310318
steps:
311319
- name: Login to GHCR

.github/workflows/images.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,9 @@ jobs:
2626
- dockerfile: ./docker/realtime.Dockerfile
2727
ghcr_image: ghcr.io/simstudioai/realtime
2828
ecr_repo_secret: ECR_REALTIME
29+
- dockerfile: ./docker/presidio.Dockerfile
30+
ghcr_image: ghcr.io/simstudioai/presidio
31+
ecr_repo_secret: ECR_PRESIDIO
2932
outputs:
3033
registry: ${{ steps.login-ecr.outputs.registry }}
3134

@@ -114,6 +117,8 @@ jobs:
114117
image: ghcr.io/simstudioai/migrations
115118
- dockerfile: ./docker/realtime.Dockerfile
116119
image: ghcr.io/simstudioai/realtime
120+
- dockerfile: ./docker/presidio.Dockerfile
121+
image: ghcr.io/simstudioai/presidio
117122

118123
steps:
119124
- name: Checkout code
@@ -157,6 +162,7 @@ jobs:
157162
- image: ghcr.io/simstudioai/simstudio
158163
- image: ghcr.io/simstudioai/migrations
159164
- image: ghcr.io/simstudioai/realtime
165+
- image: ghcr.io/simstudioai/presidio
160166

161167
steps:
162168
- name: Login to GHCR

docker/presidio.Dockerfile

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
# ========================================
# Combined Presidio service (analyzer + anonymizer) on a single port (3000)
# ========================================
FROM python:3.12-slim-bookworm AS base

WORKDIR /app

# The base image ships an apt hook (docker-clean) that deletes downloaded
# packages after every install, which would leave the apt cache mounts below
# empty. Disable it so the mounts actually speed up rebuilds.
RUN rm -f /etc/apt/apt.conf.d/docker-clean && \
    echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' \
      > /etc/apt/apt.conf.d/keep-cache

# build-essential for any sdist that compiles native deps (e.g. blis/thinc).
# curl is also used by the spaCy model download and the HEALTHCHECK below.
# No `rm -rf /var/lib/apt/lists/*`: /var/lib/apt is a cache mount, so its
# contents never land in the image layer and deleting them only defeats the
# cache.
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt,sharing=locked \
    apt-get update && apt-get install -y --no-install-recommends \
      build-essential curl ca-certificates

# Pinned Python deps. Kept in their own layer so edits to server.py don't
# trigger a reinstall. pip's cache lives in the BuildKit cache mount (not in
# the image), so --no-cache-dir is deliberately NOT passed.
COPY docker/presidio/requirements.txt ./requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements.txt

# Pinned English spaCy model. Downloaded with retries/resume (the wheel is
# ~400MB and truncates on flaky networks if pip fetches the URL directly).
# Also its own layer, so source edits don't re-download the heavy model.
ARG SPACY_MODEL_VERSION=3.8.0
RUN --mount=type=cache,target=/root/.cache/pip \
    MODEL_WHL="en_core_web_lg-${SPACY_MODEL_VERSION}-py3-none-any.whl" && \
    curl -fL --retry 5 --retry-delay 5 --retry-all-errors -C - \
      -o "/tmp/${MODEL_WHL}" \
      "https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-${SPACY_MODEL_VERSION}/${MODEL_WHL}" && \
    pip install "/tmp/${MODEL_WHL}" && \
    rm "/tmp/${MODEL_WHL}"

COPY docker/presidio/server.py ./server.py

# Run as an unprivileged user with a fixed uid/gid.
RUN groupadd -g 1001 presidio && \
    useradd -u 1001 -g presidio presidio && \
    chown -R presidio:presidio /app
USER presidio

EXPOSE 3000

HEALTHCHECK --interval=30s --timeout=5s --start-period=40s --retries=3 \
  CMD curl -fsS http://localhost:3000/health || exit 1

CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "3000"]

docker/presidio/requirements.txt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Pinned for reproducible image builds. Bump deliberately.
2+
presidio-analyzer==2.2.362
3+
presidio-anonymizer==2.2.362
4+
spacy==3.8.14
5+
fastapi==0.138.0
6+
uvicorn[standard]==0.49.0
7+
8+
# The English spaCy model (en_core_web_lg, ~400MB) is fetched + pinned in the
9+
# Dockerfile via curl-with-retry rather than here — a direct pip wheel URL
10+
# truncates on flaky networks and fails wheel validation.

docker/presidio/server.py

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
"""Combined Presidio REST service: analyzer + anonymizer on one port.
2+
3+
Constructs one warm AnalyzerEngine (with a native check-digit VIN recognizer)
4+
and one AnonymizerEngine at startup, exposing stock-compatible endpoints so a
5+
single PRESIDIO_URL serves both. English only.
6+
"""
7+
8+
from typing import Any
9+
10+
from fastapi import Body, FastAPI
11+
from presidio_analyzer import AnalyzerEngine, Pattern, PatternRecognizer, RecognizerResult
12+
from presidio_analyzer.predefined_recognizers import (
13+
AuAbnRecognizer,
14+
AuAcnRecognizer,
15+
AuMedicareRecognizer,
16+
AuTfnRecognizer,
17+
InAadhaarRecognizer,
18+
InPanRecognizer,
19+
InPassportRecognizer,
20+
InVehicleRegistrationRecognizer,
21+
InVoterRecognizer,
22+
SgFinRecognizer,
23+
SgUenRecognizer,
24+
UkNinoRecognizer,
25+
)
26+
from presidio_anonymizer import AnonymizerEngine
27+
from presidio_anonymizer.entities import OperatorConfig
28+
29+
# English-capable predefined recognizers Presidio ships but does NOT load by
# default (UK_NINO, AU_*, IN_*, SG_*). es/it/pl/fi/th/ko recognizers are
# language-locked and excluded — this image is English only.
# Entries are classes, not instances: build_analyzer() calls each one with no
# arguments when it registers it.
EXTRA_RECOGNIZERS = [
    UkNinoRecognizer,
    AuAbnRecognizer,
    AuAcnRecognizer,
    AuTfnRecognizer,
    AuMedicareRecognizer,
    InPanRecognizer,
    InAadhaarRecognizer,
    InVehicleRegistrationRecognizer,
    InVoterRecognizer,
    InPassportRecognizer,
    SgFinRecognizer,
    SgUenRecognizer,
]
46+
47+
48+
class VinRecognizer(PatternRecognizer):
    """Recognizer for 17-character VINs, gated by the ISO 3779 check digit.

    A candidate (A-Z/0-9 without I, O and Q) is accepted only when the
    character at position 9 equals the check digit computed from the other
    positions. That makes accidental hits on arbitrary 17-char codes (request
    ids, SKUs, tokens) extremely unlikely. Some non-North-American VINs omit
    the check digit and are therefore skipped — an intentional bias toward
    precision.
    """

    # Character -> numeric value for the weighted sum. I, O and Q have no
    # entry, so any candidate containing them is rejected during lookup.
    _TRANSLIT = {
        **{str(digit): digit for digit in range(10)},
        **dict(zip("ABCDEFGH", range(1, 9))),
        **dict(zip("JKLMN", range(1, 6))),
        "P": 7,
        "R": 9,
        **dict(zip("STUVWXYZ", range(2, 10))),
    }
    # Per-position weights; index 8 (the check digit itself) carries weight 0.
    _WEIGHTS = [8, 7, 6, 5, 4, 3, 2, 10, 0, 9, 8, 7, 6, 5, 4, 3, 2]

    def validate_result(self, pattern_text: str):
        """Return True when ``pattern_text`` carries a valid VIN check digit.

        The text is upper-cased first. Anything that is not exactly 17
        characters, or that contains a character without a transliteration
        value, yields False.
        """
        candidate = pattern_text.upper()
        if len(candidate) != 17:
            return False

        weighted_sum = 0
        for char, weight in zip(candidate, self._WEIGHTS):
            value = self._TRANSLIT.get(char)
            if value is None:
                return False
            weighted_sum += value * weight

        # A remainder of 10 is written as the letter X.
        remainder = weighted_sum % 11
        check_char = "X" if remainder == 10 else str(remainder)
        return candidate[8] == check_char
75+
76+
77+
def build_analyzer() -> AnalyzerEngine:
    """Create the AnalyzerEngine and register the additional recognizers.

    Starts from a default ``AnalyzerEngine()``, adds the check-digit
    ``VinRecognizer`` for the ``VIN`` entity, then adds one instance of every
    class listed in ``EXTRA_RECOGNIZERS``.

    Returns:
        The configured AnalyzerEngine.
    """
    engine = AnalyzerEngine()
    registry = engine.registry

    vin_recognizer = VinRecognizer(
        supported_entity="VIN",
        patterns=[
            Pattern(name="vin", regex=r"\b[A-HJ-NPR-Z0-9]{17}\b", score=0.7),
        ],
        context=["vin", "vehicle", "chassis"],
    )
    registry.add_recognizer(vin_recognizer)

    for extra_cls in EXTRA_RECOGNIZERS:
        registry.add_recognizer(extra_cls())

    return engine
90+
91+
92+
# Both engines are constructed once, at import time, and shared by every
# request handler below.
analyzer = build_analyzer()
anonymizer = AnonymizerEngine()

# docs_url / redoc_url are None, so the interactive documentation routes are
# not served.
app = FastAPI(docs_url=None, redoc_url=None, title="Sim Presidio")
96+
97+
98+
@app.get("/health")
def health() -> dict[str, str]:
    """Liveness endpoint: always answers with a static ``{"status": "ok"}``."""
    status_body = {"status": "ok"}
    return status_body
101+
102+
103+
@app.get("/supportedentities")
def supported_entities(language: str = "en") -> list[str]:
    """List the entity types the shared analyzer supports for ``language``.

    Args:
        language: Language code taken from the query string; defaults to "en".
    """
    entity_names = analyzer.get_supported_entities(language)
    return entity_names
106+
107+
108+
@app.post("/analyze")
def analyze(payload: dict[str, Any] = Body(...)) -> list[dict[str, Any]]:
    """Run the shared analyzer over ``payload["text"]``.

    Recognized payload keys:
        text: Required string to scan.
        language: Optional language code; missing, null or empty means "en".
        entities: Optional list of entity types; missing or empty means all.
        score_threshold: Optional minimum score, forwarded unchanged.
        return_decision_process: Optional flag, forwarded (default False).

    Returns:
        One dict per finding, produced by ``RecognizerResult.to_dict()``.

    Raises:
        HTTPException: 422 when ``text`` is missing or not a string. Previously
            this surfaced as an unhandled KeyError, i.e. an opaque HTTP 500.
    """
    # Function-scope import: the module-level fastapi import only brings in
    # Body and FastAPI.
    from fastapi import HTTPException

    text = payload.get("text")
    if not isinstance(text, str):
        raise HTTPException(
            status_code=422,
            detail="'text' is required and must be a string",
        )

    results = analyzer.analyze(
        text=text,
        # `or "en"` also covers an explicit null / empty language.
        language=payload.get("language") or "en",
        # An empty list is normalized to None so it means "all entities".
        entities=payload.get("entities") or None,
        score_threshold=payload.get("score_threshold"),
        return_decision_process=payload.get("return_decision_process", False),
    )
    return [r.to_dict() for r in results]
119+
120+
121+
def _unprocessable(detail: str) -> Exception:
    """Build an HTTP 422 error carrying ``detail`` for the client."""
    # Function-scope import: the module-level fastapi import only brings in
    # Body and FastAPI.
    from fastapi import HTTPException

    return HTTPException(status_code=422, detail=detail)


def _parse_analyzer_results(raw_results: Any) -> list[RecognizerResult]:
    """Convert the request's ``analyzer_results`` into RecognizerResult objects."""
    if raw_results is None:
        # Missing or explicit null means "nothing to anonymize".
        return []
    if not isinstance(raw_results, list):
        raise _unprocessable("'analyzer_results' must be a list")

    parsed = []
    for index, raw in enumerate(raw_results):
        if not isinstance(raw, dict) or any(
            key not in raw for key in ("entity_type", "start", "end")
        ):
            raise _unprocessable(
                f"analyzer_results[{index}] must be an object with "
                "'entity_type', 'start' and 'end'"
            )
        parsed.append(
            RecognizerResult(
                entity_type=raw["entity_type"],
                start=raw["start"],
                end=raw["end"],
                score=raw.get("score", 1.0),
            )
        )
    return parsed


def _parse_operators(raw_operators: Any) -> dict[str, OperatorConfig] | None:
    """Convert the request's operator mapping into OperatorConfig objects."""
    if not raw_operators:
        # None lets the anonymizer engine fall back to its own defaults.
        return None
    if not isinstance(raw_operators, dict):
        raise _unprocessable("'anonymizers' must be an object keyed by entity type")

    operators = {}
    for entity, raw_cfg in raw_operators.items():
        if not isinstance(raw_cfg, dict) or "type" not in raw_cfg:
            raise _unprocessable(
                f"anonymizer config for '{entity}' must be an object with a 'type'"
            )
        # Copy before popping so the caller's payload is not mutated.
        params = dict(raw_cfg)
        operators[entity] = OperatorConfig(params.pop("type"), params)
    return operators


@app.post("/anonymize")
def anonymize(payload: dict[str, Any] = Body(...)) -> dict[str, Any]:
    """Anonymize ``payload["text"]`` using the supplied analyzer findings.

    Recognized payload keys:
        text: Required string to anonymize.
        analyzer_results: Optional list of objects with ``entity_type``,
            ``start``, ``end`` and an optional ``score`` (default 1.0).
        anonymizers / operators: Optional mapping of entity type to an operator
            config object whose ``type`` names the operator; the remaining
            keys are passed on as its parameters. ``anonymizers`` wins when
            both are present and non-empty.

    Returns:
        ``{"text": ..., "items": [...]}`` where each item reports the operator,
        entity type, start, end and replacement text.

    Raises:
        HTTPException: 422 for a missing/non-string ``text`` or malformed
            ``analyzer_results`` / operator entries. These previously surfaced
            as unhandled KeyError/TypeError/AttributeError, i.e. HTTP 500.
    """
    text = payload.get("text")
    if not isinstance(text, str):
        raise _unprocessable("'text' is required and must be a string")

    result = anonymizer.anonymize(
        text=text,
        analyzer_results=_parse_analyzer_results(payload.get("analyzer_results")),
        operators=_parse_operators(
            payload.get("anonymizers") or payload.get("operators")
        ),
    )
    return {
        "text": result.text,
        "items": [
            {
                "operator": item.operator,
                "entity_type": item.entity_type,
                "start": item.start,
                "end": item.end,
                "text": item.text,
            }
            for item in result.items
        ],
    }

0 commit comments

Comments
 (0)