|
| 1 | +"""Combined Presidio REST service: analyzer + anonymizer on one port. |
| 2 | +
|
| 3 | +Constructs one warm AnalyzerEngine (multi-language NLP + a native check-digit |
| 4 | +VIN recognizer) and one AnonymizerEngine at startup, exposing stock-compatible |
| 5 | +endpoints so a single PRESIDIO_URL serves both. |
| 6 | +""" |
| 7 | + |
from typing import Any

from fastapi import FastAPI, HTTPException
from presidio_analyzer import AnalyzerEngine, Pattern, PatternRecognizer, RecognizerResult
from presidio_analyzer.nlp_engine import NlpEngineProvider
from presidio_analyzer.predefined_recognizers import (
    AuAbnRecognizer,
    AuAcnRecognizer,
    AuMedicareRecognizer,
    AuTfnRecognizer,
    EsNieRecognizer,
    EsNifRecognizer,
    FiPersonalIdentityCodeRecognizer,
    InAadhaarRecognizer,
    InPanRecognizer,
    InPassportRecognizer,
    InVehicleRegistrationRecognizer,
    InVoterRecognizer,
    ItDriverLicenseRecognizer,
    ItFiscalCodeRecognizer,
    ItIdentityCardRecognizer,
    ItPassportRecognizer,
    ItVatCodeRecognizer,
    PlPeselRecognizer,
    SgFinRecognizer,
    SgUenRecognizer,
    UkNinoRecognizer,
)
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import InvalidParamError, OperatorConfig
from pydantic import BaseModel
| 39 | + |
# spaCy pipelines backing the analyzer, one per served language. Every model
# named here must be installed in the image. The language-specific national-id
# recognizers registered via EXTRA_RECOGNIZERS (ES_NIF, IT_FISCAL_CODE,
# PL_PESEL, ...) are only useful for languages that appear in this table.
NLP_CONFIGURATION = {
    "nlp_engine_name": "spacy",
    "models": [
        {"lang_code": code, "model_name": model}
        for code, model in (
            ("en", "en_core_web_lg"),
            ("es", "es_core_news_lg"),
            ("it", "it_core_news_lg"),
            ("pl", "pl_core_news_lg"),
            ("fi", "fi_core_news_lg"),
        )
    ],
}
# Language codes in declaration order, derived from the model table so the two
# can never drift apart.
SUPPORTED_LANGUAGES = [entry["lang_code"] for entry in NLP_CONFIGURATION["models"]]
| 54 | + |
# Recognizer classes that ship with Presidio but are not part of its default
# registry, so build_analyzer() registers them by hand. Each class is
# instantiated with no arguments and therefore keeps its own built-in
# supported_language: the UK/AU/IN/SG locale ids run under "en", the
# remaining national ids under es/it/pl/fi.
EXTRA_RECOGNIZERS = [
    # United Kingdom
    UkNinoRecognizer,
    # Australia
    AuAbnRecognizer,
    AuAcnRecognizer,
    AuTfnRecognizer,
    AuMedicareRecognizer,
    # India
    InPanRecognizer,
    InAadhaarRecognizer,
    InVehicleRegistrationRecognizer,
    InVoterRecognizer,
    InPassportRecognizer,
    # Singapore
    SgFinRecognizer,
    SgUenRecognizer,
    # Spain
    EsNifRecognizer,
    EsNieRecognizer,
    # Italy
    ItFiscalCodeRecognizer,
    ItDriverLicenseRecognizer,
    ItVatCodeRecognizer,
    ItPassportRecognizer,
    ItIdentityCardRecognizer,
    # Poland
    PlPeselRecognizer,
    # Finland
    FiPersonalIdentityCodeRecognizer,
]
| 82 | + |
| 83 | + |
class VinRecognizer(PatternRecognizer):
    """Pattern recognizer for Vehicle Identification Numbers.

    A VIN is 17 characters drawn from A-Z/0-9 without I, O and Q. Candidates
    are accepted only when the ISO 3779 check digit (9th character) matches,
    which makes accidental hits on arbitrary 17-character codes (request ids,
    SKUs, tokens) extremely unlikely. Some non-North-American VINs carry no
    check digit and are therefore rejected; that loss of recall is a
    deliberate trade for precision.
    """

    # Numeric value of each VIN character for the check-digit sum. I, O and Q
    # are absent on purpose: they are not valid VIN characters.
    _TRANSLIT = {
        **{str(digit): digit for digit in range(10)},
        **dict(zip("ABCDEFGH", range(1, 9))),
        **dict(zip("JKLMN", range(1, 6))),
        "P": 7,
        "R": 9,
        **dict(zip("STUVWXYZ", range(2, 10))),
    }
    # Per-position weights; position 9 (index 8) is the check digit itself
    # and contributes nothing to the sum.
    _WEIGHTS = [8, 7, 6, 5, 4, 3, 2, 10, 0, 9, 8, 7, 6, 5, 4, 3, 2]

    def validate_result(self, pattern_text: str):
        """Return True when ``pattern_text`` carries a valid VIN check digit.

        The text is upper-cased first. Anything that is not exactly 17
        characters long, or that contains a character missing from the
        transliteration table, is rejected.
        """
        candidate = pattern_text.upper()
        if len(candidate) != 17:
            return False

        weighted_sum = 0
        for position, symbol in enumerate(candidate):
            value = self._TRANSLIT.get(symbol)
            if value is None:
                # Character outside the VIN alphabet.
                return False
            weighted_sum += value * self._WEIGHTS[position]

        remainder = weighted_sum % 11
        # A remainder of 10 is written as the letter X.
        check_digit = "X" if remainder == 10 else str(remainder)
        return candidate[8] == check_digit
| 111 | + |
| 112 | + |
def build_analyzer() -> AnalyzerEngine:
    """Create the AnalyzerEngine used by the service.

    The engine is backed by the spaCy models in NLP_CONFIGURATION and serves
    SUPPORTED_LANGUAGES. On top of whatever the engine's registry already
    holds, this registers one VinRecognizer per served language followed by
    an instance of every class in EXTRA_RECOGNIZERS.

    Returns:
        The fully configured engine.
    """
    provider = NlpEngineProvider(nlp_configuration=NLP_CONFIGURATION)
    engine = AnalyzerEngine(
        nlp_engine=provider.create_engine(),
        supported_languages=SUPPORTED_LANGUAGES,
    )

    # A VIN looks the same in every language, but a recognizer only runs for
    # the language the caller asks for, hence one instance per language.
    vin_pattern = Pattern(name="vin", regex=r"\b[A-HJ-NPR-Z0-9]{17}\b", score=0.7)
    pending = [
        VinRecognizer(
            supported_entity="VIN",
            patterns=[vin_pattern],
            context=["vin", "vehicle", "chassis"],
            supported_language=language,
        )
        for language in SUPPORTED_LANGUAGES
    ]
    pending.extend(recognizer_cls() for recognizer_cls in EXTRA_RECOGNIZERS)

    registry = engine.registry
    for recognizer in pending:
        registry.add_recognizer(recognizer)
    return engine
| 131 | + |
| 132 | + |
# Both engines are created once, at import time, and shared by every request.
analyzer = build_analyzer()
anonymizer = AnonymizerEngine()

# docs_url/redoc_url are None, which turns off the interactive API docs.
app = FastAPI(
    title="Sim Presidio",
    docs_url=None,
    redoc_url=None,
)
| 137 | + |
| 138 | + |
class AnalyzeRequest(BaseModel):
    """JSON body accepted by ``POST /analyze``."""

    # Text to scan for entities.
    text: str
    # Language code passed to the analyzer; English unless stated otherwise.
    language: str = "en"
    # Optional entity filter; None or an empty list means no filter is passed.
    entities: list[str] | None = None
    # Optional minimum score, forwarded to the analyzer unchanged.
    score_threshold: float | None = None
    # Forwarded to the analyzer's return_decision_process argument.
    return_decision_process: bool = False
| 145 | + |
| 146 | + |
class AnonymizeRequest(BaseModel):
    """JSON body accepted by ``POST /anonymize``."""

    # Original text the analyzer results refer to.
    text: str
    # Detected spans as plain dicts; the endpoint reads entity_type, start,
    # end and (optionally) score from each entry.
    analyzer_results: list[dict[str, Any]] = []
    # Per-entity operator settings. "anonymizers" wins when both keys are
    # supplied and non-empty; "operators" is the fallback.
    anonymizers: dict[str, dict[str, Any]] | None = None
    operators: dict[str, dict[str, Any]] | None = None
| 152 | + |
| 153 | + |
@app.get("/health")
def health() -> dict[str, str]:
    """Liveness probe; always answers with a fixed ``ok`` status."""
    payload = {"status": "ok"}
    return payload
| 157 | + |
| 158 | + |
@app.get("/supportedentities")
def supported_entities(language: str = "en") -> list[str]:
    """List the entity types the analyzer can detect for ``language``.

    Args:
        language: Language code to query; defaults to English.

    Returns:
        The entity names reported by the analyzer for that language.

    Raises:
        HTTPException: 422 when ``language`` is not one of SUPPORTED_LANGUAGES.
    """
    # Reject unknown languages here: the engine is built for
    # SUPPORTED_LANGUAGES only, and letting it fail on anything else would
    # surface to the client as an opaque HTTP 500.
    if language not in SUPPORTED_LANGUAGES:
        raise HTTPException(
            status_code=422,
            detail=(
                f"Unsupported language {language!r}; "
                f"supported: {', '.join(SUPPORTED_LANGUAGES)}"
            ),
        )
    return analyzer.get_supported_entities(language)
| 162 | + |
| 163 | + |
@app.post("/analyze")
def analyze(req: AnalyzeRequest) -> list[dict[str, Any]]:
    """Run the analyzer over the request text and return its findings.

    Args:
        req: Validated request body.

    Returns:
        One dict per finding, as produced by each result's ``to_dict()``.

    Raises:
        HTTPException: 422 when ``req.language`` is not one of
            SUPPORTED_LANGUAGES.
    """
    # Reject unknown languages here: the engine is built for
    # SUPPORTED_LANGUAGES only, and letting it fail on anything else would
    # surface to the client as an opaque HTTP 500.
    if req.language not in SUPPORTED_LANGUAGES:
        raise HTTPException(
            status_code=422,
            detail=(
                f"Unsupported language {req.language!r}; "
                f"supported: {', '.join(SUPPORTED_LANGUAGES)}"
            ),
        )
    results = analyzer.analyze(
        text=req.text,
        language=req.language,
        # An empty list is treated like "no filter".
        entities=req.entities or None,
        score_threshold=req.score_threshold,
        return_decision_process=req.return_decision_process,
    )
    return [r.to_dict() for r in results]
| 174 | + |
| 175 | + |
def _to_recognizer_results(raw_results: list[dict[str, Any]]) -> list[RecognizerResult]:
    """Turn JSON analyzer results into RecognizerResult objects, rejecting incomplete entries with a 422."""
    required = ("entity_type", "start", "end")
    converted = []
    for index, raw in enumerate(raw_results):
        # A missing or null required field is the client's mistake; report it
        # instead of failing later with KeyError/TypeError (HTTP 500).
        missing = [key for key in required if raw.get(key) is None]
        if missing:
            raise HTTPException(
                status_code=422,
                detail=(
                    f"analyzer_results[{index}] is missing required "
                    f"field(s): {', '.join(missing)}"
                ),
            )
        converted.append(
            RecognizerResult(
                entity_type=raw["entity_type"],
                start=raw["start"],
                end=raw["end"],
                # Results without a score are treated as certain.
                score=raw.get("score", 1.0),
            )
        )
    return converted


def _to_operator_configs(
    raw_operators: dict[str, dict[str, Any]] | None,
) -> dict[str, OperatorConfig] | None:
    """Turn the per-entity operator dicts into OperatorConfig objects; None/empty stays None."""
    if not raw_operators:
        return None
    configs = {}
    for entity, raw_cfg in raw_operators.items():
        # Copy so popping "type" does not mutate the request model.
        params = dict(raw_cfg)
        op_type = params.pop("type", "replace")
        configs[entity] = OperatorConfig(op_type, params)
    return configs


@app.post("/anonymize")
def anonymize(req: AnonymizeRequest) -> dict[str, Any]:
    """Apply anonymization operators to the spans listed in the request.

    Args:
        req: Validated request body. ``anonymizers`` takes precedence over
            ``operators`` when both are present and non-empty.

    Returns:
        A dict with the anonymized ``text`` and an ``items`` list describing
        every applied operator (operator, entity_type, start, end, text).

    Raises:
        HTTPException: 422 when an analyzer result lacks entity_type, start
            or end, or when Presidio rejects the request with
            InvalidParamError.
    """
    analyzer_results = _to_recognizer_results(req.analyzer_results)
    try:
        operators = _to_operator_configs(req.anonymizers or req.operators)
        result = anonymizer.anonymize(
            text=req.text,
            analyzer_results=analyzer_results,
            operators=operators,
        )
    except InvalidParamError as err:
        # Presidio's own parameter-validation failure is a client error, not
        # a server fault.
        raise HTTPException(status_code=422, detail=str(err)) from err
    return {
        "text": result.text,
        "items": [
            {
                "operator": item.operator,
                "entity_type": item.entity_type,
                "start": item.start,
                "end": item.end,
                "text": item.text,
            }
            for item in result.items
        ],
    }
0 commit comments