Skip to content

Commit de32214

Browse files
refactor(guardrails): single Presidio image, native VIN, per-rule redaction language
- collapse the analyzer/anonymizer URLs into one PRESIDIO_URL (combined image serves /analyze + /anonymize)
- remove the TS VIN recognizer (vin.ts, recognizers.ts) — VIN is now native + multi-language in the image; validate_pii is a thin analyze→anonymize client
- trim KR_RRN/TH_TNIN from the catalog (no Korean/Thai model in the image)
- add per-rule redaction language: the PII_LANGUAGES catalog drives the contract enum, the Data Retention rule modal, and the guardrails block dropdown; the resolver + logger thread it through to maskPIIBatch (default en), so non-English entity rules (e.g. ES_NIF) actually fire instead of silently no-op'ing under en
1 parent 91ce2d1 commit de32214

15 files changed

Lines changed: 148 additions & 257 deletions

File tree

apps/sim/app/api/organizations/[id]/data-retention/route.ts

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,14 @@ import { isOrganizationOnEnterprisePlan } from '@/lib/billing/core/subscription'
1616
import { isBillingEnabled } from '@/lib/core/config/env-flags'
1717
import { isFeatureEnabled } from '@/lib/core/config/feature-flags'
1818
import { withRouteHandler } from '@/lib/core/utils/with-route-handler'
19+
import { PII_LANGUAGE_CODES, type PIILanguage } from '@/lib/guardrails/pii-entities'
20+
21+
/**
 * Narrow a stored (loosely-typed) language to the supported set; unknown ⇒ undefined (defaults to en).
 *
 * @param value - Language code read from a stored rule; may be missing or not a supported code.
 * @returns `value` typed as `PIILanguage` when it is listed in `PII_LANGUAGE_CODES`;
 *   `undefined` when it is absent, empty, or not in that list.
 */
22+
function coercePiiLanguage(value: string | undefined): PIILanguage | undefined {
23+
// The tuple is widened to `readonly string[]` only so `.includes` accepts an arbitrary string.
return value && (PII_LANGUAGE_CODES as readonly string[]).includes(value)
24+
? (value as PIILanguage)
25+
: undefined
26+
}
1927

2028
const logger = createLogger('DataRetentionAPI')
2129

@@ -35,7 +43,14 @@ function normalizeConfigured(
3543
logRetentionHours: settings?.logRetentionHours ?? null,
3644
softDeleteRetentionHours: settings?.softDeleteRetentionHours ?? null,
3745
taskCleanupHours: settings?.taskCleanupHours ?? null,
38-
piiRedaction: settings?.piiRedaction?.rules ? { rules: settings.piiRedaction.rules } : null,
46+
piiRedaction: settings?.piiRedaction?.rules
47+
? {
48+
rules: settings.piiRedaction.rules.map((rule) => ({
49+
...rule,
50+
language: coercePiiLanguage(rule.language),
51+
})),
52+
}
53+
: null,
3954
}
4055
}
4156

apps/sim/blocks/blocks/guardrails.ts

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import { ShieldCheckIcon } from '@/components/icons'
2-
import { PII_ENTITY_GROUPS } from '@/lib/guardrails/pii-entities'
2+
import { PII_ENTITY_GROUPS, PII_LANGUAGES } from '@/lib/guardrails/pii-entities'
33
import type { BlockConfig } from '@/blocks/types'
44
import {
55
getModelOptions,
@@ -206,13 +206,7 @@ Return ONLY the regex pattern - no explanations, no quotes, no forward slashes,
206206
id: 'piiLanguage',
207207
title: 'Language',
208208
type: 'dropdown',
209-
options: [
210-
{ label: 'English', id: 'en' },
211-
{ label: 'Spanish', id: 'es' },
212-
{ label: 'Italian', id: 'it' },
213-
{ label: 'Polish', id: 'pl' },
214-
{ label: 'Finnish', id: 'fi' },
215-
],
209+
options: PII_LANGUAGES.map((language) => ({ label: language.label, id: language.value })),
216210
defaultValue: 'en',
217211
condition: {
218212
field: 'validationType',

apps/sim/ee/data-retention/components/data-retention-settings.tsx

Lines changed: 35 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,13 @@ import {
2121
} from '@/components/emcn'
2222
import { useSession } from '@/lib/auth/auth-client'
2323
import { isBillingEnabled } from '@/lib/core/config/env-flags'
24-
import { PII_ENTITY_GROUPS, SUPPORTED_PII_ENTITIES } from '@/lib/guardrails/pii-entities'
24+
import {
25+
DEFAULT_PII_LANGUAGE,
26+
PII_ENTITY_GROUPS,
27+
PII_LANGUAGES,
28+
type PIILanguage,
29+
SUPPORTED_PII_ENTITIES,
30+
} from '@/lib/guardrails/pii-entities'
2531
import { getUserRole } from '@/lib/workspaces/organization/utils'
2632
import { SettingsSection } from '@/app/workspace/[workspaceId]/settings/components/settings-section/settings-section'
2733
import { InfoNote } from '@/ee/components/info-note'
@@ -59,6 +65,7 @@ interface RuleDraft {
5965
id: string
6066
entityTypes: string[]
6167
workspaceId: string | null
68+
language: PIILanguage
6269
}
6370

6471
function hoursToDisplayDays(hours: number | null): string {
@@ -75,6 +82,7 @@ function normalizeRule(rule: RuleDraft): string {
7582
return JSON.stringify({
7683
entityTypes: [...rule.entityTypes].sort(),
7784
workspaceId: rule.workspaceId,
85+
language: rule.language,
7886
})
7987
}
8088

@@ -227,6 +235,18 @@ function RuleModal({
227235
onChange={(entityTypes) => onChange({ ...draft, entityTypes })}
228236
/>
229237
</ChipModalField>
238+
<ChipModalField
239+
type='custom'
240+
title='Language'
241+
hint='Detection runs with this language’s recognizers — match it to your log content.'
242+
>
243+
<ChipSelect
244+
value={draft.language}
245+
onChange={(language) => onChange({ ...draft, language: language as PIILanguage })}
246+
options={PII_LANGUAGES.map((l) => ({ value: l.value, label: l.label }))}
247+
align='start'
248+
/>
249+
</ChipModalField>
230250
</ChipModalBody>
231251
<ChipModalFooter
232252
onCancel={onClose}
@@ -291,6 +311,7 @@ export function DataRetentionSettings() {
291311
id: r.id,
292312
entityTypes: r.entityTypes,
293313
workspaceId: r.workspaceId,
314+
language: r.language ?? DEFAULT_PII_LANGUAGE,
294315
}))
295316
)
296317
hydratedOrgRef.current = orgId
@@ -327,6 +348,7 @@ export function DataRetentionSettings() {
327348
id: r.id,
328349
entityTypes: r.entityTypes,
329350
workspaceId: r.workspaceId,
351+
language: r.language,
330352
})),
331353
},
332354
},
@@ -335,7 +357,12 @@ export function DataRetentionSettings() {
335357
}
336358

337359
function openEditDefault() {
338-
const rule: RuleDraft = defaultRule ?? { id: generateId(), entityTypes: [], workspaceId: null }
360+
const rule: RuleDraft = defaultRule ?? {
361+
id: generateId(),
362+
entityTypes: [],
363+
workspaceId: null,
364+
language: DEFAULT_PII_LANGUAGE,
365+
}
339366
setModalIsNew(defaultRule === null)
340367
setModalOriginal(rule)
341368
setModalDraft({ ...rule })
@@ -344,7 +371,12 @@ export function DataRetentionSettings() {
344371
function openAddOverride() {
345372
const workspaceId = freeWorkspaces[0]?.value
346373
if (!workspaceId) return
347-
const blank: RuleDraft = { id: generateId(), entityTypes: [], workspaceId }
374+
const blank: RuleDraft = {
375+
id: generateId(),
376+
entityTypes: [],
377+
workspaceId,
378+
language: DEFAULT_PII_LANGUAGE,
379+
}
348380
setModalIsNew(true)
349381
setModalOriginal(blank)
350382
setModalDraft(blank)

apps/sim/lib/api/contracts/primitives.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import { z } from 'zod'
2+
import { PII_LANGUAGE_CODES } from '@/lib/guardrails/pii-entities'
23

34
export const unknownRecordSchema = z.record(z.string(), z.unknown())
45

@@ -93,6 +94,8 @@ export const piiRedactionRuleSchema = z.object({
9394
entityTypes: z.array(z.string().min(1, 'Entity type cannot be empty')).max(100),
9495
/** null = all workspaces; otherwise the single targeted workspace. */
9596
workspaceId: z.string().min(1).nullable(),
97+
/** Language whose Presidio recognizers apply; defaults to English. */
98+
language: z.enum(PII_LANGUAGE_CODES).optional(),
9699
})
97100

98101
export type PiiRedactionRule = z.output<typeof piiRedactionRuleSchema>

apps/sim/lib/billing/retention.test.ts

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,40 +21,55 @@ describe('resolveEffectivePiiRedaction', () => {
2121
orgSettings: settings([allRule]),
2222
workspaceId: 'ws-1',
2323
})
24-
expect(result).toEqual({ enabled: true, entityTypes: ['EMAIL_ADDRESS', 'PHONE_NUMBER'] })
24+
expect(result).toEqual({
25+
enabled: true,
26+
entityTypes: ['EMAIL_ADDRESS', 'PHONE_NUMBER'],
27+
language: 'en',
28+
})
2529
})
2630

2731
it('lets a workspace-specific rule override the all rule', () => {
2832
const result = resolveEffectivePiiRedaction({
2933
orgSettings: settings([allRule, { id: 'r-1', entityTypes: ['US_SSN'], workspaceId: 'ws-1' }]),
3034
workspaceId: 'ws-1',
3135
})
32-
expect(result).toEqual({ enabled: true, entityTypes: ['US_SSN'] })
36+
expect(result).toEqual({ enabled: true, entityTypes: ['US_SSN'], language: 'en' })
37+
})
38+
39+
it('carries the rule language through (defaults to en)', () => {
40+
const result = resolveEffectivePiiRedaction({
41+
orgSettings: settings([
42+
{ id: 'r-es', entityTypes: ['ES_NIF'], workspaceId: 'ws-1', language: 'es' },
43+
]),
44+
workspaceId: 'ws-1',
45+
})
46+
expect(result).toEqual({ enabled: true, entityTypes: ['ES_NIF'], language: 'es' })
3347
})
3448

3549
it('exempts a workspace when its specific rule has no entity types', () => {
3650
const result = resolveEffectivePiiRedaction({
3751
orgSettings: settings([allRule, { id: 'r-1', entityTypes: [], workspaceId: 'ws-1' }]),
3852
workspaceId: 'ws-1',
3953
})
40-
expect(result).toEqual({ enabled: false, entityTypes: [] })
54+
expect(result).toEqual({ enabled: false, entityTypes: [], language: 'en' })
4155
})
4256

4357
it('is disabled when no rule matches and there is no all rule', () => {
4458
const result = resolveEffectivePiiRedaction({
4559
orgSettings: settings([{ id: 'r-1', entityTypes: ['US_SSN'], workspaceId: 'ws-2' }]),
4660
workspaceId: 'ws-1',
4761
})
48-
expect(result).toEqual({ enabled: false, entityTypes: [] })
62+
expect(result).toEqual({ enabled: false, entityTypes: [], language: 'en' })
4963
})
5064

5165
it('is disabled when there are no rules', () => {
5266
expect(
5367
resolveEffectivePiiRedaction({ orgSettings: settings([]), workspaceId: 'ws-1' })
54-
).toEqual({ enabled: false, entityTypes: [] })
68+
).toEqual({ enabled: false, entityTypes: [], language: 'en' })
5569
expect(resolveEffectivePiiRedaction({ orgSettings: null, workspaceId: 'ws-1' })).toEqual({
5670
enabled: false,
5771
entityTypes: [],
72+
language: 'en',
5873
})
5974
})
6075
})

apps/sim/lib/billing/retention.ts

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,18 @@
11
import type { DataRetentionSettings } from '@sim/db/schema'
2+
import { DEFAULT_PII_LANGUAGE } from '@/lib/guardrails/pii-entities'
23

34
/** Resolved PII redaction settings in the shape `resolveEffectivePiiRedaction` returns. */
export interface EffectivePiiRedaction {
45
/** False in the disabled default; true when the matched rule supplies at least one entity type. */
enabled: boolean
56
/**
 * Presidio entity types to mask. Empty = redact all detected PII.
 * NOTE(review): the resolver in this file returns the disabled default when a rule has no
 * entity types, so an enabled result never carries an empty list here — confirm whether the
 * "empty = redact all" wording still describes any consumer.
 */
67
entityTypes: string[]
8+
/**
 * Language whose Presidio recognizers apply when masking.
 * NOTE(review): typed as a plain string; the resolver copies any string-valued rule language
 * without checking it against the supported language codes — confirm that is intended.
 */
9+
language: string
710
}
811

912
/**
 * Disabled fallback result: redaction off, no entity types, default language.
 * The resolver returns this same object (not a copy) when the matched rule yields no entity
 * types, so callers should treat it as read-only.
 */
export const DEFAULT_PII_REDACTION: EffectivePiiRedaction = {
1013
enabled: false,
1114
entityTypes: [],
15+
language: DEFAULT_PII_LANGUAGE,
1216
}
1317

1418
/**
@@ -34,5 +38,6 @@ export function resolveEffectivePiiRedaction(params: {
3438
? rule.entityTypes.filter((t): t is string => typeof t === 'string')
3539
: []
3640
if (types.length === 0) return DEFAULT_PII_REDACTION
37-
return { enabled: true, entityTypes: types }
41+
const language = typeof rule?.language === 'string' ? rule.language : DEFAULT_PII_LANGUAGE
42+
return { enabled: true, entityTypes: types, language }
3843
}

apps/sim/lib/core/config/env.ts

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -311,8 +311,7 @@ export const env = createEnv({
311311
PORT: z.number().optional(), // Main application port
312312
INTERNAL_API_BASE_URL: z.string().optional(), // Optional internal base URL for server-side self-calls; must include protocol if set (e.g., http://sim-app.namespace.svc.cluster.local:3000)
313313
ALLOWED_ORIGINS: z.string().optional(), // CORS allowed origins
314-
PRESIDIO_ANALYZER_URL: z.string().optional(), // Presidio analyzer sidecar base URL for PII detection (default http://localhost:5002)
315-
PRESIDIO_ANONYMIZER_URL: z.string().optional(), // Presidio anonymizer sidecar base URL for PII masking (default http://localhost:5001)
314+
PRESIDIO_URL: z.string().optional(), // Presidio sidecar base URL serving /analyze + /anonymize (default http://localhost:5002)
316315

317316
// OAuth Integration Credentials - All optional, enables third-party integrations
318317
GOOGLE_CLIENT_ID: z.string().optional(), // Google OAuth client ID for Google services

apps/sim/lib/guardrails/pii-entities.ts

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,6 @@ export const SUPPORTED_PII_ENTITIES = {
5151
IN_VOTER: 'Indian voter ID',
5252
IN_PASSPORT: 'Indian passport',
5353
FI_PERSONAL_IDENTITY_CODE: 'Finnish Personal Identity Code',
54-
KR_RRN: 'Korean Resident Registration Number',
55-
TH_TNIN: 'Thai National ID Number',
5654
} as const
5755

5856
export type PIIEntityType = keyof typeof SUPPORTED_PII_ENTITIES
@@ -115,8 +113,6 @@ export const PII_ENTITY_GROUPS: ReadonlyArray<{
115113
'IN_VOTER',
116114
'IN_PASSPORT',
117115
'FI_PERSONAL_IDENTITY_CODE',
118-
'KR_RRN',
119-
'TH_TNIN',
120116
],
121117
},
122118
].map((group) => ({
@@ -126,3 +122,26 @@ export const PII_ENTITY_GROUPS: ReadonlyArray<{
126122
label: SUPPORTED_PII_ENTITIES[value as PIIEntityType],
127123
})),
128124
}))
125+
126+
/**
127+
* Languages the Presidio image has NLP models for. The analyzer only recognizes a
128+
* language's entities when its model is loaded, so this set must match the image.
*
* Each entry pairs a language code (`value`) with its display name (`label`); the
* guardrails block dropdown and the Data Retention rule modal build their options from it.
129+
*/
130+
export const PII_LANGUAGES = [
131+
{ value: 'en', label: 'English' },
132+
{ value: 'es', label: 'Spanish' },
133+
{ value: 'it', label: 'Italian' },
134+
{ value: 'pl', label: 'Polish' },
135+
{ value: 'fi', label: 'Finnish' },
136+
] as const
137+
138+
/** Union of the language codes listed in `PII_LANGUAGES`, derived from each entry's `value`. */
export type PIILanguage = (typeof PII_LANGUAGES)[number]['value']
139+
140+
/**
 * Non-empty tuple of language codes for schema/enum use.
 *
 * `.map` is typed as returning a plain array, so the result is asserted to a non-empty tuple.
 * The assertion is not checked at runtime; it holds only while `PII_LANGUAGES` has at least
 * one entry.
 */
141+
export const PII_LANGUAGE_CODES = PII_LANGUAGES.map((l) => l.value) as [
142+
PIILanguage,
143+
...PIILanguage[],
144+
]
145+
146+
/** Default redaction language when a rule doesn't set one. */
147+
export const DEFAULT_PII_LANGUAGE: PIILanguage = 'en'

apps/sim/lib/guardrails/recognizers.ts

Lines changed: 0 additions & 27 deletions
This file was deleted.

0 commit comments

Comments
 (0)