fix(guardrails): bound-parallelize mask batch; refresh stale comments

TheodoreSpeaks · TheodoreSpeaks · commit 91ce2d1431d5 · 2026-06-22T17:38:17.000-07:00
- maskPIIBatch runs per-string sidecar calls with bounded concurrency (8) via
  mapWithConcurrency, so a chunk of many small leaves finishes within the 45s
  request timeout instead of timing out and being scrubbed; input order and
  fail-on-any-error behavior are preserved
- drop stale comments referencing the deleted Python venv / 30s subprocess timeout
diff --git a/apps/sim/app/api/guardrails/mask-batch/route.ts b/apps/sim/app/api/guardrails/mask-batch/route.ts
@@ -11,9 +11,9 @@ const logger = createLogger('GuardrailsMaskBatchAPI')
 
 /**
  * Internal batch PII masking. The log-redaction persist path runs in both the
- * Next.js server and the trigger.dev runtime, but Presidio (Python venv) lives
- * only in the app container — so redaction calls this endpoint server-to-server
- * (internal JWT) to keep Presidio centralized here.
+ * Next.js server and the trigger.dev runtime, but the Presidio sidecars live only
+ * in the app task — so redaction calls this endpoint server-to-server (internal
+ * JWT) to keep Presidio centralized here.
  */
 export const POST = withRouteHandler(async (request: NextRequest) => {
   const auth = await checkInternalAuth(request, { requireWorkflowId: false })
@@ -31,8 +31,8 @@ export const POST = withRouteHandler(async (request: NextRequest) => {
     logger.info('Masked PII batch', { count: texts.length })
     return NextResponse.json({ masked })
   } catch (error) {
-    // A broken/absent venv makes maskPIIBatch throw; fail loudly here (the
-    // caller scrubs to REDACTION_FAILED, so PII is never leaked).
+    // An unreachable/misconfigured Presidio sidecar makes maskPIIBatch throw; fail
+    // loudly here (the caller scrubs to REDACTION_FAILED, so PII is never leaked).
     logger.error('PII batch masking failed', {
       error: getErrorMessage(error),
       count: texts.length,
diff --git a/apps/sim/lib/guardrails/mask-client.ts b/apps/sim/lib/guardrails/mask-client.ts
@@ -10,15 +10,15 @@ import { getInternalApiBaseUrl } from '@/lib/core/utils/urls'
  */
 const REQUEST_MAX_BYTES = 256 * 1024
 const REQUEST_MAX_COUNT = 2_000
-/** Slightly above the 30s Python subprocess timeout so a hung app container aborts gracefully. */
+/** Bounds one mask-batch request; if a Presidio sidecar is unreachable or stuck, the request aborts so the caller scrubs. */
 const REQUEST_TIMEOUT_MS = 45_000
 
 /**
  * Mask PII across many strings via the internal app-container endpoint.
  *
- * Presidio (a Python venv) only exists in the app container, but the
- * log-redaction persist path also runs inside the trigger.dev runtime — so
- * redaction always routes through HTTP, the same way the guardrails tool does.
+ * The Presidio sidecars run only in the app task, but the log-redaction persist
+ * path also runs inside the trigger.dev runtime — so redaction always routes
+ * through HTTP, the same way the guardrails tool does.
  * Strings are grouped into byte/count-budgeted chunks; order is preserved, so
  * the returned array matches `texts` length.
  *
diff --git a/apps/sim/lib/guardrails/validate_pii.ts b/apps/sim/lib/guardrails/validate_pii.ts
@@ -1,13 +1,17 @@
 import { createLogger } from '@sim/logger'
 import { getErrorMessage } from '@sim/utils/errors'
 import { env } from '@/lib/core/config/env'
+import { mapWithConcurrency } from '@/lib/core/utils/concurrency'
 import { CUSTOM_ENTITY_TYPES, CUSTOM_RECOGNIZERS } from '@/lib/guardrails/recognizers'
 
 const logger = createLogger('PIIValidator')
 
 /** Just above the analyzer's spaCy NER budget so a stuck sidecar aborts gracefully. */
 const REQUEST_TIMEOUT_MS = 45_000
 
+/** Max in-flight per-string sidecar calls within one batch; the sidecars' warm model serves them in parallel. */
+const MASK_CONCURRENCY = 8
+
 const ANALYZER_URL = env.PRESIDIO_ANALYZER_URL || 'http://localhost:5002'
 const ANONYMIZER_URL = env.PRESIDIO_ANONYMIZER_URL || 'http://localhost:5001'
 
@@ -177,9 +181,12 @@ export async function validatePII(input: PIIValidationInput): Promise<PIIValidat
 
 /**
  * Mask PII across many strings via the Presidio sidecars, preserving input order.
- * Each string runs a TS VIN pre-pass, then analyze → anonymize. Strings with no
- * detected PII are returned unchanged. Rejects on any sidecar failure so callers
- * can apply their own fail-safe (scrub rather than leak).
+ * Each string runs a TS custom-recognizer pass, then analyze → anonymize. Strings
+ * with no detected PII are returned unchanged. Calls run with bounded concurrency:
+ * the sidecars' model is warm, so the bottleneck is round-trip latency, and a
+ * batch of thousands of small leaves would exceed the caller's request timeout
+ * if run strictly sequentially. Rejects on any sidecar failure (which fails the
+ * whole batch) so callers can apply their own fail-safe (scrub rather than leak).
  */
 export async function maskPIIBatch(
   texts: string[],
@@ -188,16 +195,11 @@ export async function maskPIIBatch(
 ): Promise<string[]> {
   if (texts.length === 0) return []
 
-  const masked: string[] = []
-  for (const text of texts) {
-    if (!text) {
-      masked.push(text)
-      continue
-    }
+  return mapWithConcurrency(texts, MASK_CONCURRENCY, async (text) => {
+    if (!text) return text
     const spans = await collectSpans(text, entityTypes, language)
-    masked.push(await anonymize(text, spans))
-  }
-  return masked
+    return anonymize(text, spans)
+  })
 }
 
 export { type PIIEntityType, SUPPORTED_PII_ENTITIES } from '@/lib/guardrails/pii-entities'