diff --git a/apps/sim/app/api/guardrails/mask-batch/route.test.ts b/apps/sim/app/api/guardrails/mask-batch/route.test.ts
new file mode 100644
index 00000000000..cbb5b12265f
--- /dev/null
+++ b/apps/sim/app/api/guardrails/mask-batch/route.test.ts
@@ -0,0 +1,64 @@
+/**
+ * @vitest-environment node
+ */
+import { createMockRequest } from '@sim/testing'
+import { beforeEach, describe, expect, it, vi } from 'vitest'
+
+const { mockCheckInternalAuth, mockMaskPIIBatch } = vi.hoisted(() => ({
+  mockCheckInternalAuth: vi.fn(),
+  mockMaskPIIBatch: vi.fn(),
+}))
+
+vi.mock('@/lib/auth/hybrid', () => ({
+  checkInternalAuth: mockCheckInternalAuth,
+}))
+
+vi.mock('@/lib/guardrails/validate_pii', () => ({
+  maskPIIBatch: mockMaskPIIBatch,
+}))
+
+import { POST } from '@/app/api/guardrails/mask-batch/route'
+
+describe('POST /api/guardrails/mask-batch', () => {
+  beforeEach(() => {
+    vi.clearAllMocks()
+    mockCheckInternalAuth.mockResolvedValue({ success: true })
+    mockMaskPIIBatch.mockImplementation(async (texts: string[]) => texts.map((t) => `M(${t})`))
+  })
+
+  it('returns 401 without internal auth', async () => {
+    mockCheckInternalAuth.mockResolvedValue({
+      success: false,
+      error: 'Internal authentication required',
+    })
+
+    const res = await POST(
+      createMockRequest('POST', { texts: ['a@b.com'], entityTypes: ['EMAIL_ADDRESS'] })
+    )
+
+    expect(res.status).toBe(401)
+    expect(mockMaskPIIBatch).not.toHaveBeenCalled()
+  })
+
+  it('masks the batch in-process and preserves order', async () => {
+    const res = await POST(
+      createMockRequest('POST', {
+        texts: ['a@b.com', 'hello'],
+        entityTypes: ['EMAIL_ADDRESS'],
+        language: 'en',
+      })
+    )
+
+    expect(res.status).toBe(200)
+    const json = await res.json()
+    expect(json.masked).toEqual(['M(a@b.com)', 'M(hello)'])
+    expect(mockMaskPIIBatch).toHaveBeenCalledWith(['a@b.com', 'hello'], ['EMAIL_ADDRESS'], 'en')
+  })
+
+  it('rejects an invalid body with 400', async () => {
+    const res = await POST(createMockRequest('POST', { texts: 'not-an-array', entityTypes: [] }))
+
+    expect(res.status).toBe(400)
+    expect(mockMaskPIIBatch).not.toHaveBeenCalled()
+  })
+})
diff --git a/apps/sim/app/api/guardrails/mask-batch/route.ts b/apps/sim/app/api/guardrails/mask-batch/route.ts
new file mode 100644
index 00000000000..43979c611c9
--- /dev/null
+++ b/apps/sim/app/api/guardrails/mask-batch/route.ts
@@ -0,0 +1,45 @@
+import { createLogger } from '@sim/logger'
+import { getErrorMessage } from '@sim/utils/errors'
+import { type NextRequest, NextResponse } from 'next/server'
+import { guardrailsMaskBatchContract } from '@/lib/api/contracts'
+import { parseRequest } from '@/lib/api/server'
+import { checkInternalAuth } from '@/lib/auth/hybrid'
+import { withRouteHandler } from '@/lib/core/utils/with-route-handler'
+import { maskPIIBatch } from '@/lib/guardrails/validate_pii'
+
+const logger = createLogger('GuardrailsMaskBatchAPI')
+
+/**
+ * Internal batch PII masking. The log-redaction persist path runs in both the
+ * Next.js server and the trigger.dev runtime, but Presidio (Python venv) lives
+ * only in the app container — so redaction calls this endpoint server-to-server
+ * (internal JWT) to keep Presidio centralized here.
+ */
+export const POST = withRouteHandler(async (request: NextRequest) => {
+  const auth = await checkInternalAuth(request, { requireWorkflowId: false })
+  if (!auth.success) {
+    return NextResponse.json({ error: 'Unauthorized' }, { status: 401 })
+  }
+
+  const parsed = await parseRequest(guardrailsMaskBatchContract, request, {})
+  if (!parsed.success) return parsed.response
+
+  const { texts, entityTypes, language } = parsed.data.body
+
+  try {
+    const masked = await maskPIIBatch(texts, entityTypes, language)
+    logger.info('Masked PII batch', { count: texts.length })
+    return NextResponse.json({ masked })
+  } catch (error) {
+    // A broken/absent venv makes maskPIIBatch throw; fail loudly here (the
+    // caller scrubs to REDACTION_FAILED, so PII is never leaked).
+    logger.error('PII batch masking failed', {
+      error: getErrorMessage(error),
+      count: texts.length,
+    })
+    return NextResponse.json(
+      { error: getErrorMessage(error, 'PII masking failed') },
+      { status: 500 }
+    )
+  }
+})
diff --git a/apps/sim/lib/api/contracts/hotspots.ts b/apps/sim/lib/api/contracts/hotspots.ts
index 6c280898c39..897c99fad56 100644
--- a/apps/sim/lib/api/contracts/hotspots.ts
+++ b/apps/sim/lib/api/contracts/hotspots.ts
@@ -45,6 +45,34 @@ export const guardrailsValidateContract = defineRouteContract({
   },
 })
 
+const guardrailsMaskBatchBodySchema = z.object({
+  texts: z.array(z.string()).max(100_000),
+  entityTypes: z.array(z.string().min(1, 'Entity type cannot be empty')).max(200),
+  language: z.string().min(1).max(20).optional(),
+})
+
+const guardrailsMaskBatchResponseSchema = z.object({
+  masked: z.array(z.string()),
+})
+
+/**
+ * Internal batch PII masking. Called server-to-server (internal JWT) from the
+ * log-redaction persist path so Presidio always runs in the app container,
+ * including for async executions that persist inside the trigger.dev runtime.
+ */
+export const guardrailsMaskBatchContract = defineRouteContract({
+  method: 'POST',
+  path: '/api/guardrails/mask-batch',
+  body: guardrailsMaskBatchBodySchema,
+  response: {
+    mode: 'json',
+    schema: guardrailsMaskBatchResponseSchema,
+  },
+})
+
+export type GuardrailsMaskBatchBody = z.input<typeof guardrailsMaskBatchBodySchema>
+export type GuardrailsMaskBatchResult = z.output<typeof guardrailsMaskBatchResponseSchema>
+
 const chatMessageSchema = z.object({
   role: z.enum(['user', 'assistant', 'system']),
   content: z.string(),
diff --git a/apps/sim/lib/guardrails/mask-client.test.ts b/apps/sim/lib/guardrails/mask-client.test.ts
new file mode 100644
index 00000000000..d1c4ad5b843
--- /dev/null
+++ b/apps/sim/lib/guardrails/mask-client.test.ts
@@ -0,0 +1,68 @@
+/**
+ * @vitest-environment node
+ */
+import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'
+
+const { mockToken, mockBaseUrl } = vi.hoisted(() => ({
+  mockToken: vi.fn(),
+  mockBaseUrl: vi.fn(),
+}))
+
+vi.mock('@/lib/auth/internal', () => ({ generateInternalToken: mockToken }))
+vi.mock('@/lib/core/utils/urls', () => ({ getInternalApiBaseUrl: mockBaseUrl }))
+
+import { maskPIIBatchViaHttp } from '@/lib/guardrails/mask-client'
+
+describe('maskPIIBatchViaHttp', () => {
+  let fetchMock: ReturnType<typeof vi.fn>
+
+  beforeEach(() => {
+    vi.clearAllMocks()
+    mockToken.mockResolvedValue('tok')
+    mockBaseUrl.mockReturnValue('http://app.internal:3000')
+    fetchMock = vi.fn(async (_url: string, init: { body: string }) => {
+      const { texts } = JSON.parse(init.body) as { texts: string[] }
+      return new Response(JSON.stringify({ masked: texts.map((t) => `M(${t})`) }), {
+        status: 200,
+        headers: { 'content-type': 'application/json' },
+      })
+    })
+    vi.stubGlobal('fetch', fetchMock)
+  })
+
+  afterEach(() => {
+    vi.unstubAllGlobals()
+  })
+
+  it('masks a small batch in a single request, with an abort timeout', async () => {
+    const out = await maskPIIBatchViaHttp(['a', 'b', 'c'], ['EMAIL_ADDRESS'])
+
+    expect(out).toEqual(['M(a)', 'M(b)', 'M(c)'])
+    expect(fetchMock).toHaveBeenCalledTimes(1)
+    expect(fetchMock.mock.calls[0][1].signal).toBeInstanceOf(AbortSignal)
+  })
+
+  it('splits by count into multiple requests, preserving global order', async () => {
+    const texts = Array.from({ length: 5000 }, (_, i) => `t${i}`)
+
+    const out = await maskPIIBatchViaHttp(texts, [])
+
+    expect(out).toHaveLength(5000)
+    expect(out[0]).toBe('M(t0)')
+    expect(out[4999]).toBe('M(t4999)')
+    expect(fetchMock).toHaveBeenCalledTimes(3) // 2000-per-request cap
+  })
+
+  it('throws on a non-2xx response so the caller can scrub', async () => {
+    fetchMock.mockResolvedValueOnce(new Response('boom', { status: 500 }))
+
+    await expect(maskPIIBatchViaHttp(['a'], [])).rejects.toThrow(/mask-batch request failed/)
+  })
+
+  it('returns [] without any request for empty input', async () => {
+    const out = await maskPIIBatchViaHttp([], [])
+
+    expect(out).toEqual([])
+    expect(fetchMock).not.toHaveBeenCalled()
+  })
+})
diff --git a/apps/sim/lib/guardrails/mask-client.ts b/apps/sim/lib/guardrails/mask-client.ts
new file mode 100644
index 00000000000..8e94495334b
--- /dev/null
+++ b/apps/sim/lib/guardrails/mask-client.ts
@@ -0,0 +1,99 @@
+import type { GuardrailsMaskBatchResult } from '@/lib/api/contracts'
+import { generateInternalToken } from '@/lib/auth/internal'
+import { getInternalApiBaseUrl } from '@/lib/core/utils/urls'
+
+/**
+ * Per-request limits. A chunk is flushed when it hits either bound, keeping each
+ * request small enough for one short Presidio pass under a tight timeout and far
+ * below the contract's 100k-entry cap — so large executions split across
+ * requests instead of failing validation.
+ */
+const REQUEST_MAX_BYTES = 256 * 1024
+const REQUEST_MAX_COUNT = 2_000
+/** Slightly above the 30s Python subprocess timeout so a hung app container aborts gracefully. */
+const REQUEST_TIMEOUT_MS = 45_000
+
+/**
+ * Mask PII across many strings via the internal app-container endpoint.
+ *
+ * Presidio (a Python venv) only exists in the app container, but the
+ * log-redaction persist path also runs inside the trigger.dev runtime — so
+ * redaction always routes through HTTP, the same way the guardrails tool does.
+ * Strings are grouped into byte/count-budgeted chunks; order is preserved, so
+ * the returned array matches `texts` length.
+ *
+ * Rejects on any non-2xx, timeout, or shape mismatch so the caller can apply
+ * its own fail-safe (scrubbing rather than leaking).
+ */
+export async function maskPIIBatchViaHttp(
+  texts: string[],
+  entityTypes: string[],
+  language?: string
+): Promise<string[]> {
+  if (texts.length === 0) return []
+
+  const url = `${getInternalApiBaseUrl()}/api/guardrails/mask-batch`
+
+  const masked: string[] = []
+  let batch: string[] = []
+  let batchBytes = 0
+
+  const flush = async () => {
+    if (batch.length === 0) return
+    const out = await postChunk(url, batch, entityTypes, language)
+    if (out.length !== batch.length) {
+      throw new Error('PII mask-batch returned an unexpected result')
+    }
+    for (const item of out) masked.push(item)
+    batch = []
+    batchBytes = 0
+  }
+
+  for (const text of texts) {
+    const bytes = Buffer.byteLength(text, 'utf8')
+    if (
+      batch.length > 0 &&
+      (batch.length >= REQUEST_MAX_COUNT || batchBytes + bytes > REQUEST_MAX_BYTES)
+    ) {
+      await flush()
+    }
+    batch.push(text)
+    batchBytes += bytes
+  }
+  await flush()
+
+  return masked
+}
+
+async function postChunk(
+  url: string,
+  texts: string[],
+  entityTypes: string[],
+  language: string | undefined
+): Promise<string[]> {
+  // Mint per request: a single token (5min TTL) can expire mid-batch when a
+  // large execution fans out into many sequential chunk requests.
+  const token = await generateInternalToken()
+
+  // boundary-raw-fetch: internal server-to-server call to the app container (internal JWT auth, configurable base URL)
+  const response = await fetch(url, {
+    method: 'POST',
+    headers: {
+      'content-type': 'application/json',
+      authorization: `Bearer ${token}`,
+    },
+    body: JSON.stringify({ texts, entityTypes, language }),
+    signal: AbortSignal.timeout(REQUEST_TIMEOUT_MS),
+  })
+
+  if (!response.ok) {
+    const detail = await response.text().catch(() => '')
+    throw new Error(`PII mask-batch request failed (${response.status}): ${detail.slice(0, 200)}`)
+  }
+
+  const data = (await response.json()) as GuardrailsMaskBatchResult
+  if (!Array.isArray(data.masked)) {
+    throw new Error('PII mask-batch returned an unexpected result')
+  }
+  return data.masked
+}
diff --git a/apps/sim/lib/guardrails/setup.sh b/apps/sim/lib/guardrails/setup.sh
index 233e9a51a27..20eba4247ee 100755
--- a/apps/sim/lib/guardrails/setup.sh
+++ b/apps/sim/lib/guardrails/setup.sh
@@ -30,6 +30,11 @@ source "$VENV_DIR/bin/activate"
 pip install --upgrade pip
 pip install -r "$SCRIPT_DIR/requirements.txt"
 
+# Presidio's default AnalyzerEngine loads the en_core_web_lg spaCy model; it is
+# not a pip dependency, so download the version compatible with the installed spaCy.
+echo "Downloading spaCy model (en_core_web_lg)..."
+python -m spacy download en_core_web_lg
+
 echo ""
 echo "✅ Setup complete! Guardrails validators are ready to use."
 echo ""
diff --git a/apps/sim/lib/guardrails/validate_pii.ts b/apps/sim/lib/guardrails/validate_pii.ts
index ba6886bb92d..3e1ec90edb3 100644
--- a/apps/sim/lib/guardrails/validate_pii.ts
+++ b/apps/sim/lib/guardrails/validate_pii.ts
@@ -13,6 +13,33 @@ const DEFAULT_TIMEOUT = 30000 // 30 seconds
  */
 const PII_CHUNK_MAX_BYTES = 256 * 1024
 
+/**
+ * Resolve the guardrails Presidio interpreter + script path.
+ *
+ * `process.cwd()` is not stable across runtimes — the Next standalone container
+ * launches from the monorepo root while local dev and some paths run from
+ * `apps/sim` — so probe both layouts (mirrors the candidate-path resolution in
+ * `lib/execution/isolated-vm.ts`). Requires the bundled venv: throws if it is
+ * absent rather than silently falling back to the system `python3`, which has no
+ * Presidio and reports a misleading "not installed".
+ */
+function resolveGuardrailsPython(): { pythonCmd: string; scriptPath: string } {
+  const candidateDirs = [
+    path.join(process.cwd(), 'apps', 'sim', 'lib', 'guardrails'),
+    path.join(process.cwd(), 'lib', 'guardrails'),
+  ]
+  for (const dir of candidateDirs) {
+    const venvPython = path.join(dir, 'venv', 'bin', 'python3')
+    if (fs.existsSync(venvPython)) {
+      return { pythonCmd: venvPython, scriptPath: path.join(dir, 'validate_pii.py') }
+    }
+  }
+  const probed = candidateDirs.map((d) => path.join(d, 'venv', 'bin', 'python3')).join(', ')
+  throw new Error(
+    `Guardrails Presidio venv not found (looked in ${probed}). Provision it with apps/sim/lib/guardrails/setup.sh locally, or verify the image build installs it.`
+  )
+}
+
 export interface PIIValidationInput {
   text: string
   entityTypes: string[] // e.g., ["PERSON", "EMAIL_ADDRESS", "CREDIT_CARD"]
@@ -136,10 +163,7 @@ export async function maskPIIBatch(
  */
 function runPythonScript(payload: Record): Promise {
   return new Promise((resolve, reject) => {
-    const guardrailsDir = path.join(process.cwd(), 'lib/guardrails')
-    const scriptPath = path.join(guardrailsDir, 'validate_pii.py')
-    const venvPython = path.join(guardrailsDir, 'venv/bin/python3')
-    const pythonCmd = fs.existsSync(venvPython) ? venvPython : 'python3'
+    const { pythonCmd, scriptPath } = resolveGuardrailsPython()
     const python = spawn(pythonCmd, [scriptPath])
 
     let stdout = ''
@@ -208,14 +232,7 @@ async function executePythonPIIDetection(
   requestId: string
 ): Promise {
   return new Promise((resolve, reject) => {
-    // Use path relative to project root
-    // In Next.js, process.cwd() returns the project root
-    const guardrailsDir = path.join(process.cwd(), 'lib/guardrails')
-    const scriptPath = path.join(guardrailsDir, 'validate_pii.py')
-    const venvPython = path.join(guardrailsDir, 'venv/bin/python3')
-
-    // Use venv Python if it exists, otherwise fall back to system python3
-    const pythonCmd = fs.existsSync(venvPython) ? venvPython : 'python3'
+    const { pythonCmd, scriptPath } = resolveGuardrailsPython()
 
     const python = spawn(pythonCmd, [scriptPath])
 
diff --git a/apps/sim/lib/logs/execution/pii-redaction.test.ts b/apps/sim/lib/logs/execution/pii-redaction.test.ts
index dccbc59cc38..5a2da7a5996 100644
--- a/apps/sim/lib/logs/execution/pii-redaction.test.ts
+++ b/apps/sim/lib/logs/execution/pii-redaction.test.ts
@@ -7,8 +7,8 @@ const { mockMaskPIIBatch } = vi.hoisted(() => ({
   mockMaskPIIBatch: vi.fn(),
 }))
 
-vi.mock('@/lib/guardrails/validate_pii', () => ({
-  maskPIIBatch: mockMaskPIIBatch,
+vi.mock('@/lib/guardrails/mask-client', () => ({
+  maskPIIBatchViaHttp: mockMaskPIIBatch,
 }))
 
 import { REDACTION_FAILED_MARKER, redactPIIFromExecution } from '@/lib/logs/execution/pii-redaction'
diff --git a/apps/sim/lib/logs/execution/pii-redaction.ts b/apps/sim/lib/logs/execution/pii-redaction.ts
index 7b4794fd483..8cd0fac5326 100644
--- a/apps/sim/lib/logs/execution/pii-redaction.ts
+++ b/apps/sim/lib/logs/execution/pii-redaction.ts
@@ -1,5 +1,6 @@
 import { createLogger } from '@sim/logger'
 import { getErrorMessage } from '@sim/utils/errors'
+import { maskPIIBatchViaHttp } from '@/lib/guardrails/mask-client'
 
 const logger = createLogger('PiiRedaction')
 
@@ -158,11 +159,9 @@ export async function redactPIIFromExecution(
     masked = collected.map(() => REDACTION_FAILED_MARKER)
   } else {
     try {
-      // Lazy import keeps the Python-spawning guardrails module (child_process +
-      // a `lib/guardrails` dir reference) out of the static middleware/RSC graph;
-      // it's only loaded at runtime on the Node log-persist path.
-      const { maskPIIBatch } = await import('@/lib/guardrails/validate_pii')
-      masked = await maskPIIBatch(collected, entityTypes, language)
+      // Presidio runs only in the app container; the persist path also runs in
+      // the trigger.dev runtime, so masking always goes over HTTP to the app.
+      masked = await maskPIIBatchViaHttp(collected, entityTypes, language)
     } catch (error) {
       logger.error('PII masking failed; scrubbing text to avoid leaking PII', {
         error: getErrorMessage(error),
diff --git a/docker/app.Dockerfile b/docker/app.Dockerfile
index 67eb5f02c77..2323ebb1df4 100644
--- a/docker/app.Dockerfile
+++ b/docker/app.Dockerfile
@@ -118,11 +118,14 @@ COPY --from=builder --chown=nextjs:nodejs /app/apps/sim/lib/execution/sandbox/bu
 COPY --from=builder --chown=nextjs:nodejs /app/apps/sim/lib/guardrails/requirements.txt ./apps/sim/lib/guardrails/requirements.txt
 COPY --from=builder --chown=nextjs:nodejs /app/apps/sim/lib/guardrails/validate_pii.py ./apps/sim/lib/guardrails/validate_pii.py
 
-# Install Python dependencies with pip cache mount for faster rebuilds
+# Install Python dependencies with pip cache mount for faster rebuilds.
+# Presidio's default AnalyzerEngine loads en_core_web_lg, which is not a pip
+# dependency — download the spaCy model into the venv after installing Presidio.
 RUN --mount=type=cache,target=/root/.cache/pip \
     python3 -m venv ./apps/sim/lib/guardrails/venv && \
     ./apps/sim/lib/guardrails/venv/bin/pip install --upgrade pip && \
     ./apps/sim/lib/guardrails/venv/bin/pip install -r ./apps/sim/lib/guardrails/requirements.txt && \
+    ./apps/sim/lib/guardrails/venv/bin/python -m spacy download en_core_web_lg && \
     chown -R nextjs:nodejs /app/apps/sim/lib/guardrails
 
 # Create .next/cache directory with correct ownership
diff --git a/scripts/check-api-validation-contracts.ts b/scripts/check-api-validation-contracts.ts
index 09744c629ba..17f0a25fa29 100644
--- a/scripts/check-api-validation-contracts.ts
+++ b/scripts/check-api-validation-contracts.ts
@@ -9,8 +9,8 @@ const QUERY_HOOKS_DIR = path.join(ROOT, 'apps/sim/hooks/queries')
 const SELECTOR_HOOKS_DIR = path.join(ROOT, 'apps/sim/hooks/selectors')
 
 const BASELINE = {
-  totalRoutes: 859,
-  zodRoutes: 859,
+  totalRoutes: 860,
+  zodRoutes: 860,
   nonZodRoutes: 0,
 } as const
 