From 5f5326f9371084ba78a39b29008639f8baa1372b Mon Sep 17 00:00:00 2001 From: eugenegujing Date: Thu, 14 May 2026 19:07:29 -0700 Subject: [PATCH 01/14] feat(agent-service): add DataGuard types and profiler First checkpoint for the DataGuard data-cleaning agent: adds the shared types contract (DataQualityIssue, FixProposal, DecisionLogEntry, AutoAllowRule, plus supporting unions) and the read-only profile_dataset scanner with four heuristics (missing, placeholder, duplicate-ID, out-of-range). DataGuard auto-launches when a dataset is added to the workflow and asks Claude-Code-style approval before applying each fix; see README_DataGuard_Texera.md. --- .../src/agent/tools/dataguard/dataset.ts | 26 ++ .../tools/dataguard/profile-dataset.test.ts | 209 ++++++++++++++++ .../agent/tools/dataguard/profile-dataset.ts | 236 ++++++++++++++++++ agent-service/src/types/dataguard.test.ts | 219 ++++++++++++++++ agent-service/src/types/dataguard.ts | 101 ++++++++ agent-service/src/types/index.ts | 1 + 6 files changed, 792 insertions(+) create mode 100644 agent-service/src/agent/tools/dataguard/dataset.ts create mode 100644 agent-service/src/agent/tools/dataguard/profile-dataset.test.ts create mode 100644 agent-service/src/agent/tools/dataguard/profile-dataset.ts create mode 100644 agent-service/src/types/dataguard.test.ts create mode 100644 agent-service/src/types/dataguard.ts diff --git a/agent-service/src/agent/tools/dataguard/dataset.ts b/agent-service/src/agent/tools/dataguard/dataset.ts new file mode 100644 index 00000000000..6d1d5fdb32a --- /dev/null +++ b/agent-service/src/agent/tools/dataguard/dataset.ts @@ -0,0 +1,26 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +// In-memory tabular dataset view shared across the DataGuard tools +// (profile_dataset / suggest_fix / apply_fix). Source-agnostic: rows can come +// from a parsed CSV, a Texera operator result, or a fixture used in tests. +export interface DatasetView { + columns: string[]; + rows: Record[]; +} diff --git a/agent-service/src/agent/tools/dataguard/profile-dataset.test.ts b/agent-service/src/agent/tools/dataguard/profile-dataset.test.ts new file mode 100644 index 00000000000..d48d828c98d --- /dev/null +++ b/agent-service/src/agent/tools/dataguard/profile-dataset.test.ts @@ -0,0 +1,209 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import { describe, expect, test } from "bun:test"; +import { profileDataset } from "./profile-dataset"; +import type { DatasetView } from "./dataset"; + +describe("profileDataset", () => { + test("clean dataset → empty issue list", () => { + const ds: DatasetView = { + columns: ["age", "name"], + rows: [ + { age: 25, name: "Alice" }, + { age: 30, name: "Bob" }, + ], + }; + expect(profileDataset(ds)).toEqual([]); + }); + + test("empty dataset → empty issue list", () => { + expect(profileDataset({ columns: [], rows: [] })).toEqual([]); + }); + + test("detects missing values per column (null + empty string)", () => { + const ds: DatasetView = { + columns: ["age", "name"], + rows: [ + { age: 25, name: "Alice" }, + { age: null, name: "Bob" }, + { age: 30, name: "" }, + ], + }; + const issues = profileDataset(ds); + const ageMiss = issues.find(i => i.issueType === "missing_value" && i.column === "age"); + const nameMiss = issues.find(i => i.issueType === "missing_value" && i.column === "name"); + expect(ageMiss).toBeDefined(); + expect(ageMiss!.affectedRowCount).toBe(1); + expect(nameMiss).toBeDefined(); + expect(nameMiss!.affectedRowCount).toBe(1); + }); + + test("treats configured missing tokens as missing", () => { + const ds: DatasetView = { + columns: ["x"], + rows: [{ x: "ok" }, { x: "N/A" }, { x: "NA" }, { x: "ok" }], + }; + const issues = profileDataset(ds); + const miss = issues.find(i => i.issueType === "missing_value"); + expect(miss).toBeDefined(); + expect(miss!.affectedRowCount).toBe(2); + }); + + test("NaN counts as missing", () => { + const ds: DatasetView = { + columns: ["x"], + rows: [{ x: 1 }, { x: Number.NaN }, { x: 3 }], + }; + const issues = profileDataset(ds); + const miss = issues.find(i => i.issueType === "missing_value"); + expect(miss).toBeDefined(); + expect(miss!.affectedRowCount).toBe(1); + }); + + test("detects 999 as 
placeholder in numeric column (default placeholder list)", () => { + const ds: DatasetView = { + columns: ["age"], + rows: [ + { age: 25 }, { age: 999 }, { age: 30 }, { age: 999 }, { age: 999 }, + ], + }; + const issues = profileDataset(ds); + const ph = issues.find(i => i.issueType === "placeholder_value"); + expect(ph).toBeDefined(); + expect(ph!.column).toBe("age"); + expect(ph!.affectedRowCount).toBe(3); + }); + + test("custom placeholder list overrides default", () => { + const ds: DatasetView = { + columns: ["status"], + rows: [ + { status: "ok" }, { status: "missing" }, { status: "ok" }, { status: "missing" }, + ], + }; + const issues = profileDataset(ds, { placeholderValues: ["missing"] }); + const ph = issues.find(i => i.issueType === "placeholder_value"); + expect(ph).toBeDefined(); + expect(ph!.affectedRowCount).toBe(2); + }); + + test("idColumn → detects duplicate IDs", () => { + const ds: DatasetView = { + columns: ["sample_id", "value"], + rows: [ + { sample_id: "S1", value: 1 }, + { sample_id: "S2", value: 2 }, + { sample_id: "S1", value: 99 }, + { sample_id: "S3", value: 3 }, + ], + }; + const issues = profileDataset(ds, { idColumn: "sample_id" }); + const dup = issues.find(i => i.issueType === "duplicate_id"); + expect(dup).toBeDefined(); + expect(dup!.column).toBe("sample_id"); + expect(dup!.affectedRowCount).toBe(2); + }); + + test("no idColumn → no duplicate_id issue even with repeated values", () => { + const ds: DatasetView = { + columns: ["x"], + rows: [{ x: 1 }, { x: 1 }, { x: 1 }], + }; + const issues = profileDataset(ds); + expect(issues.find(i => i.issueType === "duplicate_id")).toBeUndefined(); + }); + + test("validRanges → detects out-of-range values", () => { + const ds: DatasetView = { + columns: ["bmi"], + rows: [ + { bmi: 25.5 }, { bmi: 65 }, { bmi: 72 }, { bmi: 22 }, + ], + }; + const issues = profileDataset(ds, { validRanges: { bmi: { min: 10, max: 60 } } }); + const oor = issues.find(i => i.issueType === "out_of_range"); + 
expect(oor).toBeDefined(); + expect(oor!.affectedRowCount).toBe(2); + }); + + test("placeholder values are not double-counted as out_of_range", () => { + const ds: DatasetView = { + columns: ["age"], + rows: [{ age: 25 }, { age: 999 }, { age: 30 }], + }; + const issues = profileDataset(ds, { validRanges: { age: { min: 0, max: 130 } } }); + expect(issues.find(i => i.issueType === "out_of_range")).toBeUndefined(); + expect(issues.find(i => i.issueType === "placeholder_value")).toBeDefined(); + }); + + test("affectedRowIndices included when small (≤50)", () => { + const ds: DatasetView = { + columns: ["age"], + rows: [{ age: 999 }, { age: 25 }, { age: 999 }], + }; + const issues = profileDataset(ds); + const ph = issues.find(i => i.issueType === "placeholder_value")!; + expect(ph.affectedRowIndices).toEqual([0, 2]); + }); + + test("affectedRowIndices omitted for large sets (>50)", () => { + const rows = Array.from({ length: 100 }, (_, i) => ({ x: i < 60 ? null : i })); + const ds: DatasetView = { columns: ["x"], rows }; + const issues = profileDataset(ds); + const miss = issues.find(i => i.issueType === "missing_value")!; + expect(miss.affectedRowCount).toBe(60); + expect(miss.affectedRowIndices).toBeUndefined(); + }); + + test("each emitted issue has a distinct issueId", () => { + const ds: DatasetView = { + columns: ["a", "b"], + rows: [{ a: null, b: null }], + }; + const issues = profileDataset(ds); + const ids = new Set(issues.map(i => i.issueId)); + expect(ids.size).toBe(issues.length); + expect(issues.length).toBeGreaterThan(0); + }); + + test("realistic diabetes fixture surfaces 4 issue categories", () => { + // Mirrors the §5 storyboard's diabetes_messy.csv: placeholder ages, + // missing glucose, duplicate sample IDs, and biologically impossible BMI. 
+ const ds: DatasetView = { + columns: ["sample_id", "age", "glucose", "bmi", "group"], + rows: [ + { sample_id: "S1", age: 45, glucose: 110, bmi: 28, group: "A" }, + { sample_id: "S2", age: 999, glucose: null, bmi: 30, group: "A" }, + { sample_id: "S1", age: 45, glucose: 120, bmi: 27, group: "A" }, // dup ID + { sample_id: "S3", age: 50, glucose: null, bmi: 65, group: "B" }, // impossible BMI + { sample_id: "S4", age: 999, glucose: 140, bmi: 31, group: "B" }, + ], + }; + const issues = profileDataset(ds, { + idColumn: "sample_id", + validRanges: { age: { min: 0, max: 120 }, bmi: { min: 10, max: 60 } }, + }); + const kinds = new Set(issues.map(i => i.issueType)); + expect(kinds.has("placeholder_value")).toBe(true); + expect(kinds.has("missing_value")).toBe(true); + expect(kinds.has("duplicate_id")).toBe(true); + expect(kinds.has("out_of_range")).toBe(true); + }); +}); diff --git a/agent-service/src/agent/tools/dataguard/profile-dataset.ts b/agent-service/src/agent/tools/dataguard/profile-dataset.ts new file mode 100644 index 00000000000..08cd8723bd2 --- /dev/null +++ b/agent-service/src/agent/tools/dataguard/profile-dataset.ts @@ -0,0 +1,236 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +// Read-only scanner that returns DataQualityIssue[] for a dataset. +// No LLM calls — pure heuristics. Safe to auto-run on dataset-add. + +import type { DataQualityIssue } from "../../../types/dataguard"; +import type { DatasetView } from "./dataset"; + +export interface ProfileOptions { + // Column to treat as a unique identifier; duplicates flagged as duplicate_id. + idColumn?: string; + // Hard valid range per numeric column (e.g., { age: { min: 0, max: 130 } }). + validRanges?: Record; + // Sentinel values that should be treated as placeholders rather than data. + placeholderValues?: Array; + // String tokens that should be treated as missing in addition to null/undefined/NaN/"". + missingTokens?: string[]; + // Above this row count, affectedRowIndices is omitted from the issue. + maxIndicesInIssue?: number; +} + +// Placeholders are sentinel *values* that look like data but are actually +// "no value." Kept distinct from missing-tokens to avoid double-flagging +// the same cell under two issue types. +const DEFAULT_PLACEHOLDERS: Array = [ + 999, + -1, + "unknown", + "Unknown", +]; + +// Tokens that mean "no data was recorded." Empty string is always treated +// as missing without needing to be listed. 
+const DEFAULT_MISSING_TOKENS: string[] = [ + "NA", + "N/A", + "n/a", + "null", + "NULL", + "None", +]; + +const DEFAULT_MAX_INDICES_IN_ISSUE = 50; + +let issueCounter = 0; +function nextIssueId(): string { + issueCounter += 1; + return `iss-${Date.now()}-${issueCounter}`; +} + +function nowIso(): string { + return new Date().toISOString(); +} + +function isMissing(value: unknown, missingTokens: string[]): boolean { + if (value === null || value === undefined) return true; + if (typeof value === "number" && Number.isNaN(value)) return true; + if (typeof value === "string") { + if (value === "") return true; + if (missingTokens.includes(value)) return true; + } + return false; +} + +function toNumber(value: unknown): number | undefined { + if (typeof value === "number" && Number.isFinite(value)) return value; + if (typeof value === "string" && value.trim() !== "") { + const n = Number(value); + if (Number.isFinite(n)) return n; + } + return undefined; +} + +function placeholderHit( + value: unknown, + placeholders: Array +): string | number | undefined { + for (const p of placeholders) { + if (typeof p === "string" && typeof value === "string" && p === value) return p; + if (typeof p === "number") { + const n = toNumber(value); + if (n !== undefined && n === p) return p; + } + } + return undefined; +} + +function maybeIndices( + indices: number[], + cap: number +): number[] | undefined { + return indices.length <= cap ? indices : undefined; +} + +export function profileDataset( + dataset: DatasetView, + options: ProfileOptions = {} +): DataQualityIssue[] { + const placeholders = options.placeholderValues ?? DEFAULT_PLACEHOLDERS; + const missingTokens = options.missingTokens ?? DEFAULT_MISSING_TOKENS; + const indexCap = options.maxIndicesInIssue ?? 
DEFAULT_MAX_INDICES_IN_ISSUE; + const detectedAt = nowIso(); + const issues: DataQualityIssue[] = []; + + // Pre-compute placeholder hits per row/column so out_of_range can avoid + // double-counting and missing-value can avoid flagging a row that has a + // string placeholder like "N/A" twice. + const placeholderHitByColRow = new Map>(); + for (const col of dataset.columns) { + const map = new Map(); + for (let i = 0; i < dataset.rows.length; i++) { + const hit = placeholderHit(dataset.rows[i][col], placeholders); + if (hit !== undefined) map.set(i, hit); + } + placeholderHitByColRow.set(col, map); + } + + // Missing-value detector. + for (const col of dataset.columns) { + const missingIndices: number[] = []; + for (let i = 0; i < dataset.rows.length; i++) { + if (isMissing(dataset.rows[i][col], missingTokens)) missingIndices.push(i); + } + if (missingIndices.length === 0) continue; + const pct = (missingIndices.length / Math.max(dataset.rows.length, 1)) * 100; + issues.push({ + issueId: nextIssueId(), + issueType: "missing_value", + column: col, + description: `${missingIndices.length} row(s) have missing ${col}`, + evidence: `Missing: ${missingIndices.length} of ${dataset.rows.length} (${pct.toFixed(1)}%)`, + affectedRowCount: missingIndices.length, + affectedRowIndices: maybeIndices(missingIndices, indexCap), + detectedAt, + }); + } + + // Placeholder-value detector. 
+ for (const col of dataset.columns) { + const hits = placeholderHitByColRow.get(col)!; + if (hits.size === 0) continue; + const indices = Array.from(hits.keys()).sort((a, b) => a - b); + const distinctValues = Array.from(new Set(hits.values())); + issues.push({ + issueId: nextIssueId(), + issueType: "placeholder_value", + column: col, + description: `${indices.length} row(s) in ${col} contain placeholder value(s): ${distinctValues.join(", ")}`, + evidence: `Placeholder(s) ${distinctValues.join(", ")} appear ${indices.length} time(s) in ${col}.`, + affectedRowCount: indices.length, + affectedRowIndices: maybeIndices(indices, indexCap), + detectedAt, + }); + } + + // Duplicate-ID detector (only when idColumn is configured and exists). + if (options.idColumn && dataset.columns.includes(options.idColumn)) { + const idCol = options.idColumn; + const positions = new Map(); + for (let i = 0; i < dataset.rows.length; i++) { + const v = dataset.rows[i][idCol]; + if (v === null || v === undefined) continue; + const key = String(v); + const existing = positions.get(key); + if (existing) existing.push(i); + else positions.set(key, [i]); + } + const duplicateIndices: number[] = []; + const duplicateKeys: string[] = []; + for (const [key, rows] of positions) { + if (rows.length > 1) { + duplicateIndices.push(...rows); + duplicateKeys.push(key); + } + } + if (duplicateIndices.length > 0) { + duplicateIndices.sort((a, b) => a - b); + issues.push({ + issueId: nextIssueId(), + issueType: "duplicate_id", + column: idCol, + description: `${duplicateKeys.length} duplicate ID(s) in ${idCol} affecting ${duplicateIndices.length} row(s)`, + evidence: `Duplicate keys (showing up to 5): ${duplicateKeys.slice(0, 5).join(", ")}`, + affectedRowCount: duplicateIndices.length, + affectedRowIndices: maybeIndices(duplicateIndices, indexCap), + detectedAt, + }); + } + } + + // Out-of-range detector — skips rows already flagged as placeholders so we + // don't surface the same row under two issue 
types. + if (options.validRanges) { + for (const [col, range] of Object.entries(options.validRanges)) { + if (!dataset.columns.includes(col)) continue; + const placeholderHits = placeholderHitByColRow.get(col)!; + const oorIndices: number[] = []; + for (let i = 0; i < dataset.rows.length; i++) { + if (placeholderHits.has(i)) continue; + const v = toNumber(dataset.rows[i][col]); + if (v === undefined) continue; + if (v < range.min || v > range.max) oorIndices.push(i); + } + if (oorIndices.length === 0) continue; + issues.push({ + issueId: nextIssueId(), + issueType: "out_of_range", + column: col, + description: `${oorIndices.length} row(s) in ${col} fall outside [${range.min}, ${range.max}]`, + evidence: `Valid range: [${range.min}, ${range.max}]; violations: ${oorIndices.length}.`, + affectedRowCount: oorIndices.length, + affectedRowIndices: maybeIndices(oorIndices, indexCap), + detectedAt, + }); + } + } + + return issues; +} diff --git a/agent-service/src/types/dataguard.test.ts b/agent-service/src/types/dataguard.test.ts new file mode 100644 index 00000000000..c256c7e4709 --- /dev/null +++ b/agent-service/src/types/dataguard.test.ts @@ -0,0 +1,219 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +import { describe, expect, test } from "bun:test"; +import type { + AutoAllowRule, + Confidence, + DataQualityIssue, + DecisionLogEntry, + FixOperationKind, + FixProposal, + IssueType, + PermissionDecision, + RiskTier, + Verdict, +} from "./dataguard"; + +// These tests double as runtime fixtures for downstream tools. Each shape is +// instantiated with realistic data drawn from the design doc's §5 storyboard +// so that any drift in the type definitions surfaces here first. + +describe("DataGuard type shapes", () => { + test("DataQualityIssue: placeholder-value example (high-affinity, small N)", () => { + const issue: DataQualityIssue = { + issueId: "iss-1", + issueType: "placeholder_value", + column: "age", + description: "5 rows have age = 999", + evidence: "5 of 5 rows with age=999 have no other anomalies.", + affectedRowCount: 5, + affectedRowIndices: [10, 42, 77, 199, 412], + detectedAt: "2026-05-14T12:00:00.000Z", + }; + expect(issue.issueType).toBe("placeholder_value"); + expect(issue.affectedRowCount).toBe(5); + expect(issue.affectedRowIndices).toHaveLength(5); + }); + + test("DataQualityIssue: missing-value example without row indices (large N)", () => { + const issue: DataQualityIssue = { + issueId: "iss-2", + issueType: "missing_value", + column: "glucose", + description: "17 missing glucose values, 14 in Group A", + evidence: "Group A: 14 of 200 rows missing; Group B: 3 of 200 rows missing.", + affectedRowCount: 17, + detectedAt: "2026-05-14T12:00:01.000Z", + }; + expect(issue.affectedRowIndices).toBeUndefined(); + }); + + test("FixProposal: replace-value, medium risk, high confidence", () => { + const proposal: FixProposal = { + issueId: "iss-1", + action: "Replace age = 999 with NULL", + operationKind: "replace_value", + operationParams: { column: "age", match: 999, replacement: null }, + riskTier: "medium", + reason: "999 is outside the valid human-age range and appears to be a placeholder.", + evidence: "5 of 5 rows with age=999 have no 
other anomalies.", + confidence: "high", + targetRowCount: 5, + }; + expect(proposal.riskTier).toBe("medium"); + expect(proposal.operationKind).toBe("replace_value"); + expect(proposal.operationParams).toMatchObject({ column: "age", match: 999 }); + }); + + test("FixProposal: drop-rows, high risk (the storyboard 'deny' case)", () => { + const proposal: FixProposal = { + issueId: "iss-3", + action: "Drop 3 rows with BMI > 60", + operationKind: "drop_rows", + operationParams: { rowIndices: [55, 211, 433] }, + riskTier: "high", + reason: "Extreme outliers may be data-entry errors.", + evidence: "3 rows have BMI > 60 (clinical maximum ~70).", + confidence: "low", + targetRowCount: 3, + }; + expect(proposal.riskTier).toBe("high"); + }); + + test("DecisionLogEntry: allowed and applied", () => { + const entry: DecisionLogEntry = { + decisionId: "dec-1", + timestamp: "2026-05-14T12:00:30.000Z", + issueType: "placeholder_value", + targetRowCount: 5, + proposedAction: "Replace age = 999 with NULL", + userDecision: "allow", + reason: "999 outside valid age range.", + confidence: "high", + appliedAt: "2026-05-14T12:00:31.123Z", + }; + expect(entry.userDecision).toBe("allow"); + expect(entry.appliedAt).toBeDefined(); + expect(entry.modifiedAction).toBeUndefined(); + }); + + test("DecisionLogEntry: denied — no appliedAt", () => { + const entry: DecisionLogEntry = { + decisionId: "dec-2", + timestamp: "2026-05-14T12:01:00.000Z", + issueType: "outlier", + targetRowCount: 3, + proposedAction: "Drop 3 rows with BMI > 60", + userDecision: "deny", + reason: "User flagged these as meaningful clinical cases.", + confidence: "low", + }; + expect(entry.userDecision).toBe("deny"); + expect(entry.appliedAt).toBeUndefined(); + }); + + test("DecisionLogEntry: modified — carries modifiedAction", () => { + const entry: DecisionLogEntry = { + decisionId: "dec-3", + timestamp: "2026-05-14T12:02:00.000Z", + issueType: "missing_value", + targetRowCount: 17, + proposedAction: "Impute missing glucose 
with group median", + userDecision: "modify", + modifiedAction: "Flag for manual review", + reason: "Imbalance across groups makes imputation risky.", + confidence: "medium", + appliedAt: "2026-05-14T12:02:05.000Z", + }; + expect(entry.userDecision).toBe("modify"); + expect(entry.modifiedAction).toBe("Flag for manual review"); + }); + + test("AutoAllowRule: per-issue-type policy", () => { + const rule: AutoAllowRule = { + ruleId: "rule-1", + issueType: "placeholder_value", + createdAt: "2026-05-14T12:00:30.000Z", + }; + expect(rule.issueType).toBe("placeholder_value"); + }); + + test("PermissionDecision: allow with remember=true triggers a rule write", () => { + const decision: PermissionDecision = { + stepId: "step-43", + verdict: "allow", + remember: true, + }; + expect(decision.remember).toBe(true); + expect(decision.modifiedAction).toBeUndefined(); + }); + + test("PermissionDecision: modify with modifiedAction", () => { + const decision: PermissionDecision = { + stepId: "step-42", + verdict: "modify", + modifiedAction: "Flag for manual review instead of impute", + }; + expect(decision.verdict).toBe("modify"); + expect(decision.modifiedAction).toBeDefined(); + }); + + test("PermissionDecision: deny", () => { + const decision: PermissionDecision = { + stepId: "step-44", + verdict: "deny", + }; + expect(decision.verdict).toBe("deny"); + }); + + test("Literal unions accept all documented members", () => { + const risks: RiskTier[] = ["low", "medium", "high"]; + const confidences: Confidence[] = ["low", "medium", "high"]; + const issueTypes: IssueType[] = [ + "placeholder_value", + "missing_value", + "duplicate_id", + "out_of_range", + "outlier", + "inconsistent_label", + ]; + const opKinds: FixOperationKind[] = [ + "replace_value", + "drop_rows", + "impute", + "flag", + "standardize", + "trim_whitespace", + "rename_column", + ]; + const verdicts: Verdict[] = [ + "allow", + "deny", + "modify", + "auto_allow_low_risk", + "auto_allow_remembered", + ]; + 
expect(risks).toHaveLength(3); + expect(confidences).toHaveLength(3); + expect(issueTypes).toHaveLength(6); + expect(opKinds).toHaveLength(7); + expect(verdicts).toHaveLength(5); + }); +}); diff --git a/agent-service/src/types/dataguard.ts b/agent-service/src/types/dataguard.ts new file mode 100644 index 00000000000..413e2e37814 --- /dev/null +++ b/agent-service/src/types/dataguard.ts @@ -0,0 +1,101 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +// Shared types for the DataGuard agent: the contract between the four +// DataGuard tools (profile_dataset / suggest_fix / apply_fix / write_decision_log), +// the agent's permission-gating layer, and the chat-panel approval UI. 
+ +export type RiskTier = "low" | "medium" | "high"; + +export type Confidence = "low" | "medium" | "high"; + +export type IssueType = + | "placeholder_value" + | "missing_value" + | "duplicate_id" + | "out_of_range" + | "outlier" + | "inconsistent_label"; + +export type FixOperationKind = + | "replace_value" + | "drop_rows" + | "impute" + | "flag" + | "standardize" + | "trim_whitespace" + | "rename_column"; + +export type Verdict = + | "allow" + | "deny" + | "modify" + | "auto_allow_low_risk" + | "auto_allow_remembered"; + +export interface DataQualityIssue { + issueId: string; + issueType: IssueType; + column: string; + description: string; + evidence: string; + affectedRowCount: number; + // Present only when the affected set is small enough to enumerate; otherwise + // omit and rely on `evidence` for a sample / aggregate description. + affectedRowIndices?: number[]; + detectedAt: string; +} + +export interface FixProposal { + issueId: string; + action: string; + operationKind: FixOperationKind; + operationParams: Record; + riskTier: RiskTier; + reason: string; + evidence: string; + confidence: Confidence; + targetRowCount: number; +} + +export interface PermissionDecision { + stepId: string; + verdict: Verdict; + modifiedAction?: string; + remember?: boolean; +} + +export interface DecisionLogEntry { + decisionId: string; + timestamp: string; + issueType: IssueType; + targetRowCount: number; + proposedAction: string; + userDecision: Verdict; + modifiedAction?: string; + reason: string; + confidence: Confidence; + appliedAt?: string; +} + +export interface AutoAllowRule { + ruleId: string; + issueType: IssueType; + createdAt: string; +} diff --git a/agent-service/src/types/index.ts b/agent-service/src/types/index.ts index c6d7291e51d..1fd76593d69 100644 --- a/agent-service/src/types/index.ts +++ b/agent-service/src/types/index.ts @@ -20,3 +20,4 @@ export * from "./workflow"; export * from "./execution"; export * from "./agent"; +export * from "./dataguard"; From 
076b708bc1eba75fd40d6f9cee28fb2a581fcaa4 Mon Sep 17 00:00:00 2001 From: eugenegujing Date: Thu, 14 May 2026 19:34:15 -0700 Subject: [PATCH 02/14] feat(agent-service): complete DataGuard backend with permission gating Finishes the agent-side DataGuard MVP: LLM-driven suggest_fix, mutating apply_fix, the requestApproval/awaitDecision/resolveDecision gating layer on TexeraAgent (pendingApproval step + WS decision message + auto-allow "remember" rules), write_decision_log (RFC-4180 CSV), bias-check (per-group retention), and a ~50-row polluted diabetes demo CSV. 122 new test cases, all four DataGuard tools registered into createTools(). --- agent-service/demo/README.md | 40 +++ agent-service/demo/diabetes_messy.csv | 51 ++++ agent-service/src/agent/texera-agent.ts | 128 +++++++- .../agent/tools/dataguard/apply-fix.test.ts | 280 ++++++++++++++++++ .../src/agent/tools/dataguard/apply-fix.ts | 205 +++++++++++++ .../agent/tools/dataguard/bias-check.test.ts | 106 +++++++ .../src/agent/tools/dataguard/bias-check.ts | 131 ++++++++ .../tools/dataguard/dataguard-session.test.ts | 119 ++++++++ .../tools/dataguard/dataguard-session.ts | 151 ++++++++++ .../agent/tools/dataguard/dataguard-tools.ts | 190 ++++++++++++ .../tools/dataguard/decision-log.test.ts | 98 ++++++ .../src/agent/tools/dataguard/decision-log.ts | 89 ++++++ .../agent/tools/dataguard/suggest-fix.test.ts | 150 ++++++++++ .../src/agent/tools/dataguard/suggest-fix.ts | 137 +++++++++ .../tools/dataguard/with-approval.test.ts | 137 +++++++++ .../agent/tools/dataguard/with-approval.ts | 57 ++++ agent-service/src/server.ts | 65 +++- agent-service/src/types/agent.ts | 11 + agent-service/src/types/dataguard.test.ts | 2 + agent-service/src/types/dataguard.ts | 1 + 20 files changed, 2145 insertions(+), 3 deletions(-) create mode 100644 agent-service/demo/README.md create mode 100644 agent-service/demo/diabetes_messy.csv create mode 100644 agent-service/src/agent/tools/dataguard/apply-fix.test.ts create mode 100644 
agent-service/src/agent/tools/dataguard/apply-fix.ts create mode 100644 agent-service/src/agent/tools/dataguard/bias-check.test.ts create mode 100644 agent-service/src/agent/tools/dataguard/bias-check.ts create mode 100644 agent-service/src/agent/tools/dataguard/dataguard-session.test.ts create mode 100644 agent-service/src/agent/tools/dataguard/dataguard-session.ts create mode 100644 agent-service/src/agent/tools/dataguard/dataguard-tools.ts create mode 100644 agent-service/src/agent/tools/dataguard/decision-log.test.ts create mode 100644 agent-service/src/agent/tools/dataguard/decision-log.ts create mode 100644 agent-service/src/agent/tools/dataguard/suggest-fix.test.ts create mode 100644 agent-service/src/agent/tools/dataguard/suggest-fix.ts create mode 100644 agent-service/src/agent/tools/dataguard/with-approval.test.ts create mode 100644 agent-service/src/agent/tools/dataguard/with-approval.ts diff --git a/agent-service/demo/README.md b/agent-service/demo/README.md new file mode 100644 index 00000000000..968987a0703 --- /dev/null +++ b/agent-service/demo/README.md @@ -0,0 +1,40 @@ +# DataGuard demo dataset + +`diabetes_messy.csv` — a deliberately polluted ~50-row sample modeled on the +UCI Pima Indians Diabetes dataset. Each row is one subject; the rightmost +column (`diabetic_outcome`) is the label. 
+
+## Injected issues (every issue type DataGuard detects)
+
+| # | Issue type | Rows | What's wrong |
+|---|---|---|---|
+| 1 | `placeholder_value` | S004, S012 | `age = 999` (sentinel) |
+| 2 | `placeholder_value` | S043 | `bmi = -1` (sentinel) |
+| 3 | `placeholder_value` | S047 | `age = "Unknown"` (string sentinel) |
+| 4 | `missing_value` | S005, S007, S009, S014 | empty `glucose` in Group A (imbalanced) |
+| 5 | `missing_value` | S045, S046, S048 | `age = "N/A"`, `glucose = " "`, `glucose = "null"` |
+| 6 | `duplicate_id` | S001, S017 | sample_id repeats with conflicting `diabetic_outcome` |
+| 7 | `out_of_range` | S041, S042, S044 | `bmi > 60` (clinical max ~70 — possibly real, possibly error) |
+
+## How to load into DataGuard
+
+```bash
+# After bun install + bun run dev in agent-service/
+# Replace <agent-id> with the id of a running agent.
+curl -X POST http://localhost:8000/api/agents/<agent-id>/dataguard/dataset \
+  -H "Content-Type: application/json" \
+  -d "$(jq -nR --rawfile c diabetes_messy.csv '
+    ($c | split("\n") | map(select(length > 0))) as $lines
+    | ($lines[0] | split(",")) as $cols
+    | {
+        columns: $cols,
+        rows: ($lines[1:] | map(
+          split(",") as $vals
+          | reduce range(0; $cols | length) as $i ({}; . + { ($cols[$i]): $vals[$i] })
+        ))
+      }')"
+```
+
+Or via the demo script (Step 13's frontend auto-trigger handles this in the
+real flow — once a `CSVFileScan` operator is added that references this file,
+DataGuard auto-launches).
+
+## Bias-check expectation
+
+Group A: 26 rows. Group B: 24 rows. After cleaning, missingness imbalance
+(more empty `glucose` cells in A: 4 of 26 vs 2 of 24) means naively dropping
+incomplete rows loses ~15% of A but only ~8% of B — DataGuard surfaces this
+and recommends `flag` instead of `impute` for the missing-glucose issue (the
+§5 storyboard "Modify" beat).
diff --git a/agent-service/demo/diabetes_messy.csv b/agent-service/demo/diabetes_messy.csv new file mode 100644 index 00000000000..12115816b1a --- /dev/null +++ b/agent-service/demo/diabetes_messy.csv @@ -0,0 +1,51 @@ +sample_id,age,glucose,bmi,blood_pressure,group,diabetic_outcome +S001,45,110,28.1,80,A,0 +S002,52,140,30.5,85,A,1 +S003,38,95,24.0,70,A,0 +S004,999,130,29.8,82,A,1 +S005,41,,27.5,78,A,0 +S006,47,125,31.2,80,A,1 +S007,55,,33.0,90,A,1 +S008,49,118,28.9,82,A,0 +S009,42,,26.7,75,A,0 +S010,53,135,29.1,88,A,1 +S011,46,108,27.3,79,A,0 +S012,999,142,30.8,87,A,1 +S013,43,112,26.4,76,A,0 +S014,51,,29.5,84,A,1 +S015,48,120,28.6,81,A,0 +S016,40,98,25.2,72,A,0 +S017,57,148,32.7,92,A,1 +S018,44,115,27.8,79,A,0 +S019,50,128,30.0,85,A,1 +S020,39,102,24.9,73,A,0 +S021,46,113,28.4,80,B,0 +S022,54,138,31.5,88,B,1 +S023,42,108,27.0,77,B,0 +S024,48,122,29.3,82,B,1 +S025,45,116,28.7,79,B,0 +S026,52,134,30.6,86,B,1 +S027,41,105,25.8,74,B,0 +S028,49,124,29.0,83,B,1 +S029,47,118,28.2,81,B,0 +S030,43,110,26.9,76,B,0 +S031,55,145,32.1,90,B,1 +S032,46,114,28.5,80,B,0 +S033,50,130,30.2,85,B,1 +S034,38,72,23.5,68,B,0 +S035,53,140,31.0,87,B,1 +S036,44,112,27.6,78,B,0 +S037,51,132,30.4,86,B,1 +S038,42,107,26.3,75,B,0 +S039,48,121,28.8,82,B,1 +S040,45,115,27.9,80,B,0 +S001,45,110,28.1,80,A,1 +S017,57,148,32.7,92,A,0 +S041,62,180,67.5,95,A,1 +S042,58,165,65.2,93,B,1 +S043,49,128,-1,82,A,0 +S044,46,112,72.0,84,B,1 +S045,N/A,118,28.7,80,A,0 +S046,44, ,26.9,76,B,0 +S047,Unknown,124,29.2,83,A,1 +S048,50,null,28.8,81,B,0 diff --git a/agent-service/src/agent/texera-agent.ts b/agent-service/src/agent/texera-agent.ts index 37eb12d8688..f70cbadc358 100644 --- a/agent-service/src/agent/texera-agent.ts +++ b/agent-service/src/agent/texera-agent.ts @@ -51,6 +51,16 @@ import { assembleContext } from "./util/context-utils"; import { compileWorkflowAsync, type WorkflowCompilationResponse } from "../api/compile-api"; import { createLogger } from "../logger"; import type { Logger } from "pino"; 
+import type { + FixProposal, + IssueType, + PermissionDecision, +} from "../types/dataguard"; +import { DataGuardSession } from "./tools/dataguard/dataguard-session"; +import type { ApprovalGateway } from "./tools/dataguard/with-approval"; +import type { LlmCallFn } from "./tools/dataguard/suggest-fix"; +import { createDataGuardTools } from "./tools/dataguard/dataguard-tools"; +import type { DatasetView } from "./tools/dataguard/dataset"; const PERSIST_DEBOUNCE_MS = 500; @@ -80,8 +90,12 @@ type ReActStepCallback = (step: ReActStep) => void; * (`WorkflowResultState`), and the tool surface exposed to the LLM. Each call * to `sendMessage` drives one multi-step generation via the Vercel AI SDK, * streaming step updates to subscribed websockets. + * + * Also implements `ApprovalGateway` — DataGuard's mutating tools call + * `requestApproval(this, …)` to pause the ReAct loop until the user clicks + * Allow / Deny / Modify in the chat panel. */ -export class TexeraAgent { +export class TexeraAgent implements ApprovalGateway { readonly agentId: string; readonly agentName: string; readonly modelType: string; @@ -125,6 +139,11 @@ export class TexeraAgent { private log: Logger; + // DataGuard state — see agent/tools/dataguard/ + private readonly dataGuardSession = new DataGuardSession(); + private pendingDecisions: Map void> = new Map(); + private decidedBuffer: Map = new Map(); + constructor(config: TexeraAgentConfig) { this.agentId = config.agentId; this.agentName = config.agentName || `Agent-${config.agentId}`; @@ -228,6 +247,24 @@ export class TexeraAgent { ); } + // DataGuard tools — read-only profile/suggest plus permission-gated apply. 
+ const llmCall: LlmCallFn = async (prompt: string) => { + const { text } = await generateText({ + model: this.model, + prompt, + temperature: 0.2, + }); + return text; + }; + Object.assign( + tools, + createDataGuardTools({ + session: this.dataGuardSession, + gateway: this, + llmCall, + }) + ); + return tools; } @@ -312,7 +349,9 @@ export class TexeraAgent { this.stepCallback = callback; } - private generateStepId(): string { + // Public because DataGuard's permission-gating layer needs a fresh step id + // for a pending-approval step before any AI SDK step has been minted. + public generateStepId(): string { return `step-${this.agentId}-${++this.stepCounter}-${Date.now()}`; } @@ -823,6 +862,83 @@ export class TexeraAgent { return relevantSteps; } + // ============================================================ + // DataGuard / ApprovalGateway + // ============================================================ + + public getDataGuardSession(): DataGuardSession { + return this.dataGuardSession; + } + + public setDataGuardDataset(dataset: DatasetView): void { + this.dataGuardSession.setDataset(dataset); + } + + // ApprovalGateway: does this issueType have a standing "remember" rule? + public matchesAutoAllowRule(issueType: IssueType): boolean { + return this.dataGuardSession.matchesAutoAllowRule(issueType); + } + + // ApprovalGateway: append a pending-approval step into the history and + // broadcast it through the existing stepCallback so the chat panel renders + // the prompt UI. + public emitPendingApproval(stepId: string, proposal: FixProposal): void { + const messageId = this.currentMessageId ?? 
""; + const wf = this.workflowState.getWorkflowContent(); + const step: ReActStep = { + id: stepId, + parentId: this.head, + messageId, + stepId: -1, + timestamp: Date.now(), + role: "agent", + content: proposal.action, + isBegin: false, + isEnd: false, + pendingApproval: { + toolName: "apply_fix", + proposal, + riskTier: proposal.riskTier, + }, + beforeWorkflowContent: wf, + afterWorkflowContent: wf, + }; + this.addStep(step); + this.head = stepId; + } + + // ApprovalGateway: wait for the user's decision. Resolves when the server + // receives a WS {type:"decision", stepId, …} message and calls resolveDecision. + public awaitDecision(stepId: string): Promise { + const buffered = this.decidedBuffer.get(stepId); + if (buffered) { + this.decidedBuffer.delete(stepId); + return Promise.resolve(buffered); + } + return new Promise(resolve => { + this.pendingDecisions.set(stepId, resolve); + }); + } + + // Called from the WS handler when the user clicks Allow / Deny / Modify. + // Returns true if a waiting tool was unblocked, false if buffered for later. + public resolveDecision(stepId: string, decision: PermissionDecision): boolean { + if (decision.remember) { + // The "Allow & don't ask again" verdict also writes an auto-allow rule. + const proposal = this.dataGuardSession.getProposal(extractIssueIdFromStep(this.stepsById, stepId)); + if (proposal) this.dataGuardSession.addAutoAllowRule(proposal.issueType); + } + const resolver = this.pendingDecisions.get(stepId); + if (resolver) { + this.pendingDecisions.delete(stepId); + resolver(decision); + return true; + } + // Decision arrived before the tool started awaiting — buffer it. 
+ this.decidedBuffer.set(stepId, decision); + return false; + } + destroy(): void { if (this.workflowChangeSubscription) { this.workflowChangeSubscription.unsubscribe(); @@ -838,3 +954,11 @@ export class TexeraAgent { this.currentMessageId = undefined; } } + +function extractIssueIdFromStep( + steps: Map, + stepId: string +): string { + const step = steps.get(stepId); + return step?.pendingApproval?.proposal.issueId ?? ""; +} diff --git a/agent-service/src/agent/tools/dataguard/apply-fix.test.ts b/agent-service/src/agent/tools/dataguard/apply-fix.test.ts new file mode 100644 index 00000000000..1546102b4dd --- /dev/null +++ b/agent-service/src/agent/tools/dataguard/apply-fix.test.ts @@ -0,0 +1,280 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +import { describe, expect, test } from "bun:test"; +import { applyFix } from "./apply-fix"; +import type { DatasetView } from "./dataset"; +import type { FixProposal } from "../../../types/dataguard"; + +function makeProposal(overrides: Partial = {}): FixProposal { + return { + issueId: "iss-test", + issueType: "placeholder_value", + action: "test action", + operationKind: "replace_value", + operationParams: {}, + riskTier: "medium", + reason: "test", + evidence: "test", + confidence: "high", + targetRowCount: 0, + ...overrides, + }; +} + +describe("applyFix", () => { + test("replace_value: swaps matching cells, leaves rest", () => { + const ds: DatasetView = { + columns: ["age"], + rows: [{ age: 25 }, { age: 999 }, { age: 30 }, { age: 999 }], + }; + const result = applyFix( + ds, + makeProposal({ + operationKind: "replace_value", + operationParams: { column: "age", match: 999, replacement: null }, + }) + ); + expect(result.rowsAffected).toBe(2); + expect(result.dataset.rows[0].age).toBe(25); + expect(result.dataset.rows[1].age).toBe(null); + expect(result.dataset.rows[3].age).toBe(null); + }); + + test("replace_value: original dataset is not mutated", () => { + const ds: DatasetView = { + columns: ["age"], + rows: [{ age: 999 }, { age: 30 }], + }; + const before = JSON.stringify(ds); + applyFix( + ds, + makeProposal({ + operationKind: "replace_value", + operationParams: { column: "age", match: 999, replacement: null }, + }) + ); + expect(JSON.stringify(ds)).toBe(before); + }); + + test("drop_rows: removes rows at given indices", () => { + const ds: DatasetView = { + columns: ["x"], + rows: [{ x: 0 }, { x: 1 }, { x: 2 }, { x: 3 }], + }; + const result = applyFix( + ds, + makeProposal({ + operationKind: "drop_rows", + operationParams: { rowIndices: [1, 3] }, + }) + ); + expect(result.rowsAffected).toBe(2); + expect(result.dataset.rows).toHaveLength(2); + expect(result.dataset.rows[0].x).toBe(0); + expect(result.dataset.rows[1].x).toBe(2); + }); + + 
test("impute mean: fills missing with column mean", () => { + const ds: DatasetView = { + columns: ["v"], + rows: [{ v: 10 }, { v: 20 }, { v: null }, { v: 30 }, { v: null }], + }; + const result = applyFix( + ds, + makeProposal({ + operationKind: "impute", + operationParams: { column: "v", strategy: "mean" }, + }) + ); + expect(result.rowsAffected).toBe(2); + expect(result.dataset.rows[2].v).toBe(20); + expect(result.dataset.rows[4].v).toBe(20); + }); + + test("impute median (odd count): fills missing with middle value", () => { + const ds: DatasetView = { + columns: ["v"], + rows: [{ v: 1 }, { v: 3 }, { v: null }, { v: 100 }], + }; + // Non-missing values [1, 3, 100], sorted → median = 3 + const result = applyFix( + ds, + makeProposal({ + operationKind: "impute", + operationParams: { column: "v", strategy: "median" }, + }) + ); + expect(result.dataset.rows[2].v).toBe(3); + }); + + test("impute median (even count): fills missing with mean of two middle", () => { + const ds: DatasetView = { + columns: ["v"], + rows: [{ v: 1 }, { v: 3 }, { v: null }, { v: 5 }, { v: 100 }], + }; + // Non-missing values [1, 3, 5, 100], sorted → (3 + 5) / 2 = 4 + const result = applyFix( + ds, + makeProposal({ + operationKind: "impute", + operationParams: { column: "v", strategy: "median" }, + }) + ); + expect(result.dataset.rows[2].v).toBe(4); + }); + + test("impute mode: fills missing with most common string", () => { + const ds: DatasetView = { + columns: ["c"], + rows: [ + { c: "A" }, { c: "A" }, { c: "B" }, { c: null }, { c: "" }, + ], + }; + const result = applyFix( + ds, + makeProposal({ + operationKind: "impute", + operationParams: { column: "c", strategy: "mode" }, + }) + ); + expect(result.rowsAffected).toBe(2); + expect(result.dataset.rows[3].c).toBe("A"); + expect(result.dataset.rows[4].c).toBe("A"); + }); + + test("flag: does not mutate rows, populates flaggedRows", () => { + const ds: DatasetView = { + columns: ["x"], + rows: [{ x: 1 }, { x: 2 }, { x: 3 }], + }; + const 
result = applyFix( + ds, + makeProposal({ + operationKind: "flag", + operationParams: { rowIndices: [0, 2] }, + }) + ); + expect(result.rowsAffected).toBe(2); + expect(result.flaggedRows).toEqual([0, 2]); + expect(result.dataset.rows[0].x).toBe(1); + expect(result.dataset.rows[2].x).toBe(3); + }); + + test("trim_whitespace: trims string cells in target column", () => { + const ds: DatasetView = { + columns: ["name"], + rows: [{ name: " Alice " }, { name: "Bob" }, { name: "\tCharlie\n" }], + }; + const result = applyFix( + ds, + makeProposal({ + operationKind: "trim_whitespace", + operationParams: { column: "name" }, + }) + ); + expect(result.rowsAffected).toBe(2); + expect(result.dataset.rows[0].name).toBe("Alice"); + expect(result.dataset.rows[1].name).toBe("Bob"); + expect(result.dataset.rows[2].name).toBe("Charlie"); + }); + + test("standardize: maps values per mapping dict", () => { + const ds: DatasetView = { + columns: ["yn"], + rows: [{ yn: "Y" }, { yn: "yes" }, { yn: "n" }, { yn: "N" }, { yn: "unknown" }], + }; + const result = applyFix( + ds, + makeProposal({ + operationKind: "standardize", + operationParams: { + column: "yn", + mapping: { Y: "yes", N: "no", n: "no" }, + }, + }) + ); + expect(result.rowsAffected).toBe(3); + expect(result.dataset.rows[0].yn).toBe("yes"); + expect(result.dataset.rows[1].yn).toBe("yes"); // unchanged (no mapping) + expect(result.dataset.rows[2].yn).toBe("no"); + expect(result.dataset.rows[3].yn).toBe("no"); + expect(result.dataset.rows[4].yn).toBe("unknown"); + }); + + test("rename_column: updates columns array and per-row keys", () => { + const ds: DatasetView = { + columns: ["sample_id", "value"], + rows: [{ sample_id: "S1", value: 1 }, { sample_id: "S2", value: 2 }], + }; + const result = applyFix( + ds, + makeProposal({ + operationKind: "rename_column", + operationParams: { from: "sample_id", to: "subjectId" }, + }) + ); + expect(result.dataset.columns).toEqual(["subjectId", "value"]); + 
expect(result.dataset.rows[0].subjectId).toBe("S1"); + expect(result.dataset.rows[0].sample_id).toBeUndefined(); + }); + + test("empty dataset: returns empty dataset and zero rowsAffected", () => { + const ds: DatasetView = { columns: [], rows: [] }; + const result = applyFix( + ds, + makeProposal({ + operationKind: "replace_value", + operationParams: { column: "x", match: 1, replacement: 0 }, + }) + ); + expect(result.rowsAffected).toBe(0); + expect(result.dataset.rows).toEqual([]); + }); + + test("unknown operationKind: throws", () => { + const bad = makeProposal({ + operationKind: "nuke_database" as unknown as FixProposal["operationKind"], + operationParams: {}, + }); + expect(() => applyFix({ columns: [], rows: [] }, bad)).toThrow(/unknown operationKind/); + }); + + test("realistic diabetes flow: replace age=999 with NULL leaves other columns intact", () => { + const ds: DatasetView = { + columns: ["sample_id", "age", "glucose"], + rows: [ + { sample_id: "S1", age: 45, glucose: 110 }, + { sample_id: "S2", age: 999, glucose: 130 }, + { sample_id: "S3", age: 999, glucose: 140 }, + ], + }; + const result = applyFix( + ds, + makeProposal({ + operationKind: "replace_value", + operationParams: { column: "age", match: 999, replacement: null }, + }) + ); + expect(result.rowsAffected).toBe(2); + expect(result.dataset.rows[1].age).toBeNull(); + expect(result.dataset.rows[1].glucose).toBe(130); + expect(result.dataset.rows[1].sample_id).toBe("S2"); + }); +}); diff --git a/agent-service/src/agent/tools/dataguard/apply-fix.ts b/agent-service/src/agent/tools/dataguard/apply-fix.ts new file mode 100644 index 00000000000..77d2c0f85f4 --- /dev/null +++ b/agent-service/src/agent/tools/dataguard/apply-fix.ts @@ -0,0 +1,205 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +// Applies one approved FixProposal to an in-memory DatasetView. +// Pure function: returns a new dataset; never mutates the input. +// This is the mutating boundary of DataGuard — every call here must have been +// authorized through the permission scaffolding (see with-approval.ts). + +import type { FixProposal } from "../../../types/dataguard"; +import type { DatasetView } from "./dataset"; + +export interface ApplyResult { + dataset: DatasetView; + rowsAffected: number; + flaggedRows: number[]; +} + +export function applyFix(dataset: DatasetView, proposal: FixProposal): ApplyResult { + const rows = dataset.rows.map(r => ({ ...r })); + let columns = [...dataset.columns]; + const params = proposal.operationParams; + + switch (proposal.operationKind) { + case "replace_value": { + const column = params.column as string; + const match = params.match; + const replacement = params.replacement; + let affected = 0; + for (const r of rows) { + if (cellEquals(r[column], match)) { + r[column] = replacement; + affected++; + } + } + return { dataset: { columns, rows }, rowsAffected: affected, flaggedRows: [] }; + } + + case "drop_rows": { + const drop = new Set(params.rowIndices as number[]); + const kept = rows.filter((_, i) => !drop.has(i)); + return { + dataset: { columns, rows: kept }, + rowsAffected: rows.length - kept.length, + flaggedRows: [], + }; + } + + case "impute": { + const 
column = params.column as string; + const strategy = params.strategy as "mean" | "median" | "mode"; + const fill = computeImputeValue(rows, column, strategy); + let affected = 0; + for (const r of rows) { + if (isMissing(r[column])) { + r[column] = fill; + affected++; + } + } + return { dataset: { columns, rows }, rowsAffected: affected, flaggedRows: [] }; + } + + case "flag": { + const indices = (params.rowIndices as number[]).slice(); + return { + dataset: { columns, rows }, + rowsAffected: indices.length, + flaggedRows: indices, + }; + } + + case "trim_whitespace": { + const column = params.column as string; + let affected = 0; + for (const r of rows) { + const v = r[column]; + if (typeof v === "string") { + const trimmed = v.trim(); + if (trimmed !== v) { + r[column] = trimmed; + affected++; + } + } + } + return { dataset: { columns, rows }, rowsAffected: affected, flaggedRows: [] }; + } + + case "standardize": { + const column = params.column as string; + const mapping = params.mapping as Record; + let affected = 0; + for (const r of rows) { + const v = r[column]; + if (typeof v === "string" && Object.prototype.hasOwnProperty.call(mapping, v)) { + r[column] = mapping[v]; + affected++; + } + } + return { dataset: { columns, rows }, rowsAffected: affected, flaggedRows: [] }; + } + + case "rename_column": { + const from = params.from as string; + const to = params.to as string; + columns = columns.map(c => (c === from ? 
to : c)); + let affected = 0; + for (const r of rows) { + if (Object.prototype.hasOwnProperty.call(r, from)) { + r[to] = r[from]; + delete r[from]; + affected++; + } + } + return { dataset: { columns, rows }, rowsAffected: affected, flaggedRows: [] }; + } + + default: + throw new Error( + `apply_fix: unknown operationKind: ${(proposal as unknown as { operationKind: string }).operationKind}` + ); + } +} + +function cellEquals(a: unknown, b: unknown): boolean { + if (a === b) return true; + if (typeof a === "number" && typeof b === "number" && Number.isNaN(a) && Number.isNaN(b)) { + return true; + } + return false; +} + +function isMissing(v: unknown): boolean { + if (v === null || v === undefined) return true; + if (typeof v === "number" && Number.isNaN(v)) return true; + if (typeof v === "string" && v === "") return true; + return false; +} + +function computeImputeValue( + rows: Record[], + column: string, + strategy: "mean" | "median" | "mode" +): unknown { + const numericValues: number[] = []; + const stringCounts = new Map(); + for (const r of rows) { + const v = r[column]; + if (isMissing(v)) continue; + if (typeof v === "number" && Number.isFinite(v)) { + numericValues.push(v); + } else if (typeof v === "string") { + stringCounts.set(v, (stringCounts.get(v) ?? 0) + 1); + } + } + + if (strategy === "mean") { + if (numericValues.length === 0) return null; + return numericValues.reduce((s, n) => s + n, 0) / numericValues.length; + } + if (strategy === "median") { + if (numericValues.length === 0) return null; + const sorted = [...numericValues].sort((a, b) => a - b); + const mid = Math.floor(sorted.length / 2); + return sorted.length % 2 === 0 ? 
(sorted[mid - 1] + sorted[mid]) / 2 : sorted[mid]; + } + // mode: prefer strings if any non-missing strings exist; else fall back to numbers + if (stringCounts.size > 0) { + let mode = ""; + let max = -1; + for (const [k, count] of stringCounts) { + if (count > max) { + max = count; + mode = k; + } + } + return mode; + } + if (numericValues.length === 0) return null; + const numCounts = new Map(); + for (const n of numericValues) numCounts.set(n, (numCounts.get(n) ?? 0) + 1); + let mode = numericValues[0]; + let max = -1; + for (const [k, count] of numCounts) { + if (count > max) { + max = count; + mode = k; + } + } + return mode; +} diff --git a/agent-service/src/agent/tools/dataguard/bias-check.test.ts b/agent-service/src/agent/tools/dataguard/bias-check.test.ts new file mode 100644 index 00000000000..04755ba8558 --- /dev/null +++ b/agent-service/src/agent/tools/dataguard/bias-check.test.ts @@ -0,0 +1,106 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +import { describe, expect, test } from "bun:test"; +import { computeBiasCheck } from "./bias-check"; +import type { DatasetView } from "./dataset"; + +describe("computeBiasCheck", () => { + test("identical before/after → 100% retention, no skew", () => { + const ds: DatasetView = { + columns: ["group"], + rows: [{ group: "A" }, { group: "A" }, { group: "B" }, { group: "B" }], + }; + const r = computeBiasCheck(ds, ds, "group"); + expect(r.skewDetected).toBe(false); + expect(r.perGroup.A.retentionPct).toBe(100); + expect(r.perGroup.B.retentionPct).toBe(100); + }); + + test("flags skew when one group loses much more than another", () => { + const before: DatasetView = { + columns: ["group"], + rows: [ + { group: "A" }, { group: "A" }, { group: "A" }, { group: "A" }, { group: "A" }, + { group: "B" }, { group: "B" }, { group: "B" }, { group: "B" }, { group: "B" }, + ], + }; + const after: DatasetView = { + columns: ["group"], + rows: [ + { group: "A" }, + { group: "B" }, { group: "B" }, { group: "B" }, { group: "B" }, { group: "B" }, + ], + }; + // A retains 20%, B retains 100% → 80-point gap → skew + const r = computeBiasCheck(before, after, "group"); + expect(r.skewDetected).toBe(true); + expect(r.perGroup.A.retentionPct).toBe(20); + expect(r.perGroup.B.retentionPct).toBe(100); + }); + + test("balanced cleanup (5%/4% loss across groups) → no skew", () => { + // Mirrors the §5 storyboard closing beat — 4-5% loss per group. + const before: DatasetView = { + columns: ["group"], + rows: Array.from({ length: 200 }, (_, i) => ({ group: i < 100 ? 
"A" : "B" })), + }; + const after: DatasetView = { + columns: ["group"], + rows: [ + ...Array.from({ length: 96 }, () => ({ group: "A" })), + ...Array.from({ length: 95 }, () => ({ group: "B" })), + ], + }; + const r = computeBiasCheck(before, after, "group"); + expect(r.skewDetected).toBe(false); + expect(Math.round(r.perGroup.A.retentionPct)).toBe(96); + expect(Math.round(r.perGroup.B.retentionPct)).toBe(95); + }); + + test("groupColumn missing from dataset: returns empty perGroup, no crash", () => { + const ds: DatasetView = { columns: ["x"], rows: [{ x: 1 }] }; + const r = computeBiasCheck(ds, ds, "group"); + expect(r.perGroup).toEqual({}); + expect(r.skewDetected).toBe(false); + }); + + test("custom skewThreshold widens / narrows the trigger", () => { + const before: DatasetView = { + columns: ["g"], + rows: Array.from({ length: 100 }, (_, i) => ({ g: i < 50 ? "A" : "B" })), + }; + const after: DatasetView = { + columns: ["g"], + rows: [ + ...Array.from({ length: 45 }, () => ({ g: "A" })), // 90% + ...Array.from({ length: 40 }, () => ({ g: "B" })), // 80% + ], + }; + // 10-point gap: skew with threshold=5, no skew with threshold=15 + expect(computeBiasCheck(before, after, "g", { skewThresholdPct: 5 }).skewDetected).toBe(true); + expect(computeBiasCheck(before, after, "g", { skewThresholdPct: 15 }).skewDetected).toBe(false); + }); + + test("empty before → no groups, no skew", () => { + const r = computeBiasCheck({ columns: [], rows: [] }, { columns: [], rows: [] }, "g"); + expect(r.perGroup).toEqual({}); + expect(r.skewDetected).toBe(false); + }); +}); diff --git a/agent-service/src/agent/tools/dataguard/bias-check.ts b/agent-service/src/agent/tools/dataguard/bias-check.ts new file mode 100644 index 00000000000..0a7b9213a9b --- /dev/null +++ b/agent-service/src/agent/tools/dataguard/bias-check.ts @@ -0,0 +1,131 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +// Compares per-group row counts in the before / after dataset and flags skew +// — the closing demo beat: "Group A retention: 96%. Group B retention: 95%. +// ✓ No skew introduced." + +import { z } from "zod"; +import { tool } from "ai"; +import type { DatasetView } from "./dataset"; +import type { DataGuardSession } from "./dataguard-session"; + +export interface BiasCheckResult { + groupColumn: string; + perGroup: Record; + maxRetentionGapPct: number; + skewThresholdPct: number; + skewDetected: boolean; +} + +export interface BiasCheckOptions { + skewThresholdPct?: number; +} + +const DEFAULT_SKEW_THRESHOLD = 10; + +export function computeBiasCheck( + before: DatasetView, + after: DatasetView, + groupColumn: string, + options: BiasCheckOptions = {} +): BiasCheckResult { + const threshold = options.skewThresholdPct ?? 
DEFAULT_SKEW_THRESHOLD; + const perGroup: BiasCheckResult["perGroup"] = {}; + + if (!before.columns.includes(groupColumn) || before.rows.length === 0) { + return { + groupColumn, + perGroup, + maxRetentionGapPct: 0, + skewThresholdPct: threshold, + skewDetected: false, + }; + } + + const beforeCounts = countByGroup(before, groupColumn); + const afterCounts = countByGroup(after, groupColumn); + + for (const [group, beforeN] of beforeCounts) { + const afterN = afterCounts.get(group) ?? 0; + perGroup[group] = { + before: beforeN, + after: afterN, + retentionPct: beforeN > 0 ? (afterN / beforeN) * 100 : 0, + }; + } + + const retentions = Object.values(perGroup).map(g => g.retentionPct); + const maxGap = retentions.length > 0 ? Math.max(...retentions) - Math.min(...retentions) : 0; + + return { + groupColumn, + perGroup, + maxRetentionGapPct: maxGap, + skewThresholdPct: threshold, + skewDetected: maxGap > threshold, + }; +} + +function countByGroup(ds: DatasetView, col: string): Map { + const counts = new Map(); + for (const r of ds.rows) { + const v = r[col]; + if (v === undefined || v === null) continue; + const key = String(v); + counts.set(key, (counts.get(key) ?? 0) + 1); + } + return counts; +} + +// ----- AI SDK tool ----- + +export const TOOL_NAME_BIAS_CHECK = "bias_check"; + +export function createBiasCheckTool(session: DataGuardSession) { + return tool({ + description: `Compare row counts per group in the current dataset vs the pre-cleanup snapshot. Returns retention% per group plus a skew flag. Read-only.`, + inputSchema: z.object({ + groupColumn: z.string().describe("Column whose distinct values define the groups (e.g., 'group', 'cohort')."), + skewThresholdPct: z + .number() + .optional() + .describe("If max(retention%) - min(retention%) exceeds this, skewDetected = true. 
Default 10."), + beforeDataset: z + .object({ + columns: z.array(z.string()), + rows: z.array(z.record(z.string(), z.unknown())), + }) + .optional() + .describe("Optional explicit 'before' dataset; if omitted, the tool cannot compute bias and returns an error."), + }), + execute: async (input) => { + const after = session.getDataset(); + if (!after) return "[ERROR] No dataset in session; load one before calling bias_check."; + const before = input.beforeDataset ?? null; + if (!before) { + return "[ERROR] bias_check requires a beforeDataset (the pre-cleanup snapshot). Pass it explicitly."; + } + const result = computeBiasCheck(before, after, input.groupColumn, { + skewThresholdPct: input.skewThresholdPct, + }); + return JSON.stringify(result); + }, + }); +} diff --git a/agent-service/src/agent/tools/dataguard/dataguard-session.test.ts b/agent-service/src/agent/tools/dataguard/dataguard-session.test.ts new file mode 100644 index 00000000000..96f7c2fa43c --- /dev/null +++ b/agent-service/src/agent/tools/dataguard/dataguard-session.test.ts @@ -0,0 +1,119 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +import { describe, expect, test } from "bun:test"; +import { DataGuardSession } from "./dataguard-session"; +import type { DatasetView } from "./dataset"; +import type { DataQualityIssue, FixProposal } from "../../../types/dataguard"; + +function makeIssue(): DataQualityIssue { + return { + issueId: "iss-1", + issueType: "placeholder_value", + column: "age", + description: "5 rows have age=999", + evidence: "5 of 5 placeholder-only.", + affectedRowCount: 5, + detectedAt: "2026-05-14T12:00:00.000Z", + }; +} + +function makeProposal(): FixProposal { + return { + issueId: "iss-1", + issueType: "placeholder_value", + action: "Replace age=999 with NULL", + operationKind: "replace_value", + operationParams: { column: "age", match: 999, replacement: null }, + riskTier: "medium", + reason: "out of range", + evidence: "5 rows", + confidence: "high", + targetRowCount: 5, + }; +} + +describe("DataGuardSession", () => { + test("setDataset stores the dataset and resets per-run state", () => { + const s = new DataGuardSession(); + const ds: DatasetView = { columns: ["a"], rows: [{ a: 1 }] }; + s.recordIssue(makeIssue()); + s.setDataset(ds); + expect(s.getDataset()).toBe(ds); + expect(s.getIssues()).toEqual([]); + expect(s.getDecisionLog()).toEqual([]); + expect(s.getFlaggedRows()).toEqual([]); + }); + + test("recordIssue accumulates and dedupes by issueId", () => { + const s = new DataGuardSession(); + s.recordIssue(makeIssue()); + s.recordIssue(makeIssue()); // same issueId — should not duplicate + expect(s.getIssues()).toHaveLength(1); + }); + + test("recordDecision appends a DecisionLogEntry", () => { + const s = new DataGuardSession(); + s.setDataset({ columns: [], rows: [] }); + s.recordDecision({ + proposal: makeProposal(), + verdict: "allow", + applied: true, + }); + const log = s.getDecisionLog(); + expect(log).toHaveLength(1); + expect(log[0].userDecision).toBe("allow"); + expect(log[0].issueType).toBe("placeholder_value"); + 
expect(log[0].appliedAt).toBeDefined(); + }); + + test("recordDecision with denied: no appliedAt", () => { + const s = new DataGuardSession(); + s.recordDecision({ proposal: makeProposal(), verdict: "deny", applied: false }); + expect(s.getDecisionLog()[0].appliedAt).toBeUndefined(); + }); + + test("addAutoAllowRule registers, matchesAutoAllowRule returns true", () => { + const s = new DataGuardSession(); + s.addAutoAllowRule("placeholder_value"); + expect(s.matchesAutoAllowRule("placeholder_value")).toBe(true); + expect(s.matchesAutoAllowRule("outlier")).toBe(false); + }); + + test("addAutoAllowRule is idempotent (does not duplicate)", () => { + const s = new DataGuardSession(); + s.addAutoAllowRule("placeholder_value"); + s.addAutoAllowRule("placeholder_value"); + expect(s.getAutoAllowRules()).toHaveLength(1); + }); + + test("removeAutoAllowRule clears the rule by id", () => { + const s = new DataGuardSession(); + const rule = s.addAutoAllowRule("placeholder_value"); + expect(s.removeAutoAllowRule(rule.ruleId)).toBe(true); + expect(s.matchesAutoAllowRule("placeholder_value")).toBe(false); + }); + + test("addFlaggedRows merges + dedupes + sorts", () => { + const s = new DataGuardSession(); + s.addFlaggedRows([3, 1, 2]); + s.addFlaggedRows([2, 5]); + expect(s.getFlaggedRows()).toEqual([1, 2, 3, 5]); + }); +}); diff --git a/agent-service/src/agent/tools/dataguard/dataguard-session.ts b/agent-service/src/agent/tools/dataguard/dataguard-session.ts new file mode 100644 index 00000000000..1fc33799f8b --- /dev/null +++ b/agent-service/src/agent/tools/dataguard/dataguard-session.ts @@ -0,0 +1,151 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +// Per-agent DataGuard run state. One DataGuardSession lives on each +// TexeraAgent (lazy-initialized when the first DataGuard tool fires) and +// holds the working dataset, accumulated issues, decision log, flagged rows, +// and auto-allow rules. Independent of the workflow state so resetting one +// does not affect the other. + +import type { + AutoAllowRule, + DataQualityIssue, + DecisionLogEntry, + FixProposal, + IssueType, + Verdict, +} from "../../../types/dataguard"; +import type { DatasetView } from "./dataset"; + +export interface RecordDecisionInput { + proposal: FixProposal; + verdict: Verdict; + modifiedAction?: string; + applied: boolean; +} + +export class DataGuardSession { + private dataset: DatasetView | undefined; + private issues: Map = new Map(); + private proposals: Map = new Map(); + private decisionLog: DecisionLogEntry[] = []; + private flaggedRows: Set = new Set(); + private autoAllowRules: Map = new Map(); + private decisionCounter = 0; + private ruleCounter = 0; + + setDataset(dataset: DatasetView): void { + this.dataset = dataset; + // A new dataset means a fresh DataGuard run — clear the per-run state. + // Auto-allow rules persist (they're a user preference, not run state). 
+ this.issues.clear(); + this.proposals.clear(); + this.decisionLog = []; + this.flaggedRows.clear(); + } + + recordProposal(proposal: FixProposal): void { + this.proposals.set(proposal.issueId, proposal); + } + + getProposal(issueId: string): FixProposal | undefined { + return this.proposals.get(issueId); + } + + getIssue(issueId: string): DataQualityIssue | undefined { + return this.issues.get(issueId); + } + + getDataset(): DatasetView | undefined { + return this.dataset; + } + + updateDataset(dataset: DatasetView): void { + this.dataset = dataset; + } + + recordIssue(issue: DataQualityIssue): void { + this.issues.set(issue.issueId, issue); + } + + getIssues(): DataQualityIssue[] { + return Array.from(this.issues.values()); + } + + recordDecision(input: RecordDecisionInput): DecisionLogEntry { + this.decisionCounter += 1; + const now = new Date().toISOString(); + const entry: DecisionLogEntry = { + decisionId: `dec-${this.decisionCounter}`, + timestamp: now, + issueType: input.proposal.issueType, + targetRowCount: input.proposal.targetRowCount, + proposedAction: input.proposal.action, + userDecision: input.verdict, + modifiedAction: input.modifiedAction, + reason: input.proposal.reason, + confidence: input.proposal.confidence, + appliedAt: input.applied ? now : undefined, + }; + this.decisionLog.push(entry); + return entry; + } + + getDecisionLog(): DecisionLogEntry[] { + return [...this.decisionLog]; + } + + addFlaggedRows(indices: number[]): void { + for (const i of indices) this.flaggedRows.add(i); + } + + getFlaggedRows(): number[] { + return Array.from(this.flaggedRows).sort((a, b) => a - b); + } + + addAutoAllowRule(issueType: IssueType): AutoAllowRule { + // Idempotent: if a rule already exists for this issueType, return it. 
+ for (const r of this.autoAllowRules.values()) { + if (r.issueType === issueType) return r; + } + this.ruleCounter += 1; + const rule: AutoAllowRule = { + ruleId: `rule-${this.ruleCounter}`, + issueType, + createdAt: new Date().toISOString(), + }; + this.autoAllowRules.set(rule.ruleId, rule); + return rule; + } + + removeAutoAllowRule(ruleId: string): boolean { + return this.autoAllowRules.delete(ruleId); + } + + matchesAutoAllowRule(issueType: IssueType): boolean { + for (const r of this.autoAllowRules.values()) { + if (r.issueType === issueType) return true; + } + return false; + } + + getAutoAllowRules(): AutoAllowRule[] { + return Array.from(this.autoAllowRules.values()); + } +} diff --git a/agent-service/src/agent/tools/dataguard/dataguard-tools.ts b/agent-service/src/agent/tools/dataguard/dataguard-tools.ts new file mode 100644 index 00000000000..f58e29df8e4 --- /dev/null +++ b/agent-service/src/agent/tools/dataguard/dataguard-tools.ts @@ -0,0 +1,190 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +// Vercel AI SDK tool definitions for DataGuard. 
+// Three tools exposed to the LLM: +// - profile_dataset (read-only) +// - suggest_fix (read-only) +// - apply_fix (mutating — gated by requestApproval) +// The decision log is written automatically inside apply_fix; an explicit +// write_decision_log tool (Step 10) exports the log to CSV at session end. + +import { z } from "zod"; +import { tool } from "ai"; +import { profileDataset, type ProfileOptions } from "./profile-dataset"; +import { suggestFix, type LlmCallFn } from "./suggest-fix"; +import { applyFix } from "./apply-fix"; +import { requestApproval, type ApprovalGateway } from "./with-approval"; +import type { DataGuardSession } from "./dataguard-session"; +import { createWriteDecisionLogTool, TOOL_NAME_WRITE_DECISION_LOG } from "./decision-log"; +import { createBiasCheckTool, TOOL_NAME_BIAS_CHECK } from "./bias-check"; + +export const TOOL_NAME_PROFILE_DATASET = "profile_dataset"; +export const TOOL_NAME_SUGGEST_FIX = "suggest_fix"; +export const TOOL_NAME_APPLY_FIX = "apply_fix"; + +export interface DataGuardToolContext { + session: DataGuardSession; + gateway: ApprovalGateway; + llmCall: LlmCallFn; +} + +export function createDataGuardTools(ctx: DataGuardToolContext): Record { + return { + [TOOL_NAME_PROFILE_DATASET]: createProfileDatasetTool(ctx), + [TOOL_NAME_SUGGEST_FIX]: createSuggestFixTool(ctx), + [TOOL_NAME_APPLY_FIX]: createApplyFixTool(ctx), + [TOOL_NAME_WRITE_DECISION_LOG]: createWriteDecisionLogTool(ctx.session), + [TOOL_NAME_BIAS_CHECK]: createBiasCheckTool(ctx.session), + }; +} + +function createProfileDatasetTool(ctx: DataGuardToolContext) { + return tool({ + description: `Scan the loaded dataset for quality issues. Read-only. + +Detects four categories: +- missing_value: null / empty / configured missing tokens +- placeholder_value: numeric (999, -1) or string sentinels +- duplicate_id: requires idColumn hint +- out_of_range: requires validRanges hint per column + +Call this once at the start of a DataGuard run. 
Returns a JSON array of DataQualityIssue records.`, + inputSchema: z.object({ + idColumn: z + .string() + .optional() + .describe("Column name to treat as the unique row identifier. If omitted, no duplicate_id detection runs."), + validRanges: z + .record(z.string(), z.object({ min: z.number(), max: z.number() })) + .optional() + .describe("Per-column valid numeric range. Values outside are flagged as out_of_range."), + placeholderValues: z + .array(z.union([z.string(), z.number()])) + .optional() + .describe("Override the default placeholder list (default: [999, -1, 'unknown', 'Unknown'])."), + missingTokens: z + .array(z.string()) + .optional() + .describe("Override the default missing-token list (default: ['NA', 'N/A', 'n/a', 'null', 'NULL', 'None'])."), + }), + execute: async (input) => { + const dataset = ctx.session.getDataset(); + if (!dataset) { + return "[ERROR] No dataset loaded into DataGuard session. The frontend must call setDataset before invoking profile_dataset."; + } + const options: ProfileOptions = { + idColumn: input.idColumn, + validRanges: input.validRanges, + placeholderValues: input.placeholderValues, + missingTokens: input.missingTokens, + }; + const issues = profileDataset(dataset, options); + for (const issue of issues) ctx.session.recordIssue(issue); + return JSON.stringify({ + datasetRowCount: dataset.rows.length, + datasetColumnCount: dataset.columns.length, + issueCount: issues.length, + issues, + }); + }, + }); +} + +function createSuggestFixTool(ctx: DataGuardToolContext) { + return tool({ + description: `Propose a single concrete fix for a previously-detected issue. Read-only. + +Call after profile_dataset. Pass the issueId from one of the returned issues. 
Returns a FixProposal that you can then pass to apply_fix.`, + inputSchema: z.object({ + issueId: z.string().describe("The issueId of a DataQualityIssue returned by profile_dataset."), + }), + execute: async (input) => { + const issue = ctx.session.getIssue(input.issueId); + if (!issue) { + return `[ERROR] No issue with id "${input.issueId}". Call profile_dataset first.`; + } + try { + const proposal = await suggestFix(issue, { llmCall: ctx.llmCall }); + ctx.session.recordProposal(proposal); + return JSON.stringify(proposal); + } catch (e) { + return `[ERROR] suggest_fix failed: ${(e as Error).message}`; + } + }, + }); +} + +function createApplyFixTool(ctx: DataGuardToolContext) { + return tool({ + description: `Apply a previously-proposed fix to the dataset. MUTATING — gated by user approval. + +Pass the issueId. The proposal stored from suggest_fix is looked up automatically. For risk tier "low" the fix is auto-applied with a summary line; for "medium" / "high" the user must approve through the chat panel. The result includes the user's verdict.`, + inputSchema: z.object({ + issueId: z.string().describe("The issueId whose proposal should be applied."), + }), + execute: async (input) => { + const proposal = ctx.session.getProposal(input.issueId); + if (!proposal) { + return `[ERROR] No proposal for issueId "${input.issueId}". Call suggest_fix first.`; + } + const dataset = ctx.session.getDataset(); + if (!dataset) { + return `[ERROR] No dataset loaded.`; + } + + const decision = await requestApproval(ctx.gateway, proposal); + + if (decision.verdict === "deny") { + ctx.session.recordDecision({ proposal, verdict: "deny", applied: false }); + return JSON.stringify({ + verdict: "deny", + rowsAffected: 0, + message: "User denied the fix. No changes made.", + }); + } + + // For modify, MVP keeps the original operationKind/params but records the + // user's free-text override in the log. 
Future iteration can parse the + // modifiedAction back into a structured proposal override. + const modifiedAction = decision.verdict === "modify" ? decision.modifiedAction : undefined; + + try { + const result = applyFix(dataset, proposal); + ctx.session.updateDataset(result.dataset); + if (result.flaggedRows.length > 0) ctx.session.addFlaggedRows(result.flaggedRows); + ctx.session.recordDecision({ + proposal, + verdict: decision.verdict, + modifiedAction, + applied: true, + }); + return JSON.stringify({ + verdict: decision.verdict, + rowsAffected: result.rowsAffected, + flaggedRows: result.flaggedRows, + datasetRowCount: result.dataset.rows.length, + message: `Applied ${proposal.operationKind}. Rows affected: ${result.rowsAffected}.`, + }); + } catch (e) { + return `[ERROR] apply_fix failed: ${(e as Error).message}`; + } + }, + }); +} diff --git a/agent-service/src/agent/tools/dataguard/decision-log.test.ts b/agent-service/src/agent/tools/dataguard/decision-log.test.ts new file mode 100644 index 00000000000..242259d2ecf --- /dev/null +++ b/agent-service/src/agent/tools/dataguard/decision-log.test.ts @@ -0,0 +1,98 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +import { describe, expect, test } from "bun:test"; +import { serializeDecisionLogCsv } from "./decision-log"; +import type { DecisionLogEntry } from "../../../types/dataguard"; + +function entry(overrides: Partial = {}): DecisionLogEntry { + return { + decisionId: "dec-1", + timestamp: "2026-05-14T12:00:30.000Z", + issueType: "placeholder_value", + targetRowCount: 5, + proposedAction: "Replace age=999 with NULL", + userDecision: "allow", + reason: "out of valid range", + confidence: "high", + appliedAt: "2026-05-14T12:00:31.000Z", + ...overrides, + }; +} + +describe("serializeDecisionLogCsv", () => { + test("empty log returns header only", () => { + const csv = serializeDecisionLogCsv([]); + expect(csv.split("\n")).toEqual([ + "decision_id,timestamp,issue_type,target_rows,proposed_action,user_decision,modified_action,reason,confidence,applied_at", + ]); + }); + + test("single row: header + one data row", () => { + const csv = serializeDecisionLogCsv([entry()]); + const lines = csv.split("\n"); + expect(lines).toHaveLength(2); + expect(lines[1]).toContain("dec-1"); + expect(lines[1]).toContain("placeholder_value"); + expect(lines[1]).toContain("allow"); + }); + + test("escapes commas, quotes, and newlines in fields per RFC 4180", () => { + const csv = serializeDecisionLogCsv([ + entry({ + proposedAction: 'Replace "999" with NULL, including row 3', + reason: "line1\nline2", + }), + ]); + const dataRow = csv.split("\n").slice(1).join("\n"); + expect(dataRow).toContain('"Replace ""999"" with NULL, including row 3"'); + expect(dataRow).toContain('"line1\nline2"'); + }); + + test("missing appliedAt and modifiedAction render as empty fields", () => { + const csv = serializeDecisionLogCsv([ + entry({ userDecision: "deny", appliedAt: undefined }), + ]); + const row = csv.split("\n")[1]; + expect(row.endsWith(",")).toBe(true); // appliedAt is the last column and is empty + expect(row).toContain(",,"); // modifiedAction is empty between reason+confidence's neighbors + 
}); + + test("multiple rows preserve insertion order", () => { + const csv = serializeDecisionLogCsv([ + entry({ decisionId: "dec-1", issueType: "placeholder_value" }), + entry({ decisionId: "dec-2", issueType: "missing_value" }), + entry({ decisionId: "dec-3", issueType: "outlier", userDecision: "deny" }), + ]); + const lines = csv.split("\n").slice(1); + expect(lines[0]).toContain("dec-1"); + expect(lines[1]).toContain("dec-2"); + expect(lines[2]).toContain("dec-3"); + expect(lines[2]).toContain("deny"); + }); + + test("auto_allow_low_risk and auto_allow_remembered survive the round trip", () => { + const csv = serializeDecisionLogCsv([ + entry({ userDecision: "auto_allow_low_risk" }), + entry({ userDecision: "auto_allow_remembered" }), + ]); + expect(csv).toContain("auto_allow_low_risk"); + expect(csv).toContain("auto_allow_remembered"); + }); +}); diff --git a/agent-service/src/agent/tools/dataguard/decision-log.ts b/agent-service/src/agent/tools/dataguard/decision-log.ts new file mode 100644 index 00000000000..e9752fdf422 --- /dev/null +++ b/agent-service/src/agent/tools/dataguard/decision-log.ts @@ -0,0 +1,89 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +// RFC-4180 CSV serializer for the DataGuard decision log. Schema matches +// §4.4 of README_DataGuard_Texera.md exactly so a reviewer can open the +// downloaded CSV and trace every applied/denied/modified fix. + +import { z } from "zod"; +import { tool } from "ai"; +import type { DecisionLogEntry } from "../../../types/dataguard"; +import type { DataGuardSession } from "./dataguard-session"; + +const HEADER_COLUMNS = [ + "decision_id", + "timestamp", + "issue_type", + "target_rows", + "proposed_action", + "user_decision", + "modified_action", + "reason", + "confidence", + "applied_at", +] as const; + +export const TOOL_NAME_WRITE_DECISION_LOG = "write_decision_log"; + +export function serializeDecisionLogCsv(entries: DecisionLogEntry[]): string { + const header = HEADER_COLUMNS.join(","); + const rows = entries.map(rowToCsv); + return [header, ...rows].join("\n"); +} + +function rowToCsv(e: DecisionLogEntry): string { + return [ + csvField(e.decisionId), + csvField(e.timestamp), + csvField(e.issueType), + csvField(String(e.targetRowCount)), + csvField(e.proposedAction), + csvField(e.userDecision), + csvField(e.modifiedAction ?? ""), + csvField(e.reason), + csvField(e.confidence), + csvField(e.appliedAt ?? ""), + ].join(","); +} + +// RFC 4180: a field MUST be quoted if it contains a comma, double-quote, or +// line break. Quotes within a quoted field are escaped by doubling. +function csvField(value: string): string { + if (value === "") return ""; + const needsQuoting = /[",\r\n]/.test(value); + if (!needsQuoting) return value; + return `"${value.replace(/"/g, '""')}"`; +} + +// ----- AI SDK tool (exposed to the LLM) ----- + +export function createWriteDecisionLogTool(session: DataGuardSession) { + return tool({ + description: `Export the DataGuard decision log to CSV. Returns the CSV text. 
Call this at the end of a DataGuard run to give the user an audit trail of every Allow / Deny / Modify they made.`, + inputSchema: z.object({}), + execute: async () => { + const csv = serializeDecisionLogCsv(session.getDecisionLog()); + return JSON.stringify({ + rows: session.getDecisionLog().length, + bytes: csv.length, + csv, + }); + }, + }); +} diff --git a/agent-service/src/agent/tools/dataguard/suggest-fix.test.ts b/agent-service/src/agent/tools/dataguard/suggest-fix.test.ts new file mode 100644 index 00000000000..3d3ab4dc632 --- /dev/null +++ b/agent-service/src/agent/tools/dataguard/suggest-fix.test.ts @@ -0,0 +1,150 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +import { describe, expect, test } from "bun:test"; +import { suggestFix, type LlmCallFn } from "./suggest-fix"; +import type { DataQualityIssue } from "../../../types/dataguard"; + +function makeIssue(overrides: Partial = {}): DataQualityIssue { + return { + issueId: "iss-test-1", + issueType: "placeholder_value", + column: "age", + description: "5 rows have age = 999", + evidence: "5 of 5 rows with age=999 have no other anomalies.", + affectedRowCount: 5, + affectedRowIndices: [10, 42, 77, 199, 412], + detectedAt: "2026-05-14T12:00:00.000Z", + ...overrides, + }; +} + +const VALID_RAW_JSON = JSON.stringify({ + action: "Replace age = 999 with NULL", + operationKind: "replace_value", + operationParams: { column: "age", match: 999, replacement: null }, + riskTier: "medium", + reason: "999 is outside the valid human-age range and appears to be a placeholder.", + evidence: "5 of 5 rows with age=999 have no other anomalies.", + confidence: "high", + targetRowCount: 5, +}); + +function constantLlm(payload: string): LlmCallFn { + return async () => payload; +} + +describe("suggestFix", () => { + test("parses a valid LLM JSON payload into a FixProposal", async () => { + const issue = makeIssue(); + const proposal = await suggestFix(issue, { llmCall: constantLlm(VALID_RAW_JSON) }); + expect(proposal.issueId).toBe(issue.issueId); + expect(proposal.issueType).toBe("placeholder_value"); + expect(proposal.operationKind).toBe("replace_value"); + expect(proposal.riskTier).toBe("medium"); + expect(proposal.confidence).toBe("high"); + expect(proposal.targetRowCount).toBe(5); + }); + + test("strips ```json``` code fences before parsing", async () => { + const fenced = "```json\n" + VALID_RAW_JSON + "\n```"; + const proposal = await suggestFix(makeIssue(), { llmCall: constantLlm(fenced) }); + expect(proposal.operationKind).toBe("replace_value"); + }); + + test("strips bare ``` fences before parsing", async () => { + const fenced = "```\n" + VALID_RAW_JSON + "\n```"; + const 
proposal = await suggestFix(makeIssue(), { llmCall: constantLlm(fenced) }); + expect(proposal.riskTier).toBe("medium"); + }); + + test("issueId and issueType are set from the issue, not the LLM", async () => { + // The LLM payload claims a different issueType — we ignore it and use the + // server-side issue's type to keep the contract honest. + const proposalIgnoredFields = { + ...JSON.parse(VALID_RAW_JSON), + issueId: "wrong-id-from-llm", + issueType: "outlier", + }; + const issue = makeIssue({ issueId: "iss-real-7", issueType: "missing_value" }); + const proposal = await suggestFix(issue, { + llmCall: constantLlm(JSON.stringify(proposalIgnoredFields)), + }); + expect(proposal.issueId).toBe("iss-real-7"); + expect(proposal.issueType).toBe("missing_value"); + }); + + test("throws on invalid JSON", async () => { + await expect( + suggestFix(makeIssue(), { llmCall: constantLlm("not json at all") }) + ).rejects.toThrow(/invalid JSON/); + }); + + test("throws when required field is missing", async () => { + const bad = { ...JSON.parse(VALID_RAW_JSON) }; + delete bad.operationKind; + await expect( + suggestFix(makeIssue(), { llmCall: constantLlm(JSON.stringify(bad)) }) + ).rejects.toThrow(/schema validation/); + }); + + test("throws when operationKind is not a known enum member", async () => { + const bad = { ...JSON.parse(VALID_RAW_JSON), operationKind: "delete_database" }; + await expect( + suggestFix(makeIssue(), { llmCall: constantLlm(JSON.stringify(bad)) }) + ).rejects.toThrow(/schema validation/); + }); + + test("throws when riskTier is not low|medium|high", async () => { + const bad = { ...JSON.parse(VALID_RAW_JSON), riskTier: "critical" }; + await expect( + suggestFix(makeIssue(), { llmCall: constantLlm(JSON.stringify(bad)) }) + ).rejects.toThrow(/schema validation/); + }); + + test("passes issue details into the prompt for the LLM", async () => { + let captured = ""; + const issue = makeIssue({ + issueType: "duplicate_id", + column: "sample_id", + description: 
"3 duplicate sample IDs", + }); + const proposal = await suggestFix(issue, { + llmCall: async (prompt) => { + captured = prompt; + return VALID_RAW_JSON; + }, + }); + expect(captured).toContain("duplicate_id"); + expect(captured).toContain("sample_id"); + expect(captured).toContain("3 duplicate sample IDs"); + expect(proposal).toBeDefined(); + }); + + test("propagates LLM transport errors", async () => { + const issue = makeIssue(); + await expect( + suggestFix(issue, { + llmCall: async () => { + throw new Error("connection refused"); + }, + }) + ).rejects.toThrow(/connection refused/); + }); +}); diff --git a/agent-service/src/agent/tools/dataguard/suggest-fix.ts b/agent-service/src/agent/tools/dataguard/suggest-fix.ts new file mode 100644 index 00000000000..b4e6feda995 --- /dev/null +++ b/agent-service/src/agent/tools/dataguard/suggest-fix.ts @@ -0,0 +1,137 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +// Takes a DataQualityIssue from profile_dataset and asks an LLM for a single +// concrete FixProposal. Read-only with respect to the dataset: it only +// proposes, never applies. 
+ +import { z } from "zod"; +import type { DataQualityIssue, FixProposal, RiskTier } from "../../../types/dataguard"; + +export type LlmCallFn = (prompt: string) => Promise; + +export interface SuggestFixOptions { + llmCall: LlmCallFn; +} + +const fixProposalSchema = z.object({ + action: z.string().min(1), + operationKind: z.enum([ + "replace_value", + "drop_rows", + "impute", + "flag", + "standardize", + "trim_whitespace", + "rename_column", + ]), + operationParams: z.record(z.string(), z.unknown()), + riskTier: z.enum(["low", "medium", "high"]), + reason: z.string().min(1), + evidence: z.string().min(1), + confidence: z.enum(["low", "medium", "high"]), + targetRowCount: z.number().int().nonnegative(), +}); + +const DEFAULT_RISK_TIER_BY_ISSUE: Record = { + placeholder_value: "medium", + missing_value: "medium", + duplicate_id: "high", + out_of_range: "medium", + outlier: "high", + inconsistent_label: "medium", +}; + +export async function suggestFix( + issue: DataQualityIssue, + options: SuggestFixOptions +): Promise { + const prompt = buildPrompt(issue); + const rawResponse = await options.llmCall(prompt); + const cleaned = stripCodeFences(rawResponse); + + let parsed: unknown; + try { + parsed = JSON.parse(cleaned); + } catch (e) { + throw new Error( + `suggest_fix: LLM returned invalid JSON for issue ${issue.issueId}: ${(e as Error).message}` + ); + } + + const validated = fixProposalSchema.safeParse(parsed); + if (!validated.success) { + throw new Error( + `suggest_fix: LLM proposal failed schema validation for issue ${issue.issueId}: ${validated.error.message}` + ); + } + + // Override LLM-supplied issueId/issueType with the server-side values to + // keep the contract honest: the LLM can suggest *what* to do, but it does + // not control *which* issue this proposal is bound to. 
+ return { + issueId: issue.issueId, + issueType: issue.issueType, + ...validated.data, + }; +} + +export function buildPrompt(issue: DataQualityIssue): string { + const defaultTier = DEFAULT_RISK_TIER_BY_ISSUE[issue.issueType] ?? "medium"; + return `You are a data-cleaning assistant. Propose a single concrete fix for the following data-quality issue. Reply with one JSON object only — no prose, no markdown, no fences. + +Issue: +- type: ${issue.issueType} +- column: ${issue.column} +- description: ${issue.description} +- evidence: ${issue.evidence} +- affectedRowCount: ${issue.affectedRowCount} + +Required JSON shape: +{ + "action": "", + "operationKind": "replace_value | drop_rows | impute | flag | standardize | trim_whitespace | rename_column", + "operationParams": { ...operation-specific params... }, + "riskTier": "low | medium | high", + "reason": "", + "evidence": "", + "confidence": "low | medium | high", + "targetRowCount": ${issue.affectedRowCount} +} + +operationParams by kind: +- replace_value: { "column": string, "match": any, "replacement": any } +- drop_rows: { "rowIndices": number[] } +- impute: { "column": string, "strategy": "mean" | "median" | "mode" } +- flag: { "rowIndices": number[] } +- standardize: { "column": string, "mapping": { [from: string]: string } } +- trim_whitespace: { "column": string } +- rename_column: { "from": string, "to": string } + +Default risk tier for ${issue.issueType}: ${defaultTier}. Override only with a strong reason. Prefer "flag" or "impute" over destructive "drop_rows".`; +} + +function stripCodeFences(s: string): string { + const trimmed = s.trim(); + if (!trimmed.startsWith("```")) return trimmed; + const lines = trimmed.split("\n"); + const last = lines[lines.length - 1]?.trim() ?? ""; + const sliced = last === "```" ? 
lines.slice(1, -1) : lines.slice(1); + return sliced.join("\n").trim(); +} diff --git a/agent-service/src/agent/tools/dataguard/with-approval.test.ts b/agent-service/src/agent/tools/dataguard/with-approval.test.ts new file mode 100644 index 00000000000..1fff89cb9fd --- /dev/null +++ b/agent-service/src/agent/tools/dataguard/with-approval.test.ts @@ -0,0 +1,137 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +import { describe, expect, test } from "bun:test"; +import { requestApproval, type ApprovalGateway } from "./with-approval"; +import type { FixProposal, IssueType, PermissionDecision, RiskTier } from "../../../types/dataguard"; + +function makeProposal(overrides: Partial = {}): FixProposal { + return { + issueId: "iss-1", + issueType: "placeholder_value", + action: "Replace age=999 with NULL", + operationKind: "replace_value", + operationParams: { column: "age", match: 999, replacement: null }, + riskTier: "medium", + reason: "test", + evidence: "test", + confidence: "high", + targetRowCount: 5, + ...overrides, + }; +} + +class MockGateway implements ApprovalGateway { + rules: Set = new Set(); + emitted: Array<{ stepId: string; proposal: FixProposal }> = []; + decisions: Map = new Map(); + private waiters: Map void> = new Map(); + private counter = 0; + + matchesAutoAllowRule(issueType: IssueType): boolean { + return this.rules.has(issueType); + } + generateStepId(): string { + this.counter += 1; + return `mock-step-${this.counter}`; + } + emitPendingApproval(stepId: string, proposal: FixProposal): void { + this.emitted.push({ stepId, proposal }); + } + awaitDecision(stepId: string): Promise { + if (this.decisions.has(stepId)) { + return Promise.resolve(this.decisions.get(stepId)!); + } + return new Promise(resolve => this.waiters.set(stepId, resolve)); + } + resolveLater(stepId: string, decision: PermissionDecision): void { + const w = this.waiters.get(stepId); + if (w) { + this.waiters.delete(stepId); + w(decision); + } else { + this.decisions.set(stepId, decision); + } + } +} + +describe("requestApproval", () => { + test("auto-allows low-risk fixes without prompting", async () => { + const gw = new MockGateway(); + const decision = await requestApproval(gw, makeProposal({ riskTier: "low" })); + expect(decision.verdict).toBe("auto_allow_low_risk"); + expect(gw.emitted).toHaveLength(0); + }); + + test("auto-allows when the issueType matches a remembered 
rule", async () => { + const gw = new MockGateway(); + gw.rules.add("placeholder_value"); + const decision = await requestApproval(gw, makeProposal({ riskTier: "medium" })); + expect(decision.verdict).toBe("auto_allow_remembered"); + expect(gw.emitted).toHaveLength(0); + }); + + test("medium risk without remembered rule → emits pending and waits", async () => { + const gw = new MockGateway(); + const promise = requestApproval(gw, makeProposal({ riskTier: "medium" })); + // Pending emitted synchronously before the promise resolves. + expect(gw.emitted).toHaveLength(1); + expect(gw.emitted[0].stepId).toBe("mock-step-1"); + + // Simulate user clicking Allow. + gw.resolveLater("mock-step-1", { stepId: "mock-step-1", verdict: "allow" }); + + const decision = await promise; + expect(decision.verdict).toBe("allow"); + expect(decision.stepId).toBe("mock-step-1"); + }); + + test("high risk: prompts every time even with a remembered rule", async () => { + const gw = new MockGateway(); + gw.rules.add("outlier"); + const promise = requestApproval(gw, makeProposal({ issueType: "outlier", riskTier: "high" })); + expect(gw.emitted).toHaveLength(1); + gw.resolveLater("mock-step-1", { stepId: "mock-step-1", verdict: "deny" }); + const decision = await promise; + expect(decision.verdict).toBe("deny"); + }); + + test("'modify' verdict carries through with the modifiedAction", async () => { + const gw = new MockGateway(); + const promise = requestApproval(gw, makeProposal({ riskTier: "medium" })); + gw.resolveLater("mock-step-1", { + stepId: "mock-step-1", + verdict: "modify", + modifiedAction: "Flag instead of replace", + }); + const decision = await promise; + expect(decision.verdict).toBe("modify"); + expect(decision.modifiedAction).toBe("Flag instead of replace"); + }); + + test("a decision that arrives before the tool awaits is buffered and delivered", async () => { + const gw = new MockGateway(); + // The decision is pre-recorded BEFORE the tool starts awaiting. 
This + // matches a race where the user clicks before the agent has finished + // emitting the pending step on this side. + gw.resolveLater("mock-step-1", { stepId: "mock-step-1", verdict: "allow" }); + const decision = await requestApproval(gw, makeProposal({ riskTier: "medium" })); + expect(decision.verdict).toBe("allow"); + }); +}); diff --git a/agent-service/src/agent/tools/dataguard/with-approval.ts b/agent-service/src/agent/tools/dataguard/with-approval.ts new file mode 100644 index 00000000000..454ea6de901 --- /dev/null +++ b/agent-service/src/agent/tools/dataguard/with-approval.ts @@ -0,0 +1,57 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +// The permission gate: every mutating DataGuard tool calls requestApproval() +// before doing anything. The function returns a PermissionDecision that the +// tool inspects to know whether to apply, skip, or transform its input. + +import type { FixProposal, IssueType, PermissionDecision } from "../../../types/dataguard"; + +// The set of operations the gate needs from its host. Implemented by +// TexeraAgent in production and by a mock in tests, so the gating logic +// itself can be unit-tested without a full agent or a websocket. 
+export interface ApprovalGateway { + // Does this issueType have a standing "auto-allow" rule? + matchesAutoAllowRule(issueType: IssueType): boolean; + // Mint a fresh step id used to correlate the emitted pending step with the + // user's eventual decision. + generateStepId(): string; + // Add a "pending approval" step into the conversation history and broadcast + // it to subscribed websocket clients. The frontend renders the prompt UI. + emitPendingApproval(stepId: string, proposal: FixProposal): void; + // Resolve when the decision for this step arrives via a websocket message. + awaitDecision(stepId: string): Promise; +} + +export async function requestApproval( + gateway: ApprovalGateway, + proposal: FixProposal +): Promise { + // High-risk fixes ALWAYS prompt — the "remember" rule does not apply. + // This is the same shape Claude Code uses for destructive Bash operations. + if (proposal.riskTier !== "high" && gateway.matchesAutoAllowRule(proposal.issueType)) { + return { stepId: "", verdict: "auto_allow_remembered" }; + } + if (proposal.riskTier === "low") { + return { stepId: "", verdict: "auto_allow_low_risk" }; + } + const stepId = gateway.generateStepId(); + gateway.emitPendingApproval(stepId, proposal); + return gateway.awaitDecision(stepId); +} diff --git a/agent-service/src/server.ts b/agent-service/src/server.ts index a31f9ede115..3bfa0df9f7c 100644 --- a/agent-service/src/server.ts +++ b/agent-service/src/server.ts @@ -325,6 +325,40 @@ const agentsRouter = new Elysia({ prefix: "/agents" }) }; }) + // ---------- DataGuard endpoints ---------- + + .post( + "/:id/dataguard/dataset", + ({ params: { id }, body }) => { + const agent = getAgent(id); + agent.setDataGuardDataset({ + columns: body.columns, + rows: body.rows, + }); + return { ok: true, columns: body.columns.length, rows: body.rows.length }; + }, + { + body: t.Object({ + columns: t.Array(t.String()), + rows: t.Array(t.Record(t.String(), t.Any())), + }), + } + ) + + 
.get("/:id/dataguard/session", ({ params: { id } }) => { + const agent = getAgent(id); + const session = agent.getDataGuardSession(); + const dataset = session.getDataset(); + return { + datasetRowCount: dataset?.rows.length ?? 0, + datasetColumnCount: dataset?.columns.length ?? 0, + issues: session.getIssues(), + decisionLog: session.getDecisionLog(), + flaggedRows: session.getFlaggedRows(), + autoAllowRules: session.getAutoAllowRules(), + }; + }) + .get("/:id/operator-types", ({ params: { id } }) => { const agent = getAgent(id); const metadataStore = agent.getMetadataStore(); @@ -403,9 +437,15 @@ const agentsRouter = new Elysia({ prefix: "/agents" }) ); interface WsMessage { - type: "message" | "stop"; + type: "message" | "stop" | "decision"; content?: string; messageSource?: "chat" | "feedback"; + // Fields below carry the user's verdict on a pending-approval step. + // Used when type === "decision". See agent/tools/dataguard/with-approval.ts. + stepId?: string; + verdict?: "allow" | "deny" | "modify"; + modifiedAction?: string; + remember?: boolean; } interface OperatorResultSummaryWs { @@ -532,6 +572,29 @@ export function buildApp() { return; } + if (msg.type === "decision") { + if (!msg.stepId || !msg.verdict) { + ws.send( + JSON.stringify({ + type: "error", + error: "decision requires stepId and verdict", + }) + ); + return; + } + const resolved = agent.resolveDecision(msg.stepId, { + stepId: msg.stepId, + verdict: msg.verdict, + modifiedAction: msg.modifiedAction, + remember: msg.remember, + }); + wsLog.info( + { agentId, stepId: msg.stepId, verdict: msg.verdict, resolved }, + "received user decision" + ); + return; + } + if (msg.type === "message") { if (!msg.content || typeof msg.content !== "string") { ws.send(JSON.stringify({ type: "error", error: "Message content is required" })); diff --git a/agent-service/src/types/agent.ts b/agent-service/src/types/agent.ts index 765f5a7cb46..cef2db3e618 100644 --- a/agent-service/src/types/agent.ts +++ 
b/agent-service/src/types/agent.ts @@ -18,6 +18,7 @@ */ import type { WorkflowContent } from "./workflow"; +import type { FixProposal, RiskTier } from "./dataguard"; export enum AgentState { UNAVAILABLE = "UNAVAILABLE", @@ -35,6 +36,12 @@ export interface TokenUsage { export const INITIAL_STEP_ID = "step-initial"; +export interface PendingApproval { + toolName: string; + proposal: FixProposal; + riskTier: RiskTier; +} + export interface ReActStep { id: string; parentId?: string; @@ -60,6 +67,10 @@ export interface ReActStep { messageSource?: "chat" | "feedback"; beforeWorkflowContent?: WorkflowContent; afterWorkflowContent?: WorkflowContent; + // Present on a step that is awaiting user approval. The agent's ReAct loop + // pauses (inside the mutating tool's execute fn) until a decision WS message + // arrives. See agent/tools/dataguard/with-approval.ts. + pendingApproval?: PendingApproval; } export enum OperatorResultSerializationMode { diff --git a/agent-service/src/types/dataguard.test.ts b/agent-service/src/types/dataguard.test.ts index c256c7e4709..53047e08879 100644 --- a/agent-service/src/types/dataguard.test.ts +++ b/agent-service/src/types/dataguard.test.ts @@ -68,6 +68,7 @@ describe("DataGuard type shapes", () => { test("FixProposal: replace-value, medium risk, high confidence", () => { const proposal: FixProposal = { issueId: "iss-1", + issueType: "placeholder_value", action: "Replace age = 999 with NULL", operationKind: "replace_value", operationParams: { column: "age", match: 999, replacement: null }, @@ -85,6 +86,7 @@ describe("DataGuard type shapes", () => { test("FixProposal: drop-rows, high risk (the storyboard 'deny' case)", () => { const proposal: FixProposal = { issueId: "iss-3", + issueType: "outlier", action: "Drop 3 rows with BMI > 60", operationKind: "drop_rows", operationParams: { rowIndices: [55, 211, 433] }, diff --git a/agent-service/src/types/dataguard.ts b/agent-service/src/types/dataguard.ts index 413e2e37814..d5b17ac1654 100644 --- 
a/agent-service/src/types/dataguard.ts +++ b/agent-service/src/types/dataguard.ts @@ -64,6 +64,7 @@ export interface DataQualityIssue { export interface FixProposal { issueId: string; + issueType: IssueType; action: string; operationKind: FixOperationKind; operationParams: Record; From 13290ac839354022a97f57183eea865178444e46 Mon Sep 17 00:00:00 2001 From: eugenegujing Date: Thu, 14 May 2026 19:34:27 -0700 Subject: [PATCH 03/14] feat(frontend): DataGuard permission prompt + auto-trigger hook MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the chat-panel UX: a standalone PermissionPromptComponent that renders inline on any ReActStep with pendingApproval (Allow / Deny / Modify / Allow & remember), AgentService.sendDecision wiring the new WS message, and DataGuardAutoTriggerService that fires when a dataset-reading operator (CSVFileScan, TableFileScan, JSONFileScan, ParallelCSVFileScan) is added to the workflow. AgentPanelComponent subscribes and surfaces a notification — full agent-creation flow remains a follow-up. 
--- .../agent-chat/agent-chat.component.html | 7 ++ .../agent-chat/agent-chat.component.ts | 2 + .../agent-panel/agent-panel.component.ts | 19 ++++- .../permission-prompt.component.html | 54 +++++++++++++ .../permission-prompt.component.scss | 78 ++++++++++++++++++ .../permission-prompt.component.ts | 80 ++++++++++++++++++ .../workspace/service/agent/agent-types.ts | 21 +++++ .../workspace/service/agent/agent.service.ts | 32 ++++++++ .../agent/data-guard-auto-trigger.service.ts | 81 +++++++++++++++++++ 9 files changed, 373 insertions(+), 1 deletion(-) create mode 100644 frontend/src/app/workspace/component/agent/agent-panel/permission-prompt/permission-prompt.component.html create mode 100644 frontend/src/app/workspace/component/agent/agent-panel/permission-prompt/permission-prompt.component.scss create mode 100644 frontend/src/app/workspace/component/agent/agent-panel/permission-prompt/permission-prompt.component.ts create mode 100644 frontend/src/app/workspace/service/agent/data-guard-auto-trigger.service.ts diff --git a/frontend/src/app/workspace/component/agent/agent-panel/agent-chat/agent-chat.component.html b/frontend/src/app/workspace/component/agent/agent-panel/agent-chat/agent-chat.component.html index d650a0a146b..262da7db2d3 100644 --- a/frontend/src/app/workspace/component/agent/agent-panel/agent-chat/agent-chat.component.html +++ b/frontend/src/app/workspace/component/agent/agent-panel/agent-chat/agent-chat.component.html @@ -123,6 +123,13 @@ style="color: #8c8c8c; font-style: italic"> Execute {{ response.toolCalls.length }} tool{{ response.toolCalls.length > 1 ? 's' : '' }} + + + + + + + + +
+ +
+ + +
+
+ + +
+ Decision sent — waiting for the agent to continue. +
diff --git a/frontend/src/app/workspace/component/agent/agent-panel/permission-prompt/permission-prompt.component.scss b/frontend/src/app/workspace/component/agent/agent-panel/permission-prompt/permission-prompt.component.scss new file mode 100644 index 00000000000..f0f32539e43 --- /dev/null +++ b/frontend/src/app/workspace/component/agent/agent-panel/permission-prompt/permission-prompt.component.scss @@ -0,0 +1,78 @@ +.dg-permission { + border: 1px solid #e5b800; + border-radius: 8px; + padding: 12px; + margin: 8px 0; + background: #fffbe6; + font-size: 0.9rem; +} + +.dg-permission--resolved { + background: #f0f8ff; + border-color: #91d5ff; + color: #1890ff; + font-style: italic; +} + +.dg-permission__header { + display: flex; + justify-content: space-between; + align-items: center; + margin-bottom: 8px; +} + +.dg-permission__tier { + font-size: 0.75rem; + font-weight: 600; + padding: 2px 8px; + border-radius: 4px; + text-transform: uppercase; + letter-spacing: 0.5px; +} + +.dg-permission__tier--low { + background: #d9f7be; + color: #389e0d; +} + +.dg-permission__tier--medium { + background: #fff1b8; + color: #d48806; +} + +.dg-permission__tier--high { + background: #ffccc7; + color: #cf1322; +} + +.dg-permission__body { + margin: 8px 0; +} + +.dg-permission__field { + margin: 4px 0; + line-height: 1.4; +} + +.dg-permission__label { + font-weight: 600; + display: inline-block; + min-width: 90px; + color: #595959; +} + +.dg-permission__actions { + display: flex; + gap: 8px; + margin-top: 8px; + flex-wrap: wrap; +} + +.dg-permission__modify { + margin-top: 8px; + + textarea { + width: 100%; + margin-bottom: 8px; + } +} diff --git a/frontend/src/app/workspace/component/agent/agent-panel/permission-prompt/permission-prompt.component.ts b/frontend/src/app/workspace/component/agent/agent-panel/permission-prompt/permission-prompt.component.ts new file mode 100644 index 00000000000..7423d9aefb8 --- /dev/null +++ 
b/frontend/src/app/workspace/component/agent/agent-panel/permission-prompt/permission-prompt.component.ts @@ -0,0 +1,80 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import { Component, Input } from "@angular/core"; +import { NgIf } from "@angular/common"; +import { FormsModule } from "@angular/forms"; +import { NzButtonComponent } from "ng-zorro-antd/button"; +import { NzInputDirective } from "ng-zorro-antd/input"; +import { ReActStep } from "../../../../service/agent/agent-types"; +import { AgentService } from "../../../../service/agent/agent.service"; + +/** + * DataGuard permission prompt. Rendered inline in the agent chat panel for + * any ReActStep whose `pendingApproval` field is set. The user's click sends + * a {type:"decision", stepId, verdict, ...} message over the agent WS; the + * server-side ReAct loop resumes once the awaiting tool promise resolves. 
+ */ +@Component({ + selector: "texera-permission-prompt", + standalone: true, + imports: [NgIf, FormsModule, NzButtonComponent, NzInputDirective], + templateUrl: "./permission-prompt.component.html", + styleUrls: ["./permission-prompt.component.scss"], +}) +export class PermissionPromptComponent { + @Input() step!: ReActStep; + @Input() agentId!: string; + + public isModifying = false; + public modifiedAction = ""; + public submitted = false; + + constructor(private readonly agentService: AgentService) {} + + public onAllow(remember: boolean): void { + if (this.submitted) return; + this.submitted = true; + this.agentService.sendDecision(this.agentId, this.step.id, "allow", { remember }); + } + + public onDeny(): void { + if (this.submitted) return; + this.submitted = true; + this.agentService.sendDecision(this.agentId, this.step.id, "deny"); + } + + public openModify(): void { + if (this.submitted) return; + this.isModifying = true; + this.modifiedAction = this.step.pendingApproval?.proposal.action ?? ""; + } + + public submitModify(): void { + if (this.submitted) return; + this.submitted = true; + this.agentService.sendDecision(this.agentId, this.step.id, "modify", { + modifiedAction: this.modifiedAction, + }); + } + + public cancelModify(): void { + this.isModifying = false; + } +} diff --git a/frontend/src/app/workspace/service/agent/agent-types.ts b/frontend/src/app/workspace/service/agent/agent-types.ts index c687de472a2..c9b3ed29cab 100644 --- a/frontend/src/app/workspace/service/agent/agent-types.ts +++ b/frontend/src/app/workspace/service/agent/agent-types.ts @@ -78,4 +78,25 @@ export interface ReActStep { beforeWorkflowContent?: any; /** Workflow state after this step executed */ afterWorkflowContent?: any; + /** + * DataGuard: a mutating tool is awaiting user approval. When this field is + * set, the chat panel renders the permission-prompt UI (Allow / Deny / + * Modify / Allow & remember). 
The agent's ReAct loop is paused server-side + * until a WS {type:"decision", stepId, verdict} message resolves it. + */ + pendingApproval?: { + toolName: string; + riskTier: "low" | "medium" | "high"; + proposal: { + issueId: string; + issueType: string; + action: string; + operationKind: string; + operationParams: Record; + reason: string; + evidence: string; + confidence: "low" | "medium" | "high"; + targetRowCount: number; + }; + }; } diff --git a/frontend/src/app/workspace/service/agent/agent.service.ts b/frontend/src/app/workspace/service/agent/agent.service.ts index 2009734030b..01c10b8ea00 100644 --- a/frontend/src/app/workspace/service/agent/agent.service.ts +++ b/frontend/src/app/workspace/service/agent/agent.service.ts @@ -965,6 +965,38 @@ export class AgentService { }); } + /** + * DataGuard: send the user's verdict on a pending-approval step. + * Resolves the awaiting tool execution server-side and lets the ReAct loop + * continue. `remember` (when verdict === "allow") registers an auto-allow + * rule for the issueType so subsequent matching issues skip the prompt. + */ + public sendDecision( + agentId: string, + stepId: string, + verdict: "allow" | "deny" | "modify", + options: { modifiedAction?: string; remember?: boolean } = {} + ): void { + const tracking = this.agentStateTracking.get(agentId); + if (!tracking?.websocket || tracking.websocket.readyState !== WebSocket.OPEN) { + console.error(`Agent ${agentId}: cannot send decision — WebSocket not open`); + return; + } + try { + tracking.websocket.send( + JSON.stringify({ + type: "decision", + stepId, + verdict, + modifiedAction: options.modifiedAction, + remember: options.remember, + }) + ); + } catch (error) { + console.error("Failed to send DataGuard decision:", error); + } + } + /** * Stop generation for an agent via WebSocket. 
*/ diff --git a/frontend/src/app/workspace/service/agent/data-guard-auto-trigger.service.ts b/frontend/src/app/workspace/service/agent/data-guard-auto-trigger.service.ts new file mode 100644 index 00000000000..fc6a7242f1b --- /dev/null +++ b/frontend/src/app/workspace/service/agent/data-guard-auto-trigger.service.ts @@ -0,0 +1,81 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import { Injectable } from "@angular/core"; +import { Observable, filter, map } from "rxjs"; +import { OperatorPredicate } from "../../types/workflow-common.interface"; +import { WorkflowActionService } from "../workflow-graph/model/workflow-action.service"; + +/** + * DataGuard auto-trigger. + * + * Watches the texera-graph for newly-added dataset-reading operators + * (CSVFileScan, TableFileScan, JSONFileScan, …) and emits a hint that the + * agent-panel should auto-launch a DataGuard agent and open the chat panel. + * + * The actual agent-creation flow (POST /agents → POST /agents/:id/dataguard/dataset + * → activate websocket → send "scan this dataset") is owned by the panel + * component. This service is the *event source*, not the orchestrator. 
+ * + * Wire from a panel: + * + * constructor(private trigger: DataGuardAutoTriggerService) {} + * ngOnInit() { + * this.trigger.getDatasetAddedStream().pipe(untilDestroyed(this)).subscribe(op => { + * // create agent, load dataset, send message... + * }); + * } + */ +@Injectable({ providedIn: "root" }) +export class DataGuardAutoTriggerService { + // Operator types that imply "the user just brought a tabular dataset onto + // the canvas." Extend cautiously — every type here triggers DataGuard. + private static readonly DATASET_OPERATOR_TYPES = new Set([ + "CSVFileScan", + "TableFileScan", + "JSONFileScan", + "ParallelCSVFileScan", + ]); + + constructor(private readonly workflowActionService: WorkflowActionService) {} + + /** + * Emits an OperatorPredicate every time a dataset-reading operator is + * added to the workflow. Subscribers should react by auto-launching a + * DataGuard agent and loading the referenced dataset. + */ + public getDatasetAddedStream(): Observable { + return this.workflowActionService + .getTexeraGraph() + .getOperatorAddStream() + .pipe( + filter((op: OperatorPredicate) => + DataGuardAutoTriggerService.DATASET_OPERATOR_TYPES.has(op.operatorType) + ), + map((op: OperatorPredicate) => op) + ); + } + + /** + * For tests / debugging: is a given operatorType one we'd auto-trigger on? + */ + public isDatasetOperatorType(operatorType: string): boolean { + return DataGuardAutoTriggerService.DATASET_OPERATOR_TYPES.has(operatorType); + } +} From b84a457402649a72dc8859f2d752807190c5bcbf Mon Sep 17 00:00:00 2001 From: eugenegujing Date: Thu, 14 May 2026 19:41:11 -0700 Subject: [PATCH 04/14] refactor(agent-service): group DataGuard tests under __tests__/ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dataguard/ folder had 8 source + 8 test files in one directory and was getting hard to scan. 
Move the seven DataGuard-specific test files into a __tests__/ subdirectory and update their relative imports (one extra ../). The types/dataguard.test.ts stays put — it's in src/types/, not under dataguard/, and that folder isn't crowded. bun test still auto-discovers; all 159 tests still pass. --- .../agent/tools/dataguard/{ => __tests__}/apply-fix.test.ts | 6 +++--- .../tools/dataguard/{ => __tests__}/bias-check.test.ts | 4 ++-- .../dataguard/{ => __tests__}/dataguard-session.test.ts | 6 +++--- .../tools/dataguard/{ => __tests__}/decision-log.test.ts | 4 ++-- .../tools/dataguard/{ => __tests__}/profile-dataset.test.ts | 4 ++-- .../tools/dataguard/{ => __tests__}/suggest-fix.test.ts | 4 ++-- .../tools/dataguard/{ => __tests__}/with-approval.test.ts | 4 ++-- 7 files changed, 16 insertions(+), 16 deletions(-) rename agent-service/src/agent/tools/dataguard/{ => __tests__}/apply-fix.test.ts (98%) rename agent-service/src/agent/tools/dataguard/{ => __tests__}/bias-check.test.ts (97%) rename agent-service/src/agent/tools/dataguard/{ => __tests__}/dataguard-session.test.ts (95%) rename agent-service/src/agent/tools/dataguard/{ => __tests__}/decision-log.test.ts (96%) rename agent-service/src/agent/tools/dataguard/{ => __tests__}/profile-dataset.test.ts (98%) rename agent-service/src/agent/tools/dataguard/{ => __tests__}/suggest-fix.test.ts (97%) rename agent-service/src/agent/tools/dataguard/{ => __tests__}/with-approval.test.ts (97%) diff --git a/agent-service/src/agent/tools/dataguard/apply-fix.test.ts b/agent-service/src/agent/tools/dataguard/__tests__/apply-fix.test.ts similarity index 98% rename from agent-service/src/agent/tools/dataguard/apply-fix.test.ts rename to agent-service/src/agent/tools/dataguard/__tests__/apply-fix.test.ts index 1546102b4dd..2f7e706b380 100644 --- a/agent-service/src/agent/tools/dataguard/apply-fix.test.ts +++ b/agent-service/src/agent/tools/dataguard/__tests__/apply-fix.test.ts @@ -18,9 +18,9 @@ */ import { describe, expect, test } from 
"bun:test"; -import { applyFix } from "./apply-fix"; -import type { DatasetView } from "./dataset"; -import type { FixProposal } from "../../../types/dataguard"; +import { applyFix } from "../apply-fix"; +import type { DatasetView } from "../dataset"; +import type { FixProposal } from "../../../../types/dataguard"; function makeProposal(overrides: Partial = {}): FixProposal { return { diff --git a/agent-service/src/agent/tools/dataguard/bias-check.test.ts b/agent-service/src/agent/tools/dataguard/__tests__/bias-check.test.ts similarity index 97% rename from agent-service/src/agent/tools/dataguard/bias-check.test.ts rename to agent-service/src/agent/tools/dataguard/__tests__/bias-check.test.ts index 04755ba8558..791637396c1 100644 --- a/agent-service/src/agent/tools/dataguard/bias-check.test.ts +++ b/agent-service/src/agent/tools/dataguard/__tests__/bias-check.test.ts @@ -18,8 +18,8 @@ */ import { describe, expect, test } from "bun:test"; -import { computeBiasCheck } from "./bias-check"; -import type { DatasetView } from "./dataset"; +import { computeBiasCheck } from "../bias-check"; +import type { DatasetView } from "../dataset"; describe("computeBiasCheck", () => { test("identical before/after → 100% retention, no skew", () => { diff --git a/agent-service/src/agent/tools/dataguard/dataguard-session.test.ts b/agent-service/src/agent/tools/dataguard/__tests__/dataguard-session.test.ts similarity index 95% rename from agent-service/src/agent/tools/dataguard/dataguard-session.test.ts rename to agent-service/src/agent/tools/dataguard/__tests__/dataguard-session.test.ts index 96f7c2fa43c..27afdb8346f 100644 --- a/agent-service/src/agent/tools/dataguard/dataguard-session.test.ts +++ b/agent-service/src/agent/tools/dataguard/__tests__/dataguard-session.test.ts @@ -18,9 +18,9 @@ */ import { describe, expect, test } from "bun:test"; -import { DataGuardSession } from "./dataguard-session"; -import type { DatasetView } from "./dataset"; -import type { DataQualityIssue, 
FixProposal } from "../../../types/dataguard"; +import { DataGuardSession } from "../dataguard-session"; +import type { DatasetView } from "../dataset"; +import type { DataQualityIssue, FixProposal } from "../../../../types/dataguard"; function makeIssue(): DataQualityIssue { return { diff --git a/agent-service/src/agent/tools/dataguard/decision-log.test.ts b/agent-service/src/agent/tools/dataguard/__tests__/decision-log.test.ts similarity index 96% rename from agent-service/src/agent/tools/dataguard/decision-log.test.ts rename to agent-service/src/agent/tools/dataguard/__tests__/decision-log.test.ts index 242259d2ecf..f31f1adbaa0 100644 --- a/agent-service/src/agent/tools/dataguard/decision-log.test.ts +++ b/agent-service/src/agent/tools/dataguard/__tests__/decision-log.test.ts @@ -18,8 +18,8 @@ */ import { describe, expect, test } from "bun:test"; -import { serializeDecisionLogCsv } from "./decision-log"; -import type { DecisionLogEntry } from "../../../types/dataguard"; +import { serializeDecisionLogCsv } from "../decision-log"; +import type { DecisionLogEntry } from "../../../../types/dataguard"; function entry(overrides: Partial = {}): DecisionLogEntry { return { diff --git a/agent-service/src/agent/tools/dataguard/profile-dataset.test.ts b/agent-service/src/agent/tools/dataguard/__tests__/profile-dataset.test.ts similarity index 98% rename from agent-service/src/agent/tools/dataguard/profile-dataset.test.ts rename to agent-service/src/agent/tools/dataguard/__tests__/profile-dataset.test.ts index d48d828c98d..0586e338445 100644 --- a/agent-service/src/agent/tools/dataguard/profile-dataset.test.ts +++ b/agent-service/src/agent/tools/dataguard/__tests__/profile-dataset.test.ts @@ -18,8 +18,8 @@ */ import { describe, expect, test } from "bun:test"; -import { profileDataset } from "./profile-dataset"; -import type { DatasetView } from "./dataset"; +import { profileDataset } from "../profile-dataset"; +import type { DatasetView } from "../dataset"; 
describe("profileDataset", () => { test("clean dataset → empty issue list", () => { diff --git a/agent-service/src/agent/tools/dataguard/suggest-fix.test.ts b/agent-service/src/agent/tools/dataguard/__tests__/suggest-fix.test.ts similarity index 97% rename from agent-service/src/agent/tools/dataguard/suggest-fix.test.ts rename to agent-service/src/agent/tools/dataguard/__tests__/suggest-fix.test.ts index 3d3ab4dc632..99edc46304e 100644 --- a/agent-service/src/agent/tools/dataguard/suggest-fix.test.ts +++ b/agent-service/src/agent/tools/dataguard/__tests__/suggest-fix.test.ts @@ -18,8 +18,8 @@ */ import { describe, expect, test } from "bun:test"; -import { suggestFix, type LlmCallFn } from "./suggest-fix"; -import type { DataQualityIssue } from "../../../types/dataguard"; +import { suggestFix, type LlmCallFn } from "../suggest-fix"; +import type { DataQualityIssue } from "../../../../types/dataguard"; function makeIssue(overrides: Partial = {}): DataQualityIssue { return { diff --git a/agent-service/src/agent/tools/dataguard/with-approval.test.ts b/agent-service/src/agent/tools/dataguard/__tests__/with-approval.test.ts similarity index 97% rename from agent-service/src/agent/tools/dataguard/with-approval.test.ts rename to agent-service/src/agent/tools/dataguard/__tests__/with-approval.test.ts index 1fff89cb9fd..1cfbbb0b67b 100644 --- a/agent-service/src/agent/tools/dataguard/with-approval.test.ts +++ b/agent-service/src/agent/tools/dataguard/__tests__/with-approval.test.ts @@ -18,8 +18,8 @@ */ import { describe, expect, test } from "bun:test"; -import { requestApproval, type ApprovalGateway } from "./with-approval"; -import type { FixProposal, IssueType, PermissionDecision, RiskTier } from "../../../types/dataguard"; +import { requestApproval, type ApprovalGateway } from "../with-approval"; +import type { FixProposal, IssueType, PermissionDecision, RiskTier } from "../../../../types/dataguard"; function makeProposal(overrides: Partial = {}): FixProposal { return { 
From 2ad7bfb79781212502a1a0b008419f23dbeaf01a Mon Sep 17 00:00:00 2001 From: eugenegujing Date: Sat, 16 May 2026 00:41:51 -0700 Subject: [PATCH 05/14] feat(dataguard): checklist UI, contract hardening, outlier reshape, end-to-end MVP polish MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Major iteration on top of the four committed DataGuard MVP commits. Ships the auto-trigger checklist as the primary UX (chat flow stays wired for any future DataGuard-via-LLM path but isn't used by the user-facing flow). Detector model: five categories (was six). The z-score outlier detector was dropped — clustered legitimate extremes (e.g. sustained high glucose readings) were being flagged en masse with no good way for the user to say "those are real." The old `out_of_range` was renamed `outlier` and keeps its validRanges-based definition. `duplicate_id` now auto-infers the ID column from name patterns (`id`, `*_id`, `*Id`, `id_*`, `uid`) so the auto-trigger's empty-body `/scan` still catches duplicates without UI configuration. Fix-operation model: `flag` removed entirely (was a no-op against the data; caused LakeFS "No changes detected" errors after Apply). `warning` added to RiskTier — concrete fix, always prompts, no "remember." `replace_value` now supports `rowIndices` targeting (deterministic; wins over value-based `match`), which fixes a class of silent no-ops where LLM-rounded match values didn't equal the actual cells. Suggest-fix prompt explicitly passes `affectedRowIndices` and instructs `rowIndices`-based replace for outliers. Permission contract: `/apply-batch` body schema is `verdict: "allow" | "deny"` with `additionalProperties: false`. The Elysia app is built with `normalize: false` so unknown legacy fields aren't silently stripped before validation. Global onError converts `code === "VALIDATION"` to HTTP 400. Runtime check rejects `{verdict: "deny", remember: true}`. WS decision handler narrowed the same way. 
`modifiedAction` removed everywhere, including the decision-log CSV header (9 columns now). Checklist UX: - Auto-trigger CSV-only (`CSVFileScan`, `ParallelCSVFileScan`). JSON/Parquet were trigger-set members but `loadFromOperatorFile` blindly Papa-parsed, producing garbage. - Floating, draggable panel via Angular CDK; cdkDragBoundary="body" so it can't get lost behind the toolbar. - 📍 locate button per row: cycles through `affectedRowIndices` on repeat clicks (per-row cursor in a `Map`, wraps at length). Tooltip previews the next click position. - Result-panel integration via `DataGuardRowNavigatorService` (ReplaySubject with 500ms TTL for cold-mount). `ResultTableFrameComponent` adds a private `pageRendered$ Subject` so the highlight only fires after the new page actually renders, not on an arbitrary 100ms timer. Cross-operator race, viewport-resize during page swap, columnDef-vs-header drift, destroyed-view NG0911 — all guarded. - After-Apply auto-rescan: the panel re-runs `/scan` against the new dataset version so users see real residue instead of stale entries. - Floating reopen icon when panel is closed & shield is ON. Click → fresh scan. Pipeline concurrency is gated by `currentPipeline: Promise | null`: auto-trigger drops silently on slot conflict (spam suppression); user-initiated awaits the in-flight pipeline (with a "queuing your scan…" toast) — at most one `/scan` POST in flight at a time. - Toolbar 🛡 shield (per-workflow ON/OFF, localStorage). Backend: server normalize:false; `/dataguard/load-demo-dataset` deleted as dead code (frontend never called it). DataGuardSession's `flaggedRows` field plus the post-apply `acknowledgedIssues` split removed alongside `flag`. `missing-detection.ts` is the single source of truth for missing/placeholder checks; `applyFix` threads `ScanOptions.missingTokens` through to `impute`. With-approval gates `warning` identically to `high` (never auto-allows, even with a remembered rule). 
Frontend service ownership: the auto-trigger orchestration subscription moved off `AgentPanelComponent` onto `DataGuardChecklistComponent` (the natural consumer of its output). `selectedCount`/`deniedCount`/`pendingCount` are cached on each state push instead of being three filter walks per CD tick. README_DataGuard.md is the consolidated feature spec, kept in repo root. Tests: 199 pass / 0 fail (419 expect calls), agent-service typecheck clean, frontend `tsc --noEmit` clean. New regression-locks include: `replace_value` with `rowIndices` (no more byte-identical export), `inferIdColumn` for all id-name patterns, "clustered large readings are NOT auto-outliers", warning tier prompts even with remembered rule, modify-verdict + modifiedAction + `{deny, remember:true}` rejection, pipeline serialization (two user-initiated rescans never invoke `/scan` concurrently), per-row locate cycle math. Co-Authored-By: Claude Opus 4.7 --- README_DataGuard.md | 556 ++++++++++++++++ agent-service/demo/README.md | 42 +- agent-service/demo/duplicate_rows_demo.csv | 31 + .../demo/inconsistent_labels_demo.csv | 31 + agent-service/demo/missing_values_demo.csv | 31 + agent-service/demo/outliers_demo.csv | 31 + .../demo/placeholder_values_demo.csv | 31 + agent-service/src/agent/texera-agent.ts | 15 + .../apply-batch-modify-reject.test.ts | 156 +++++ .../__tests__/apply-batch-rescan.test.ts | 111 ++++ .../dataguard/__tests__/apply-fix.test.ts | 140 +++- .../__tests__/dataguard-session.test.ts | 7 - .../__tests__/decision-log-no-modify.test.ts | 128 ++++ .../dataguard/__tests__/decision-log.test.ts | 12 +- .../__tests__/permission-types.test.ts | 121 ++++ .../__tests__/profile-dataset.test.ts | 170 ++++- .../__tests__/with-approval-no-modify.test.ts | 117 ++++ .../dataguard/__tests__/with-approval.test.ts | 27 +- .../src/agent/tools/dataguard/apply-fix.ts | 86 ++- .../tools/dataguard/dataguard-session.ts | 34 +- .../agent/tools/dataguard/dataguard-tools.ts | 27 +- 
.../src/agent/tools/dataguard/decision-log.ts | 2 - .../tools/dataguard/missing-detection.ts | 86 +++ .../agent/tools/dataguard/profile-dataset.ts | 197 ++++-- .../src/agent/tools/dataguard/suggest-fix.ts | 38 +- .../agent/tools/dataguard/with-approval.ts | 11 +- agent-service/src/server.ts | 273 +++++++- agent-service/src/types/dataguard.test.ts | 43 +- agent-service/src/types/dataguard.ts | 18 +- common/config/src/main/resources/gui.conf | 2 +- .../agent-panel/agent-panel.component.ts | 18 +- .../permission-prompt.component.html | 17 +- .../permission-prompt.component.scss | 14 +- .../permission-prompt.component.ts | 24 +- .../dataguard-checklist.component.html | 160 +++++ .../dataguard-checklist.component.scss | 319 +++++++++ .../dataguard-checklist.component.ts | 369 +++++++++++ .../component/menu/menu.component.html | 22 + .../component/menu/menu.component.scss | 18 + .../component/menu/menu.component.ts | 33 +- .../result-table-frame.component.html | 4 +- .../result-table-frame.component.scss | 26 + .../result-table-frame.component.ts | 96 ++- .../component/workspace.component.html | 4 + .../component/workspace.component.ts | 2 + .../workspace/service/agent/agent-types.ts | 2 +- .../workspace/service/agent/agent.service.ts | 5 +- .../data-guard-auto-trigger.service.spec.ts | 259 ++++++++ .../agent/data-guard-auto-trigger.service.ts | 622 ++++++++++++++++-- .../agent/data-guard-results.service.spec.ts | 178 +++++ .../agent/data-guard-results.service.ts | 132 ++++ .../data-guard-row-navigator.service.spec.ts | 122 ++++ .../agent/data-guard-row-navigator.service.ts | 95 +++ .../agent/data-guard-settings.service.ts | 88 +++ 54 files changed, 4854 insertions(+), 349 deletions(-) create mode 100644 README_DataGuard.md create mode 100644 agent-service/demo/duplicate_rows_demo.csv create mode 100644 agent-service/demo/inconsistent_labels_demo.csv create mode 100644 agent-service/demo/missing_values_demo.csv create mode 100644 agent-service/demo/outliers_demo.csv 
create mode 100644 agent-service/demo/placeholder_values_demo.csv create mode 100644 agent-service/src/agent/tools/dataguard/__tests__/apply-batch-modify-reject.test.ts create mode 100644 agent-service/src/agent/tools/dataguard/__tests__/apply-batch-rescan.test.ts create mode 100644 agent-service/src/agent/tools/dataguard/__tests__/decision-log-no-modify.test.ts create mode 100644 agent-service/src/agent/tools/dataguard/__tests__/permission-types.test.ts create mode 100644 agent-service/src/agent/tools/dataguard/__tests__/with-approval-no-modify.test.ts create mode 100644 agent-service/src/agent/tools/dataguard/missing-detection.ts create mode 100644 frontend/src/app/workspace/component/dataguard-checklist/dataguard-checklist.component.html create mode 100644 frontend/src/app/workspace/component/dataguard-checklist/dataguard-checklist.component.scss create mode 100644 frontend/src/app/workspace/component/dataguard-checklist/dataguard-checklist.component.ts create mode 100644 frontend/src/app/workspace/service/agent/data-guard-auto-trigger.service.spec.ts create mode 100644 frontend/src/app/workspace/service/agent/data-guard-results.service.spec.ts create mode 100644 frontend/src/app/workspace/service/agent/data-guard-results.service.ts create mode 100644 frontend/src/app/workspace/service/agent/data-guard-row-navigator.service.spec.ts create mode 100644 frontend/src/app/workspace/service/agent/data-guard-row-navigator.service.ts create mode 100644 frontend/src/app/workspace/service/agent/data-guard-settings.service.ts diff --git a/README_DataGuard.md b/README_DataGuard.md new file mode 100644 index 00000000000..baf0dafbf2a --- /dev/null +++ b/README_DataGuard.md @@ -0,0 +1,556 @@ +# DataGuard — Permission-Gated Data Cleaning for Texera + +> **Tagline:** AI suggests. Humans authorize. Texera records. 
+> +> **One-sentence pitch:** DataGuard is a conversational agent inside Texera that proposes data-cleaning actions one at a time and asks the user's permission before applying each — the Claude Code experience, but for data instead of code. + +--- + +## 1. Problem + +Data cleaning is rarely "just technical." Cleaning decisions can introduce bias, remove rare-but-meaningful cases, or silently change the meaning of a dataset — especially dangerous in scientific or high-stakes data. + +Typical pain points: +- A missing glucose value may not be random. +- An `age = 999` value may be a placeholder, not a real age. +- A duplicate sample ID with conflicting labels may need expert review. +- A statistical outlier may be a meaningful rare case. + +Today, two common workflows both fail in different ways: + +| Approach | Failure mode | +|---|---| +| Manual scripts (pandas in a notebook) | Opaque, hard to audit, no provenance, doesn't scale beyond one person | +| Auto-clean tools | Black-box decisions, no explanation, no human control over high-impact actions | + +**DataGuard's claim:** the *interaction model* is the missing piece, not the algorithms. Treat data-cleaning decisions the way Claude Code treats file edits — **ask permission, explain reasoning, log every decision.** + +--- + +## 2. Why this design + +Texera's execution engine (Amber) is a streaming, actor-based system. It does **not** natively support pausing a workflow mid-execution to wait for a user click on an in-canvas approval table. Building such a "pause-and-await-user" operator would require changes in three places (Amber engine, gRPC control protocol, Angular result panel). 
+
+DataGuard sidesteps all of this by living in `agent-service/` rather than in the workflow graph:
+
+| Layer | Reused |
+|---|---|
+| Conversation state | Existing agent-service session management |
+| Permission UX | Same UI pattern Claude Code uses for tool authorization |
+| LLM gateway | The existing `LLM_ENDPOINT` (OpenAI-compatible) wired into `agent-service` |
+| Data processing | TypeScript pure functions in agent-service — no new operators required |
+| Workflow execution | Existing Texera workflow API (no new gRPC, no Amber changes) |
+
+**No Amber changes, no new operators, no new protocols.**
+
+---
+
+## 3. User experience
+
+### 3.1 Trigger
+
+The user does **nothing special**. When a dataset-reading operator is added to the workflow canvas — currently `CSVFileScan` or `ParallelCSVFileScan` — the auto-trigger fires:
+
+1. Resolves the workflow context (id, per-workflow shield setting).
+2. Finds or creates a per-workflow agent on agent-service.
+3. Reads the operator's `fileName`, fetches the bytes via `DatasetService`, parses with `papaparse`.
+4. POSTs `{columns, rows}` to `/dataguard/dataset` so the agent has the data in memory.
+5. POSTs `/dataguard/scan` — the server runs `profile_dataset` then `suggest_fix` per issue, **bypassing the LLM ReAct loop** (so the LLM can't decide to call `deleteOperator` and vaporize the user's workflow).
+6. Publishes the scan result to `DataGuardResultsService`.
+
+The dedicated `DataGuardChecklistComponent` floating panel slides in. The chat panel is not involved.
+
+### 3.2 The checklist panel
+
+- **Floating, draggable** — `cdkDrag` on the panel, header is the `cdkDragHandle`. Position is session-only (no localStorage persistence). `cdkDragBoundary="body"` keeps the panel inside the viewport so it can't get lost behind the toolbar.
+- **Risk-tier chip** per row (LOW / MEDIUM / HIGH / WARNING) — color coded.
+- **Default verdict per row** — `low` is pre-checked Allow; everything else (`medium` / `high` / `warning`) starts `pending` so the user makes an explicit call. +- **Per-row controls**: checkbox to allow, "Skip" button to deny, "Always do this" remember toggle (hidden for `high` and `warning`, and hidden whenever there's no proposal at all). +- **Bulk actions**: "Fix all" / "Skip all" buttons in the row header. +- **Apply button**: `Fix N & run` posts the batch to `/dataguard/apply-batch`, writes the cleaned data back as a new dataset version, repoints the operator, and auto-runs the workflow. **After Apply succeeds, the panel automatically re-scans the new dataset version** so the user sees real residue (or "all clean") instead of stale entries from before the fix. +- **"Scan again"** footer button (visible when state is `done` or `error`) re-runs DataGuard on the *current* dataset version — supports iterative cleanup (v1 → Apply → v2 → Scan again → Apply → v3 → …). + +### 3.3 Locate this issue → result panel + +Each row's `In column X · affects N row(s)` line is a clickable **📍 locate** button. Clicking it: + +1. Highlights the source `CSVFileScan` operator on the graph. +2. Opens / focuses the Result Panel for that operator. +3. Navigates to the page containing the next affected row and flashes the cell. + +The button **cycles** — every click advances a per-row cursor through `affectedRowIndices`, wrapping back to the first after the last. Each row owns its own cursor (a `Map` on the checklist component), so toggling verdicts elsewhere doesn't reset it. The tooltip previews the next position (`Show next affected row (i of N)`). + +Plumbing: `DataGuardRowNavigatorService` is a `ReplaySubject(1, 500ms)`. The 500 ms buffer covers the cold-mount case where the user clicks locate while the Result Panel is collapsed and `ResultTableFrameComponent` is async-instantiated. 
`ResultTableFrameComponent` subscribes, paginates (via an internal `pageRendered$ Subject` so the flash lands AFTER the new page renders — not after a 100 ms guess), and applies `dg-row-highlight` + `dg-cell-highlight` for 2 s (matched to the SCSS pulse animation). Cross-operator races, viewport resize during page swap, columnDef-vs-header naming drift, and stale closures on a destroyed view are all explicitly guarded. + +### 3.4 Toolbar shield + floating icon + +A `🛡` button in the workflow toolbar toggles DataGuard per-workflow (persisted in localStorage). When the panel is closed (`state === "idle"`) but the shield is ON, a small floating DataGuard icon appears on the canvas. Clicking it always triggers a fresh scan of whatever dataset operator is on the canvas (via `DataGuardAutoTriggerService.rescanAny()`). + +Resolution order for "what to scan?": +1. The previously-scanned operator if it still exists on the canvas. +2. Otherwise the first dataset-reading operator on the canvas. +3. Otherwise warn the user: "drop a dataset operator first." + +**Concurrency control.** The pipeline is gated by `currentPipeline: Promise | null` instead of a boolean. Two regimes: +- **Auto-trigger** (`userInitiated: false`, driven by operator-add / property-change debounce): if the slot is occupied, **drop silently** to preserve the original spam-suppression semantics. +- **User-initiated** (`userInitiated: true`, the floater click or the panel's Scan-again button): if the slot is occupied, **await** the in-flight pipeline (with a "queuing your scan…" toast) then start a fresh one. The panel state flips to `"scanning"` immediately so the user never sees a dead click. At most one `/scan` POST is in flight at a time. + +This means rescan works at any time, even after the user explicitly closed the panel — `state.sourceOperatorId` being gone after `reset()` doesn't break the flow, and a click during a slow scan doesn't double-fire LLM cost. + +--- + +## 4. 
System architecture + +``` +┌───────────────────────────┐ ┌──────────────────────────────────────┐ +│ Texera frontend (Angular) │ chat │ agent-service (Bun / TS) │ +│ │◄─── ws ──┤ │ +│ ┌─────────────────────┐ │ │ TexeraAgent (DataGuard host) │ +│ │ DataGuard checklist │ │ │ ├── DataGuardSession │ +│ │ + floating reopen │ │ REST │ │ • dataset (in-memory) │ +│ │ + toolbar shield │◄──┼──────────┤ │ • issues / proposals / log │ +│ └─────────────────────┘ │ │ │ • auto-allow rules │ +│ ┌─────────────────────┐ │ │ ├── Tools: │ +│ │ Auto-trigger │ │ │ │ • profile_dataset (read-only) │ +│ │ service │ │ │ │ • suggest_fix (read-only) │ +│ │ + DataGuard results │ │ │ │ • apply_fix (mutating) │ +│ │ state │ │ │ │ • write_decision_log │ +│ └─────────────────────┘ │ │ │ • bias_check │ +└───────────────────────────┘ │ └── LLM gateway (LiteLLM) │ + └──────────────┬───────────────────────┘ + │ + ┌───────▼────────┐ + │ Texera storage │ + │ (dataset/ │ + │ file-service) │ + └────────────────┘ +``` + +**Two entry points to the same backend:** + +- **REST (the primary path):** `/scan` and `/apply-batch` are server-driven — no LLM in the loop during the user's interaction. The LLM is invoked exactly once per issue inside `/scan`, to render a structured `FixProposal` from the raw `DataQualityIssue`. This keeps the user's flow deterministic and fast. +- **WS (the chat path):** `` still works if the agent is prompted via chat and the ReAct loop reaches `apply_fix` for a mutating action. The shape is `{type: "decision", stepId, verdict, remember?}`. This path is not used by the auto-trigger but is kept for any future chat-driven DataGuard flow. + +--- + +## 5. 
Detectors (five categories) + +`profile_dataset` runs entirely without an LLM and emits `DataQualityIssue` records of five types: + +| Type | Detection | +|---|---| +| `missing_value` | `null` / empty / configured missing tokens (default: `na`, `n/a`, `null`, `none`, `nan` — case-insensitive, whitespace-trimmed) | +| `placeholder_value` | Numeric sentinels (`999`, `-1`) or string sentinels (`unknown`, `Unknown`) — overridable via `placeholderValues` | +| `duplicate_id` | Honors `idColumn` hint; **falls back to a column-name heuristic** (`id`, `*_id`, `*Id`, `id_*`, `uid`) when no hint is supplied so the auto-trigger's empty-body `/scan` still catches duplicates. Flags repeated IDs (with or without conflicting labels) | +| `outlier` | Requires `validRanges` hint per column (`{min, max}`); flags numeric values outside the range. Skips rows already flagged as placeholders to avoid double-counting | +| `inconsistent_label` | Low-cardinality string columns where `trim+lowercase` keys collide on multiple raw spellings (e.g., `Male` / `male` / `MALE`); picks the most-frequent spelling as canonical | + +> **Note on outlier semantics.** An earlier z-score based detector (flag anything with `|z| > 3`) was deliberately removed: clustered legitimate extremes (e.g. sustained high glucose readings in a clinical dataset) were being flagged en masse and the user had no good way to tell the agent "those are real". The current `outlier` detector requires the caller to opt in by stating a hard range per column — the user owns the definition. + +The `missing` detector is centralized in `missing-detection.ts` — `profile_dataset` and `apply_fix` (impute) share it, so what the profiler flags is exactly what imputation treats as missing. + +--- + +## 6. Risk tiers (four levels) + +`suggest_fix` annotates every proposal with a `RiskTier`. 
The tier governs both the UI (pre-check / badge color / "remember" availability) and the permission gate: + +| Tier | Color | Default checkbox | "Allow & remember"? | Use case | +|---|---|---|---|---| +| `low` | green | pre-checked Allow | yes | Trim whitespace, standardize column names, drop fully empty rows | +| `medium` | yellow | pending (unchecked) | yes | Impute missing values, standardize inconsistent labels | +| `high` | red | pending (unchecked) | **no** | Drop rows, resolve conflicting duplicate IDs | +| `warning` | orange | pending (unchecked) | **no** | Concrete fix exists but the agent specifically wants a human to eyeball it — e.g. clamping an outlier that might be a real extreme value | + +The `warning` tier was introduced to replace the earlier `flag` operation kind (which was a no-op against the data and just recorded row indices on the session). Every proposal now produces a real concrete change to the data; "please review this one manually" is conveyed through the warning tier instead of a no-op operation. This also fixes a downstream LakeFS bug: when every "applied" fix is a real mutation, the exported CSV genuinely differs from the source and the version-create commit succeeds. + +`profile_dataset` default tiers: + +| Issue type | Default tier | +|---|---| +| `missing_value` | medium | +| `placeholder_value` | medium | +| `inconsistent_label` | medium | +| `duplicate_id` | high | +| `outlier` | **warning** | + +`suggest_fix` is allowed to override the default with a strong reason; the LLM prompt explicitly instructs it to prefer clamping via `replace_value` over destructive `drop_rows`, and to set `riskTier="warning"` when the user really should eyeball the fix. + +--- + +## 7. Fix operation kinds (six) + +`FixOperationKind` is the closed enum of mutations `apply_fix` knows how to execute: + +| Kind | Params | Effect | +|---|---|---| +| `replace_value` | `{column, replacement, rowIndices?, match?}` | Swap cells. 
Two targeting modes: `rowIndices` (deterministic, used by outlier proposals) wins when both are present. `match` (value-based) is for cases like "replace every `unknown` with null". `rowIndices` was added because LLM-generated `match` values for numeric outliers silently no-op'd when the LLM rounded the cell value (e.g. `match: 950` vs cell `949.7`), producing byte-identical exports that LakeFS rejected. | +| `drop_rows` | `{rowIndices: number[]}` | Remove rows by index | +| `impute` | `{column, strategy: "mean" \| "median" \| "mode"}` | Fill missing cells; honors session `missingTokens` override | +| `standardize` | `{column, mapping: {from: to}}` | Replace cell values via explicit mapping | +| `trim_whitespace` | `{column}` | Strip leading/trailing whitespace | +| `rename_column` | `{from, to}` | Rename column and rewrite per-row keys | + +`apply_fix` is a **pure function** — it never mutates the input `DatasetView`, just returns a new one. Optional `ApplyOptions.missingTokens` is threaded through from the session's scan options so `impute` treats the same set of cells as missing that the profiler flagged. + +--- + +## 8. Permission model + +Every mutating tool call passes through `requestApproval(gateway, proposal)`: + +``` +verdict resolution: + if riskTier is "high" or "warning": + always prompt the user + else if issueType has an autoAllowRule in the session: + return auto_allow_remembered + else if riskTier is "low": + return auto_allow_low_risk + else: + emit pendingApproval step, wait for user decision via WS +``` + +The `Verdict` union: `"allow" | "deny" | "auto_allow_low_risk" | "auto_allow_remembered"`. **`"modify"` was deliberately cut** — the legacy handler recorded a user-supplied free-text override but still executed the original `operationParams`, which silently lied to users. Modify will return only with a real natural-language → operationParams parser (post-MVP). 
+
+### 8.1 Contract enforcement
+
+The HTTP and WS handlers strictly enforce this contract:
+
+- `/apply-batch` body schema: `verdict: t.Union([t.Literal("allow"), t.Literal("deny")])` with `additionalProperties: false` on each decision object. Unknown fields (e.g., legacy `modifiedAction`) cause a typebox validation rejection.
+- The Elysia app is built with `normalize: false` so unknown fields aren't silently stripped before validation hits.
+- A global `onError` handler converts `code === "VALIDATION"` to HTTP 400 instead of the default 500.
+- A runtime check on `/apply-batch` rejects `{verdict: "deny", remember: true}` with a friendly 400 — `remember` only applies when the user approves a fix.
+- The WS `decision` handler narrows the same way: `verdict?: "allow" | "deny"` only, `modifiedAction` removed, and the handler explicitly rejects an invalid verdict and `deny+remember=true` with an error message.
+
+---
+
+## 9. Decision log
+
+Every approved or denied action is appended to a structured per-session log. The log serializes to RFC-4180 CSV with the 9-column schema:
+
+```
+decision_id, timestamp, issue_type, target_rows, proposed_action,
+user_decision, reason, confidence, applied_at
+```
+
+`applied_at` is empty for denied entries. The CSV is exported by the `write_decision_log` tool (LLM-invocable) or read directly from `session.getDecisionLog()`.
+
+The `modified_action` column was cut alongside the `"modify"` verdict.
+
+---
+
+## 10. Backend API surface
+
+All under `${API_PREFIX}/agents/:id`:
+
+| Method + Path | Purpose |
+|---|---|
+| `POST /dataguard/dataset` | Load `{columns: string[], rows: Record<string, unknown>[]}` into the agent's `DataGuardSession`. Resets per-run state (issues, proposals, decision log). |
+| `POST /dataguard/scan` | Run `profile_dataset` then `suggest_fix` per issue. Body: optional `{idColumn?, validRanges?, placeholderValues?, missingTokens?}`. Persists scan options on the session for the verification re-scan. 
Returns `{issueCount, issues, proposals}`. | +| `POST /dataguard/apply-batch` | Apply the user's checked decisions. Body: `{decisions: [{issueId, verdict: "allow"|"deny", remember?}]}`. Runs `apply_fix` per allowed proposal, records every decision (including denies). Re-profiles the cleaned dataset and returns `{applied, denied, failed, datasetRowCount, results, residualIssues, residualCount}`. | +| `GET /dataguard/export-csv` | Return the in-memory cleaned dataset as a CSV blob. Used by the frontend to push the new version back to the source dataset. | +| `GET /dataguard/session` | Inspect session state (issue list, decision log, auto-allow rules). | +| `WS /agents/:id/react` `{type:"decision",…}` | Resolve a pending-approval step. Used by the chat flow only; the checklist path uses `/apply-batch`. | + +--- + +## 11. Frontend components + +### 11.1 Services (DI singletons, `providedIn: 'root'`) + +| Service | Responsibility | +|---|---| +| `DataGuardAutoTriggerService` | Owns the operator-add / property-change subscription, runs the orchestration pipeline (resolve workflow → load dataset → scan → publish). Exposes `startOrchestration()`, `applyBatch(decisions)`, `rescanCurrent()`, `rescanAny()`. Concurrency-gated by `currentPipeline: Promise \| null` (see §3.4). | +| `DataGuardResultsService` | `BehaviorSubject` that drives the checklist UI. States: `idle → scanning → ready → applying → done / error`. Exposes `setState(patch)`, `updateEntry(issueId, patch)`, `reset()`. | +| `DataGuardSettingsService` | Per-workflow shield ON/OFF, persisted in `localStorage` (`dataguard.enabled.wid.`). Default ON. | +| `DataGuardRowNavigatorService` | `ReplaySubject(1, 500ms)` driving the 📍 locate flow. Includes pure helpers `pageIndexFor(rowIndex, pageSize)` and `nextCycleStep(indices, cursor)`. | +| `AgentService.sendDecision(agentId, stepId, verdict, options)` | WS sender for the chat flow's `{type:"decision",…}` message. 
|
+
+### 11.2 Components
+
+| Component | Role |
+|---|---|
+| `DataGuardChecklistComponent` (standalone) | The floating, draggable checklist panel. Subscribes to `DataGuardResultsService`, owns the orchestration subscription (so it lives whenever the checklist itself can render), renders rows / risk-tier badges / Apply button / Scan-again footer / floating reopen icon. Holds the per-row `locateCursors: Map<string, number>` driving cyclic 📍 navigation. |
+| `PermissionPromptComponent` (standalone) | The inline chat-bubble approval prompt — used by the chat-driven path in `agent-chat`. Allow / Deny / Allow-&-remember (the last is hidden for `high` and `warning`). |
+| `MenuComponent` (modified) | Toolbar 🛡 shield button — toggles `DataGuardSettingsService.isEnabled(wid)`. |
+| `ResultTableFrameComponent` (modified) | Subscribes to `DataGuardRowNavigatorService`. On a locate request, navigates the paginator to the target page and chains off an internal `pageRendered$` Subject so `applyFlash()` only fires after the new page actually renders. 2 s highlight via `HIGHLIGHT_DURATION_MS`; `ngOnDestroy` clears the timer to avoid NG0911. |
+
+The checklist component lives at `bottom: 100px; right: 80px` by default. When dragged elsewhere it stays where the user put it for the session; refreshing returns to the default anchor. The floating reopen icon occupies the same default position.
+
+---
+
+## 12. Auto-trigger dataset operator set
+
+```ts
+private static readonly DATASET_OPERATOR_TYPES = new Set([
+  "CSVFileScan",
+  "ParallelCSVFileScan",
+]);
+```
+
+CSV-only for now — `loadFromOperatorFile` pipes every blob through `Papa.parse`, so adding `JSONFileScan` / `TableFileScan` / Parquet would either crash or produce garbage rows. Per-format parsing is the obvious follow-up.
+
+---
+
+## 13. 
File map + +### 13.1 agent-service + +``` +src/types/dataguard.ts Shared types: RiskTier, Confidence, IssueType, + FixOperationKind, Verdict, DataQualityIssue, + FixProposal, PermissionDecision, DecisionLogEntry, + AutoAllowRule +src/types/dataguard.test.ts Fixture tests (literal-union shapes) +src/types/agent.ts ReActStep.pendingApproval + PendingApproval interface + +src/agent/tools/dataguard/ + dataset.ts DatasetView type + missing-detection.ts Shared isMissing / placeholderHit / toNumber + profile-dataset.ts Five-detector profiler, no LLM; + inferIdColumn helper for auto-detecting + ID columns by name pattern + suggest-fix.ts LLM-driven proposal generator, zod-validated; + prompt passes affectedRowIndices and instructs + rowIndices-based replace_value for outliers + apply-fix.ts Pure-function applier for the six op kinds. + replace_value supports rowIndices targeting; + ApplyOptions.missingTokens honored by impute + with-approval.ts Permission gate (handles low/medium/high/warning; + warning never auto-allows, even with remember rules) + dataguard-session.ts Per-agent state: dataset, issues, proposals, + decisionLog, autoAllowRules, ScanOptions + decision-log.ts 9-col CSV serializer + AI-SDK tool + (modified_action column removed by #11a) + bias-check.ts Per-group retention diff + AI-SDK tool + dataguard-tools.ts AI SDK tool({...}) definitions (5 tools) + __tests__/*.test.ts Test files — apply-fix, suggest-fix, profile-dataset, + with-approval, dataguard-session, decision-log, + bias-check, apply-batch-rescan, plus the + contract-lock files: permission-types, + apply-batch-modify-reject, decision-log-no-modify, + with-approval-no-modify + +src/agent/texera-agent.ts Implements ApprovalGateway; registers DataGuard + tools; exposes public callLlm(prompt) used by /scan +src/server.ts Elysia routes (app built with normalize:false so + additionalProperties:false on body schemas + actually rejects legacy fields): + POST /dataguard/dataset + POST /dataguard/scan + POST 
/dataguard/apply-batch (rejects "modify" + verdict, modifiedAction field, and + {verdict:"deny", remember:true}) + GET /dataguard/export-csv + GET /dataguard/session + WS decision message branch + onError: VALIDATION → 400 +``` + +### 13.2 frontend + +``` +src/app/workspace/ + service/agent/ + agent-types.ts ReActStep.pendingApproval field (mirror of backend); + riskTier includes "warning" + agent.service.ts sendDecision(agentId, stepId, verdict, {remember}) + — WS sender for the chat flow + data-guard-auto-trigger.service.ts Orchestration pipeline (scan / apply-batch / + rescanAny / rescanCurrent), debounced operator-add + + operator-property-change subscription. + Concurrency-gated via currentPipeline Promise: + user-initiated awaits in-flight, auto-trigger + drops silently. After-Apply auto-rescan. + Includes pure helper resolveRescanTarget(state, graph). + data-guard-auto-trigger.service.spec.ts Tests for resolveRescanTarget decision tree + + serialization test (no concurrent /scan) + data-guard-results.service.ts BehaviorSubject driving the UI + data-guard-results.service.spec.ts State-shape tests + data-guard-settings.service.ts Per-workflow shield ON/OFF (localStorage) + data-guard-row-navigator.service.ts ReplaySubject for 📍 locate flow; + pageIndexFor + nextCycleStep pure helpers + data-guard-row-navigator.service.spec.ts Cycle-walk + ReplaySubject TTL + negative-cursor + coercion + serialization edge cases + component/ + dataguard-checklist/ — floating draggable + panel (cdkDrag + cdkDragHandle + cdkDragBoundary), + row checklist, 📍 locate button (cyclic), + category roll-up, Scan-again, floating reopen icon + result-panel/result-table-frame/ Modified to subscribe to row-navigator and flash + cells via pageRendered$ Subject (waits for new + page render, not arbitrary timeout) + agent/agent-panel/ + permission-prompt/ — inline approval UI + used by the chat-driven path + agent-chat/ Renders inside the + step loop when pendingApproval is set + 
agent-panel.component.ts No longer owns the auto-trigger subscription — + that moved to the checklist component + menu/ Toolbar 🛡 shield button (per-workflow toggle) + workspace.component.{ts,html} Mounts +``` + +--- + +## 14. Setup + +DataGuard runs as part of the standard Texera microservices + agent-service stack, plus a local LiteLLM proxy that wraps an LLM provider with an OpenAI-compatible API. + +### 14.1 One-time setup + +```bash +# 1) API key for your LLM provider (e.g., Anthropic) — exported once +echo 'export ANTHROPIC_API_KEY=sk-ant-…your-key…' >> ~/.zshrc +source ~/.zshrc + +# 2) Python venv for LiteLLM proxy +python3.12 -m venv ~/UCI/TexeraProject/venv312 +~/UCI/TexeraProject/venv312/bin/pip install --upgrade pip +~/UCI/TexeraProject/venv312/bin/pip install 'litellm[proxy]' + +# 3) Bun for agent-service +brew install oven-sh/bun/bun + +# 4) Yarn 4 via Corepack for frontend +corepack enable + +# 5) GUI feature flag in Texera config +# Set: common/config/src/main/resources/gui.conf +# copilot-enabled = true +``` + +### 14.2 Daily startup + +```bash +# Terminal 1 — LiteLLM proxy on :4000 (OpenAI-style API over your provider) +source ~/UCI/TexeraProject/venv312/bin/activate +cd ~/UCI/TexeraProject/texera +litellm --config bin/litellm-config.yaml + +# Terminal 2 — Texera Scala microservices +# IntelliJ "texera micro services" run config, or: +# bin/single-node/docker compose up -d + +# Terminal 3 — agent-service +cd ~/UCI/TexeraProject/texera/agent-service +bun install # only if node_modules absent +bun run dev # :3001 with --watch reload + +# Terminal 4 — Frontend +cd ~/UCI/TexeraProject/texera/frontend +yarn install # only if node_modules absent +yarn start # :4200, proxies /api/* per proxy.config.json +``` + +Then open `http://localhost:4200`, sign in, open or create a workflow. + +### 14.3 End-to-end flow + +1. Confirm the 🛡 shield is ON (toolbar — twotone icon = ON, outline = OFF). +2. 
Drop a `CSVFileScan` operator and point it at any dataset in the system. +3. The checklist panel slides in. While `/scan` runs, a loading message replaces the row list (typically a few seconds — one LLM call per issue). +4. Review each row. `LOW` rows are pre-checked; `MEDIUM` / `HIGH` / `WARNING` need an explicit Allow. Click the **📍** on any row to jump to the affected row in the Result Panel — clicking again cycles to the next affected row, wrapping after the last. +5. Click **Fix N & run** — the cleaned data is written back as a new dataset version with a timestamp-suffixed name, the operator is repointed at the new version, and the workflow auto-runs. The panel then **auto-re-scans** the new version so you immediately see whether anything is still left. +6. Click **Scan again** at any time to iterate against the current version. +7. Close the panel. The floating DataGuard icon appears (if the shield is ON) — click it any time to trigger a fresh scan; the panel re-opens immediately even if another scan is in flight (queued, never concurrent). + +--- + +## 15. Testing + +```bash +cd agent-service +bun run typecheck # exit 0 +bun test # 199 pass / 0 fail (419 expect calls) + +cd frontend +npx tsc --noEmit # exit 0 +``` + +Test coverage spans: + +- Types fixtures (12) — verifies the literal unions accept and reject the right members. +- Profile (20+) — per-detector cases including the validRanges-based outlier, the explicit "clustered large readings are NOT auto-outliers" assertion, and the `inferIdColumn` heuristic across all id-name patterns (`id`, `*_id`, `*Id`, `id_*`). +- Suggest (10+) — LLM-response schema validation. +- Apply (16) — every op kind round-trips; original dataset never mutated; `replace_value` with `rowIndices` regression-locks the LakeFS "no changes detected" bug; `missingTokens` override threads through to impute. +- With-approval (7) — low/medium/high/warning gating, the `warning`-with-remembered-rule case, and the buffered-decision race. 
+- Session (8) — recordIssue/recordDecision/auto-allow lifecycle. +- Decision log (6) + decision-log-no-modify (2) — RFC-4180 CSV shape and the post-#11a 9-column schema lock. +- Apply-batch end-to-end (12+) — Modify-verdict rejection, `additionalProperties` rejection, `verdict==="deny" && remember===true` rejection, residual re-scan correctness. +- Permission-types (4) — `@ts-expect-error` locks that `"modify"` and `modifiedAction` cannot type-check anywhere. +- Frontend specs — `DataGuardRowNavigatorService` (cycle math, ReplaySubject TTL, negative-cursor coercion); `DataGuardAutoTriggerService` (resolveRescanTarget decision tree + pipeline serialization proof). + +--- + +## 16. Differentiation + +Closest UX overlap among Texera AI proposals is `mengw15`'s **UDF Copilot** (Claude-Code-style permission UX), but it operates on **code in the Monaco editor**, not data. Complementary, not competing. + +| Project | Object of AI | User | Surface | +|---|---|---|---| +| UDF Copilot | Code | Developer | Monaco editor | +| Macro Operators | Workflow structure | Workflow author | Canvas | +| Self-healing workflows | Workflow JSON | Workflow author | Canvas | +| **DataGuard** | **Data** | **Domain expert / scientist** | **Chat panel + checklist** | + +**Theme positioning:** the only proposal targeting the *Data / AI for Science* track. + +**Sibling feature: WorkflowGuard.** Applies the same permission UX to *workflow edits* (`addOperator` / `modifyOperator` / `deleteOperator`). Independent feature, shared `pendingApproval` mechanism. See `README_WorkflowGuard_Texera.md`. + +--- + +## 17. Why it matters + +Pure AI automation in data cleaning is risky: +- AI may silently remove scientifically meaningful outliers. +- AI may introduce bias by removing data from one group more than another. +- AI may misinterpret placeholder values as real values. +- AI may make irreversible transformations the user never sanctioned. 
+ +DataGuard treats the human as the **decision-maker**, not the **reviewer of a finished job**: +- AI provides suggestions, not final decisions. +- Every mutating action requires explicit authorization (or pre-authorization via "remember", but only at the user's request and only for tiers ≤ medium). +- Each decision is supported by evidence and confidence. +- Every step is logged for audit and reproducibility. +- The workflow is fully replayable from the decision log. + +**The interaction model is the contribution.** DataGuard demonstrates that Claude Code's permission-based UX — already proven for code — translates naturally to data work, and is *especially* valuable in scientific contexts where reversibility and trust matter most. + +--- + +## 18. HCI contribution + +DataGuard contributes a concrete instance of: + +``` +AI detects → AI explains with evidence → AI proposes specific action + ↓ +Human decides (Allow / Allow & remember / Deny) + ↓ +System applies (only if approved) → System records → Continue +``` + +Relevant concepts: +- Permission-based AI agency (Claude Code, MCP tool authorization). +- Trust calibration through evidence + confidence display. +- Risk-tiered auto-apply (low-risk transparency vs. high-risk gating, plus the `warning` tier for "concrete fix, but verify"). +- Decision provenance and audit trail. +- Reproducibility via replayable decision logs. +- Mixed-initiative interaction with the human always at the final boundary. + +--- + +## 19. Post-MVP follow-ups + +- **JSON / Parquet operator support.** Auto-trigger is intentionally narrowed to `CSVFileScan` + `ParallelCSVFileScan`. Adding `JSONFileScan` / `TableFileScan` / Parquet needs `loadFromOperatorFile` to branch by suffix instead of force-`Papa.parse`-ing every blob. +- **Modify verdict.** Currently cut. Returns only with a real natural-language → `operationParams` parser; the legacy "modify" recorded a free-text override but executed the original params, which silently lied. 
+- **Iceberg-backed decision log.** Via the existing Lakekeeper integration. Currently CSV. +- **`run_cleaning_workflow` tool.** Distributed cleaning by delegating to a Texera workflow for datasets that don't fit in memory. +- **`--replay decision_log.csv`.** Reproduce a cleaned dataset from a saved log without LLM calls. +- **System-prompt switch when DataGuard is active.** If we want the chat path back, the agent's system prompt should temporarily become DataGuard-focused (currently workflow-centric). +- **Disabled tools per agent.** Pass `disabledTools: ["addOperator", …]` when the auto-trigger creates an agent, so even an accidental chat doesn't risk workflow mutation. +- **Bias-check banner in the panel.** Currently `bias_check.ts` produces structured output but only the chat path consumes it. +- **Persist drag position across refresh.** Today it resets to bottom-right on reload. +- **`pageRendered$` integration cleanup.** The result-table-frame change exposes a private completion signal; a small refactor could publish it as a public `Observable` so other consumers (e.g. a "scroll to row N" feature) can subscribe. diff --git a/agent-service/demo/README.md b/agent-service/demo/README.md index 968987a0703..e148521626f 100644 --- a/agent-service/demo/README.md +++ b/agent-service/demo/README.md @@ -14,7 +14,7 @@ column (`diabetic_outcome`) is the label. 
| 4 | `missing_value` | S005, S007, S009, S014 | empty `glucose` in Group A (imbalanced) | | 5 | `missing_value` | S045, S046, S048 | `age = "N/A"`, `glucose = " "`, `glucose = "null"` | | 6 | `duplicate_id` | S001, S017 | sample_id repeats with conflicting `diabetic_outcome` | -| 7 | `out_of_range` | S041, S042, S044 | `bmi > 60` (clinical max ~70 — possibly real, possibly error) | +| 7 | `outlier` | S041, S042, S044 | `bmi > 60` outside user-supplied `validRanges` (possible real extreme — flagged as `warning` tier) | ## How to load into DataGuard @@ -28,13 +28,43 @@ curl -X POST http://localhost:8000/api/agents//dataguard/dataset \ }')" ``` -Or via the demo script (Step 13's frontend auto-trigger handles this in the -real flow — once a `CSVFileScan` operator is added that references this file, -DataGuard auto-launches). +The frontend auto-trigger handles this in the real flow — once a `CSVFileScan` +operator is added that references this file, DataGuard auto-launches. + +## Single-category demo files + +For testing one detector at a time, each of these CSVs concentrates pollution +in a single category so it's obvious which detector is firing: + +| File | Issue category | What's wrong | +|---|---|---| +| `missing_values_demo.csv` | `missing_value` | empty / `N/A` / `NA` / `null` cells across multiple columns | +| `placeholder_values_demo.csv` | `placeholder_value` | `999`, `-1`, `Unknown` / `unknown` sentinels | +| `duplicate_rows_demo.csv` | `duplicate_id` | repeated `sample_id`s, some with conflicting outcomes | +| `outliers_demo.csv` | `outlier` | negative ages, BMI > 200, blood pressure > 250 — fires only when `validRanges` is supplied at scan time | +| `inconsistent_labels_demo.csv` | `inconsistent_label` | `Male` / `male` / `MALE` and `Female` / `female` / `FEMALE` mixed | + +The `outliers_demo.csv` requires `validRanges` to be set when scanning (the +profiler does not auto-detect numerical outliers via z-score — that variant was +removed because it flagged 
legitimate clustered extremes as errors). The other +four fire on default scan options. + +Suggested `validRanges` for the outlier demo: + +```json +{ + "age": { "min": 0, "max": 120 }, + "bmi": { "min": 10, "max": 60 }, + "blood_pressure": { "min": 40, "max": 200 } +} +``` ## Bias-check expectation Group A: 22 rows. Group B: 23 rows. After cleaning, missingness imbalance (more empties in A) means naive imputation drops ~18% of A but only ~4% of B -— DataGuard surfaces this and recommends `flag` instead of `impute` for the -missing-glucose issue (the §5 storyboard "Modify" beat). +— DataGuard surfaces this and proposes a `replace_value` fix tagged with +`riskTier: "warning"` so the user explicitly confirms instead of letting +imputation run silently. The earlier `flag` operation kind was removed — +every fix is now a concrete change, and "please review manually" is conveyed +through the warning tier instead. diff --git a/agent-service/demo/duplicate_rows_demo.csv b/agent-service/demo/duplicate_rows_demo.csv new file mode 100644 index 00000000000..99bc4f4586a --- /dev/null +++ b/agent-service/demo/duplicate_rows_demo.csv @@ -0,0 +1,31 @@ +sample_id,age,glucose,bmi,blood_pressure,group,diabetic_outcome +S001,45,110,28.1,80,A,0 +S002,52,140,30.5,85,A,1 +S003,38,95,24.0,70,A,0 +S004,46,130,29.8,82,A,1 +S005,41,115,27.5,78,A,0 +S006,47,125,28.0,80,A,1 +S007,55,132,30.0,85,A,1 +S008,49,118,28.9,82,A,0 +S009,42,114,26.7,75,A,0 +S010,53,135,29.1,88,A,1 +S001,45,110,28.1,80,A,0 +S002,52,142,30.5,85,A,1 +S007,55,132,30.0,85,A,0 +S011,46,108,27.3,79,A,0 +S012,44,142,30.8,87,A,1 +S013,43,112,26.4,76,A,0 +S014,51,128,29.5,84,A,1 +S015,48,120,28.6,81,A,0 +S016,40,98,25.2,72,A,0 +S017,57,148,32.7,92,A,1 +S017,57,148,32.7,92,A,1 +S018,44,115,27.8,79,A,0 +S019,50,128,30.0,85,A,1 +S020,39,102,24.9,73,A,0 +S004,46,130,29.8,82,A,1 +S021,46,113,28.4,80,B,0 +S022,54,138,31.5,88,B,1 +S023,42,108,27.0,77,B,0 +S024,48,122,29.3,82,B,1 +S010,53,135,29.1,88,A,0 diff --git 
a/agent-service/demo/inconsistent_labels_demo.csv b/agent-service/demo/inconsistent_labels_demo.csv new file mode 100644 index 00000000000..bc8759a93c5 --- /dev/null +++ b/agent-service/demo/inconsistent_labels_demo.csv @@ -0,0 +1,31 @@ +sample_id,age,glucose,bmi,gender,group,diabetic_outcome +S001,45,110,28.1,Male,A,0 +S002,52,140,30.5,Female,A,1 +S003,38,95,24.0,male,A,0 +S004,46,130,29.8,Female,A,1 +S005,41,115,27.5,MALE,A,0 +S006,47,125,28.0,female,A,1 +S007,55,132,30.0,Male,A,1 +S008,49,118,28.9,Female,A,0 +S009,42,114,26.7,Male,A,0 +S010,53,135,29.1,FEMALE,A,1 +S011,46,108,27.3,Male,A,0 +S012,44,142,30.8,Female,A,1 +S013,43,112,26.4,male,A,0 +S014,51,128,29.5,Female,A,1 +S015,48,120,28.6,Male,A,0 +S016,40,98,25.2,Female,A,0 +S017,57,148,32.7,Male,A,1 +S018,44,115,27.8,Female,A,0 +S019,50,128,30.0,Male,A,1 +S020,39,102,24.9,Female,A,0 +S021,46,113,28.4,Male,B,0 +S022,54,138,31.5,Female,B,1 +S023,42,108,27.0,Male,B,0 +S024,48,122,29.3,female,B,1 +S025,45,116,28.7,Male,B,0 +S026,52,134,30.6,Female,B,1 +S027,41,105,25.8,Male,B,0 +S028,49,124,29.0,Female,B,1 +S029,47,118,28.2,male,B,0 +S030,43,110,26.9,Female,B,0 diff --git a/agent-service/demo/missing_values_demo.csv b/agent-service/demo/missing_values_demo.csv new file mode 100644 index 00000000000..07d1cfab2a0 --- /dev/null +++ b/agent-service/demo/missing_values_demo.csv @@ -0,0 +1,31 @@ +sample_id,age,glucose,bmi,blood_pressure,group,diabetic_outcome +S001,45,110,28.1,80,A,0 +S002,52,140,30.5,85,A,1 +S003,38,95,24.0,70,A,0 +S004,,130,29.8,82,A,1 +S005,41,,27.5,78,A,0 +S006,47,125,,80,A,1 +S007,55,,33.0,90,A,1 +S008,49,118,28.9,,A,0 +S009,42,,26.7,75,A,0 +S010,53,135,29.1,88,A,1 +S011,46,108,27.3,79,A,0 +S012,N/A,142,30.8,87,A,1 +S013,43,112,26.4,76,A,0 +S014,51,NA,29.5,84,A,1 +S015,48,120,28.6,81,A,0 +S016,40,98,25.2,72,A,0 +S017,57,148,32.7,92,A,1 +S018,44,115,27.8,79,A,0 +S019,50,128,30.0,85,A,1 +S020,39,102,24.9,73,A,0 +S021,46,113,28.4,80,B,0 +S022,54,138,31.5,88,B,1 +S023,42,108,27.0,77,B,0 
+S024,48,122,29.3,82,B,1 +S025,45,116,28.7,79,B,0 +S026,null,134,30.6,86,B,1 +S027,41,105,25.8,74,B,0 +S028,49,124,29.0,83,B,1 +S029,47,118,28.2,81,B,0 +S030,43,110,26.9,76,B,0 diff --git a/agent-service/demo/outliers_demo.csv b/agent-service/demo/outliers_demo.csv new file mode 100644 index 00000000000..2ddd4320679 --- /dev/null +++ b/agent-service/demo/outliers_demo.csv @@ -0,0 +1,31 @@ +sample_id,age,glucose,bmi,blood_pressure,group,diabetic_outcome +S001,45,110,28.1,80,A,0 +S002,52,140,30.5,85,A,1 +S003,38,95,24.0,70,A,0 +S004,200,130,29.8,82,A,1 +S005,41,115,27.5,78,A,0 +S006,-5,125,28.0,80,A,1 +S007,55,132,300,85,A,1 +S008,49,118,28.9,82,A,0 +S009,42,114,26.7,250,A,0 +S010,53,135,29.1,88,A,1 +S011,46,108,27.3,79,A,0 +S012,44,142,30.8,87,A,1 +S013,43,112,26.4,76,A,0 +S014,51,128,29.5,84,A,1 +S015,180,120,28.6,81,A,0 +S016,40,98,25.2,72,A,0 +S017,57,148,500,92,A,1 +S018,44,115,27.8,79,A,0 +S019,50,128,30.0,85,A,1 +S020,39,102,24.9,73,A,0 +S021,46,113,28.4,80,B,0 +S022,54,138,31.5,300,B,1 +S023,42,108,27.0,77,B,0 +S024,48,122,29.3,82,B,1 +S025,45,116,28.7,79,B,0 +S026,52,134,30.6,86,B,1 +S027,-2,105,25.8,74,B,0 +S028,49,124,29.0,83,B,1 +S029,47,118,28.2,81,B,0 +S030,43,110,26.9,76,B,0 diff --git a/agent-service/demo/placeholder_values_demo.csv b/agent-service/demo/placeholder_values_demo.csv new file mode 100644 index 00000000000..00d16b50df4 --- /dev/null +++ b/agent-service/demo/placeholder_values_demo.csv @@ -0,0 +1,31 @@ +sample_id,age,glucose,bmi,blood_pressure,group,diabetic_outcome +S001,45,110,28.1,80,A,0 +S002,52,140,30.5,85,A,1 +S003,38,95,24.0,70,A,0 +S004,999,130,29.8,82,A,1 +S005,41,999,27.5,78,A,0 +S006,47,125,28.0,80,A,1 +S007,Unknown,128,30.0,85,A,1 +S008,49,118,28.9,82,A,0 +S009,42,114,26.7,75,A,0 +S010,53,135,29.1,88,A,1 +S011,46,108,27.3,79,A,0 +S012,999,142,30.8,87,A,1 +S013,43,112,26.4,76,A,0 +S014,51,unknown,29.5,84,A,1 +S015,48,120,28.6,81,A,0 +S016,40,98,25.2,72,A,0 +S017,57,148,32.7,92,A,1 +S018,44,115,27.8,79,A,0 
+S019,50,128,30.0,85,A,1 +S020,39,102,24.9,73,A,0 +S021,46,113,-1,80,B,0 +S022,54,138,31.5,88,B,1 +S023,42,108,27.0,77,B,0 +S024,48,122,-1,82,B,1 +S025,45,116,28.7,-1,B,0 +S026,52,134,30.6,86,B,1 +S027,Unknown,105,25.8,74,B,0 +S028,49,124,29.0,83,B,1 +S029,47,118,28.2,81,B,0 +S030,43,110,26.9,76,B,0 diff --git a/agent-service/src/agent/texera-agent.ts b/agent-service/src/agent/texera-agent.ts index f70cbadc358..f2bc6a2736a 100644 --- a/agent-service/src/agent/texera-agent.ts +++ b/agent-service/src/agent/texera-agent.ts @@ -272,6 +272,21 @@ export class TexeraAgent implements ApprovalGateway { return this.state; } + /** + * Stateless LLM call used by DataGuard's server-driven /scan endpoint. + * Bypasses the ReAct loop (no tool calls, no step recording) — just sends + * a prompt to the configured model and returns its text response. Used to + * generate FixProposals server-side without going through the chat panel. + */ + public async callLlm(prompt: string): Promise { + const result = await generateText({ + model: this.model, + prompt, + temperature: 0.2, + }); + return result.text; + } + getWorkflowState(): WorkflowState { return this.workflowState; } diff --git a/agent-service/src/agent/tools/dataguard/__tests__/apply-batch-modify-reject.test.ts b/agent-service/src/agent/tools/dataguard/__tests__/apply-batch-modify-reject.test.ts new file mode 100644 index 00000000000..f5bfec93c4f --- /dev/null +++ b/agent-service/src/agent/tools/dataguard/__tests__/apply-batch-modify-reject.test.ts @@ -0,0 +1,156 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +// Contract tests for cutting the Modify verdict (task #11a / #15) and for +// the remember-flag scope rule (task #12). The HTTP body schema on +// POST /api/agents/:id/dataguard/apply-batch must: +// +// • reject verdict: "modify" +// • reject the modifiedAction field (no longer part of the contract) +// • reject { verdict: "deny", remember: true } — remember is only meaningful for "allow" +// • still accept { verdict: "allow", remember: true } and { verdict: "deny" } +// +// All assertions are at the schema layer (Elysia body validation runs before +// the handler), so we don't need a real loaded dataset or LLM-derived +// proposals to exercise them. 
+ +import { beforeEach, describe, expect, test } from "bun:test"; +import { buildApp, _resetAgentStoreForTests } from "../../../../server"; +import { env } from "../../../../config/env"; + +const API = env.API_PREFIX; +const app = buildApp(); + +function url(path: string): string { + return `http://localhost${path}`; +} + +async function postJson(path: string, body: unknown): Promise<Response> { + return app.handle( + new Request(url(path), { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify(body), + }) + ); +} + +async function readJson<T>(res: Response): Promise<T> { + return (await res.json()) as T; +} + +async function createAgent(): Promise<string> { + const res = await postJson(`${API}/agents`, { modelType: "test-model" }); + const body = await readJson<{ id: string }>(res); + return body.id; +} + +beforeEach(() => { + _resetAgentStoreForTests(); +}); + +describe(`POST ${API}/agents/:id/dataguard/apply-batch — Modify verdict cut (#11a)`, () => { + test("rejects verdict: \"modify\" with a 4xx body-schema error", async () => { + const id = await createAgent(); + const res = await postJson(`${API}/agents/${id}/dataguard/apply-batch`, { + decisions: [{ issueId: "iss-1", verdict: "modify" }], + }); + expect(res.status).toBeGreaterThanOrEqual(400); + expect(res.status).toBeLessThan(500); + }); + + test("rejects unknown field `modifiedAction` on a decision entry", async () => { + const id = await createAgent(); + const res = await postJson(`${API}/agents/${id}/dataguard/apply-batch`, { + decisions: [ + { issueId: "iss-1", verdict: "allow", modifiedAction: "Flag instead of replace" }, + ], + }); + expect(res.status).toBeGreaterThanOrEqual(400); + expect(res.status).toBeLessThan(500); + }); + + test("still accepts verdict: \"allow\" (baseline — parity check that the cut didn't over-reach)", async () => { + const id = await createAgent(); + const res = await postJson(`${API}/agents/${id}/dataguard/apply-batch`, { + decisions: [{ issueId: 
"iss-not-loaded", verdict: "allow" }], + }); + // No proposal recorded for this issueId, so the handler returns 200 with + // a per-result error string — the SCHEMA accepted the body, which is the + // point of this test. + expect(res.status).toBe(200); + }); + + test("still accepts verdict: \"deny\" (baseline)", async () => { + const id = await createAgent(); + const res = await postJson(`${API}/agents/${id}/dataguard/apply-batch`, { + decisions: [{ issueId: "iss-not-loaded", verdict: "deny" }], + }); + expect(res.status).toBe(200); + }); + + test("rejects a mixed batch where ANY decision uses verdict: \"modify\"", async () => { + const id = await createAgent(); + const res = await postJson(`${API}/agents/${id}/dataguard/apply-batch`, { + decisions: [ + { issueId: "iss-1", verdict: "allow" }, + { issueId: "iss-2", verdict: "modify" }, + { issueId: "iss-3", verdict: "deny" }, + ], + }); + expect(res.status).toBeGreaterThanOrEqual(400); + expect(res.status).toBeLessThan(500); + }); +}); + +describe(`POST ${API}/agents/:id/dataguard/apply-batch — remember flag scope (#12)`, () => { + test("rejects { verdict: \"deny\", remember: true } — remember only applies to allow", async () => { + const id = await createAgent(); + const res = await postJson(`${API}/agents/${id}/dataguard/apply-batch`, { + decisions: [{ issueId: "iss-1", verdict: "deny", remember: true }], + }); + expect(res.status).toBeGreaterThanOrEqual(400); + expect(res.status).toBeLessThan(500); + }); + + test("accepts { verdict: \"allow\", remember: true } (baseline)", async () => { + const id = await createAgent(); + const res = await postJson(`${API}/agents/${id}/dataguard/apply-batch`, { + decisions: [{ issueId: "iss-not-loaded", verdict: "allow", remember: true }], + }); + // Same as above — handler can't find the proposal but the schema accepted. 
+ expect(res.status).toBe(200); + }); + + test("accepts { verdict: \"deny\", remember: false } — only `remember: true` + deny is the forbidden combo", async () => { + const id = await createAgent(); + const res = await postJson(`${API}/agents/${id}/dataguard/apply-batch`, { + decisions: [{ issueId: "iss-not-loaded", verdict: "deny", remember: false }], + }); + expect(res.status).toBe(200); + }); + + test("accepts { verdict: \"deny\" } with `remember` omitted entirely", async () => { + const id = await createAgent(); + const res = await postJson(`${API}/agents/${id}/dataguard/apply-batch`, { + decisions: [{ issueId: "iss-not-loaded", verdict: "deny" }], + }); + expect(res.status).toBe(200); + }); +}); diff --git a/agent-service/src/agent/tools/dataguard/__tests__/apply-batch-rescan.test.ts b/agent-service/src/agent/tools/dataguard/__tests__/apply-batch-rescan.test.ts new file mode 100644 index 00000000000..22461c0d799 --- /dev/null +++ b/agent-service/src/agent/tools/dataguard/__tests__/apply-batch-rescan.test.ts @@ -0,0 +1,111 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +// Verification re-scan contract: POST /apply-batch must re-run profileDataset +// on the cleaned dataset and return residualIssues. The UI uses this to show +// users honest residue instead of silently claiming success. + +import { beforeEach, describe, expect, test } from "bun:test"; +import { buildApp, _resetAgentStoreForTests, _getAgentForTests } from "../../../../server"; +import { env } from "../../../../config/env"; +import type { DataQualityIssue } from "../../../../types/dataguard"; + +const API = env.API_PREFIX; +const app = buildApp(); + +function url(path: string): string { + return `http://localhost${path}`; +} + +async function postJson(path: string, body: unknown): Promise<Response> { + return app.handle( + new Request(url(path), { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify(body), + }) + ); +} + +async function readJson<T>(res: Response): Promise<T> { + return (await res.json()) as T; +} + +async function createAgent(): Promise<string> { + const res = await postJson(`${API}/agents`, { modelType: "test-model" }); + const body = await readJson<{ id: string }>(res); + return body.id; +} + +beforeEach(() => { + _resetAgentStoreForTests(); +}); + +describe(`POST ${API}/agents/:id/dataguard/apply-batch — verification re-scan`, () => { + test("response includes residualIssues + residualCount fields", async () => { + const id = await createAgent(); + const agent = _getAgentForTests(id)!; + agent.getDataGuardSession().setDataset({ + columns: ["x"], + rows: [{ x: 1 }], + }); + const res = await postJson(`${API}/agents/${id}/dataguard/apply-batch`, { + decisions: [], + }); + expect(res.status).toBe(200); + const body = await readJson<{ residualIssues: unknown; residualCount: unknown }>(res); + expect(Array.isArray(body.residualIssues)).toBe(true); + expect(typeof body.residualCount).toBe("number"); + }); + + test("residualIssues empty when cleaned dataset has nothing left to flag", async () => { + const id = await createAgent(); 
+ const agent = _getAgentForTests(id)!; + const session = agent.getDataGuardSession(); + // Pristine dataset → no proposals to apply, profiler finds nothing. + session.setDataset({ + columns: ["age"], + rows: [{ age: 30 }, { age: 40 }, { age: 50 }], + }); + const res = await postJson(`${API}/agents/${id}/dataguard/apply-batch`, { + decisions: [], + }); + const body = await readJson<{ residualCount: number; residualIssues: DataQualityIssue[] }>(res); + expect(body.residualCount).toBe(0); + expect(body.residualIssues).toEqual([]); + }); + + test("residualIssues surfaces leftovers when proposals leave data dirty", async () => { + const id = await createAgent(); + const agent = _getAgentForTests(id)!; + const session = agent.getDataGuardSession(); + // Dataset with a placeholder "999" the user denied — re-scan should still + // flag it because nothing was fixed. + session.setDataset({ + columns: ["age"], + rows: [{ age: 30 }, { age: 999 }, { age: 40 }], + }); + const res = await postJson(`${API}/agents/${id}/dataguard/apply-batch`, { + decisions: [], + }); + const body = await readJson<{ residualCount: number; residualIssues: DataQualityIssue[] }>(res); + expect(body.residualCount).toBeGreaterThan(0); + expect(body.residualIssues.some(i => i.issueType === "placeholder_value")).toBe(true); + }); +}); diff --git a/agent-service/src/agent/tools/dataguard/__tests__/apply-fix.test.ts b/agent-service/src/agent/tools/dataguard/__tests__/apply-fix.test.ts index 2f7e706b380..e222b382de0 100644 --- a/agent-service/src/agent/tools/dataguard/__tests__/apply-fix.test.ts +++ b/agent-service/src/agent/tools/dataguard/__tests__/apply-fix.test.ts @@ -57,6 +57,54 @@ describe("applyFix", () => { expect(result.dataset.rows[3].age).toBe(null); }); + test("replace_value with rowIndices: targets rows by index, ignores cell value", () => { + // Regression for the "No changes detected in dataset" LakeFS error: when + // the LLM proposed `match` that didn't equal the cell exactly (rounding, + 
// string-vs-number, etc.), cellEquals silently no-op'd and the exported + // CSV was byte-identical → version commit aborted. rowIndices is the + // deterministic escape hatch used by outlier proposals. + const ds: DatasetView = { + columns: ["glucose"], + rows: [{ glucose: 100 }, { glucose: 949.7 }, { glucose: 120 }, { glucose: 815.3 }], + }; + const result = applyFix( + ds, + makeProposal({ + operationKind: "replace_value", + operationParams: { column: "glucose", rowIndices: [1, 3], replacement: 110 }, + }) + ); + expect(result.rowsAffected).toBe(2); + expect(result.dataset.rows[0].glucose).toBe(100); + expect(result.dataset.rows[1].glucose).toBe(110); + expect(result.dataset.rows[2].glucose).toBe(120); + expect(result.dataset.rows[3].glucose).toBe(110); + }); + + test("replace_value: rowIndices wins when both rowIndices and match are present", () => { + // Deterministic precedence: if both are supplied (legacy LLM output), + // honor the index-based targeting since match is the fragile path. 
+ const ds: DatasetView = { + columns: ["x"], + rows: [{ x: 1 }, { x: 2 }, { x: 3 }], + }; + const result = applyFix( + ds, + makeProposal({ + operationKind: "replace_value", + operationParams: { + column: "x", + rowIndices: [0], + match: 999, // would match nothing — only rowIndices should win + replacement: 99, + }, + }) + ); + expect(result.rowsAffected).toBe(1); + expect(result.dataset.rows[0].x).toBe(99); + expect(result.dataset.rows[1].x).toBe(2); + }); + test("replace_value: original dataset is not mutated", () => { const ds: DatasetView = { columns: ["age"], @@ -140,41 +188,105 @@ describe("applyFix", () => { expect(result.dataset.rows[2].v).toBe(4); }); - test("impute mode: fills missing with most common string", () => { + test("impute: treats profiler missing-tokens (N/A, NULL, null, Unknown, whitespace) as missing", () => { + // Regression: apply-fix's isMissing used to only recognize null/undefined/NaN/"" + // while the profiler flagged "N/A", "NULL", "null", "Unknown" as missing. Result: + // after Fix-and-run, the cleaned CSV still contained those literal tokens because + // impute silently skipped them. Both sides must agree. const ds: DatasetView = { - columns: ["c"], + columns: ["glucose"], rows: [ - { c: "A" }, { c: "A" }, { c: "B" }, { c: null }, { c: "" }, + { glucose: 100 }, + { glucose: "NULL" }, + { glucose: 120 }, + { glucose: "null" }, + { glucose: "N/A" }, + { glucose: " " }, + { glucose: 140 }, ], }; const result = applyFix( ds, makeProposal({ operationKind: "impute", - operationParams: { column: "c", strategy: "mode" }, + operationParams: { column: "glucose", strategy: "median" }, + }) + ); + // 4 missing tokens replaced; non-missing observations [100, 120, 140] → median 120. 
+ expect(result.rowsAffected).toBe(4); + expect(result.dataset.rows[1].glucose).toBe(120); + expect(result.dataset.rows[3].glucose).toBe(120); + expect(result.dataset.rows[4].glucose).toBe(120); + expect(result.dataset.rows[5].glucose).toBe(120); + }); + + test("impute mode: missing-token strings are not counted as mode candidates", () => { + // "NULL" appearing twice must not be voted the mode just because it's the + // most frequent string — it's a missing-marker, not data. + const ds: DatasetView = { + columns: ["group"], + rows: [ + { group: "A" }, + { group: "NULL" }, + { group: "A" }, + { group: "NULL" }, + { group: "B" }, + { group: null }, + ], + }; + const result = applyFix( + ds, + makeProposal({ + operationKind: "impute", + operationParams: { column: "group", strategy: "mode" }, }) ); + expect(result.dataset.rows[1].group).toBe("A"); + expect(result.dataset.rows[3].group).toBe("A"); + expect(result.dataset.rows[5].group).toBe("A"); + }); + + test("impute respects session-supplied missingTokens override", () => { + // Regression for the missingTokens-not-threaded bug: a user who set + // {missingTokens: ["xyz"]} at scan time would have rows whose value is + // literally "xyz" flagged by the profiler but silently skipped by impute + // (which only knew about the default tokens). Threading ApplyOptions + // through fixes this. + const ds: DatasetView = { + columns: ["v"], + rows: [{ v: 1 }, { v: "xyz" }, { v: 3 }, { v: "xyz" }, { v: 5 }], + }; + const result = applyFix( + ds, + makeProposal({ + operationKind: "impute", + operationParams: { column: "v", strategy: "median" }, + }), + { missingTokens: ["xyz"] } + ); + // Non-missing values [1, 3, 5] → median 3, both "xyz" cells replaced. 
expect(result.rowsAffected).toBe(2); - expect(result.dataset.rows[3].c).toBe("A"); - expect(result.dataset.rows[4].c).toBe("A"); + expect(result.dataset.rows[1].v).toBe(3); + expect(result.dataset.rows[3].v).toBe(3); }); - test("flag: does not mutate rows, populates flaggedRows", () => { + test("impute mode: fills missing with most common string", () => { const ds: DatasetView = { - columns: ["x"], - rows: [{ x: 1 }, { x: 2 }, { x: 3 }], + columns: ["c"], + rows: [ + { c: "A" }, { c: "A" }, { c: "B" }, { c: null }, { c: "" }, + ], }; const result = applyFix( ds, makeProposal({ - operationKind: "flag", - operationParams: { rowIndices: [0, 2] }, + operationKind: "impute", + operationParams: { column: "c", strategy: "mode" }, }) ); expect(result.rowsAffected).toBe(2); - expect(result.flaggedRows).toEqual([0, 2]); - expect(result.dataset.rows[0].x).toBe(1); - expect(result.dataset.rows[2].x).toBe(3); + expect(result.dataset.rows[3].c).toBe("A"); + expect(result.dataset.rows[4].c).toBe("A"); }); test("trim_whitespace: trims string cells in target column", () => { diff --git a/agent-service/src/agent/tools/dataguard/__tests__/dataguard-session.test.ts b/agent-service/src/agent/tools/dataguard/__tests__/dataguard-session.test.ts index 27afdb8346f..389858105bf 100644 --- a/agent-service/src/agent/tools/dataguard/__tests__/dataguard-session.test.ts +++ b/agent-service/src/agent/tools/dataguard/__tests__/dataguard-session.test.ts @@ -58,7 +58,6 @@ describe("DataGuardSession", () => { expect(s.getDataset()).toBe(ds); expect(s.getIssues()).toEqual([]); expect(s.getDecisionLog()).toEqual([]); - expect(s.getFlaggedRows()).toEqual([]); }); test("recordIssue accumulates and dedupes by issueId", () => { @@ -110,10 +109,4 @@ describe("DataGuardSession", () => { expect(s.matchesAutoAllowRule("placeholder_value")).toBe(false); }); - test("addFlaggedRows merges + dedupes + sorts", () => { - const s = new DataGuardSession(); - s.addFlaggedRows([3, 1, 2]); - s.addFlaggedRows([2, 5]); - 
expect(s.getFlaggedRows()).toEqual([1, 2, 3, 5]); - }); }); diff --git a/agent-service/src/agent/tools/dataguard/__tests__/decision-log-no-modify.test.ts b/agent-service/src/agent/tools/dataguard/__tests__/decision-log-no-modify.test.ts new file mode 100644 index 00000000000..146f5b0961e --- /dev/null +++ b/agent-service/src/agent/tools/dataguard/__tests__/decision-log-no-modify.test.ts @@ -0,0 +1,128 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +// Decision-log contract tests for the Modify-verdict cut (task #11a / #15). +// The CSV-shaped audit trail in §4.4 of the design doc currently carries a +// `modified_action` column. After #11a that column is gone and so is the +// `modifiedAction` property on each in-memory DecisionLogEntry produced by +// DataGuardSession.recordDecision. 
+ +import { describe, expect, test } from "bun:test"; +import { serializeDecisionLogCsv } from "../decision-log"; +import { DataGuardSession } from "../dataguard-session"; +import type { DecisionLogEntry, FixProposal } from "../../../../types/dataguard"; + +function entry(overrides: Partial = {}): DecisionLogEntry { + return { + decisionId: "dec-1", + timestamp: "2026-05-15T12:00:00.000Z", + issueType: "placeholder_value", + targetRowCount: 5, + proposedAction: "Replace age=999 with NULL", + userDecision: "allow", + reason: "out of valid range", + confidence: "high", + appliedAt: "2026-05-15T12:00:01.000Z", + ...overrides, + }; +} + +function proposal(overrides: Partial = {}): FixProposal { + return { + issueId: "iss-1", + issueType: "placeholder_value", + action: "Replace age=999 with NULL", + operationKind: "replace_value", + operationParams: { column: "age", match: 999, replacement: null }, + riskTier: "medium", + reason: "out of valid range", + evidence: "5 of 5 placeholder rows", + confidence: "high", + targetRowCount: 5, + ...overrides, + }; +} + +describe("serializeDecisionLogCsv header — modified_action column removed (#11a)", () => { + test("empty-log header is the 9-column schema with no `modified_action`", () => { + const csv = serializeDecisionLogCsv([]); + const header = csv.split("\n")[0]; + expect(header).toBe( + "decision_id,timestamp,issue_type,target_rows,proposed_action,user_decision,reason,confidence,applied_at" + ); + expect(header).not.toContain("modified_action"); + }); + + test("data rows have exactly 9 fields (matching the 9-column header)", () => { + const csv = serializeDecisionLogCsv([entry()]); + const lines = csv.split("\n"); + const headerCols = lines[0].split(",").length; + // Use a CSV-aware split for the data row to handle quoted fields safely; + // here the fixture has no commas inside fields so a plain split is fine. 
+ const dataCols = lines[1].split(",").length; + expect(headerCols).toBe(9); + expect(dataCols).toBe(9); + }); +}); + +describe("DataGuardSession.recordDecision — modifiedAction is gone (#11a)", () => { + test("written entries never carry a `modifiedAction` field", () => { + const session = new DataGuardSession(); + session.recordProposal(proposal()); + session.recordDecision({ + proposal: proposal(), + verdict: "allow", + applied: true, + }); + const log = session.getDecisionLog(); + expect(log).toHaveLength(1); + expect(log[0]).not.toHaveProperty("modifiedAction"); + }); + + test("a denied decision likewise has no `modifiedAction`", () => { + const session = new DataGuardSession(); + session.recordProposal(proposal()); + session.recordDecision({ + proposal: proposal(), + verdict: "deny", + applied: false, + }); + const log = session.getDecisionLog(); + expect(log[0]).not.toHaveProperty("modifiedAction"); + }); + + test("auto_allow_low_risk and auto_allow_remembered entries also lack `modifiedAction`", () => { + const session = new DataGuardSession(); + session.recordProposal(proposal()); + session.recordDecision({ + proposal: proposal(), + verdict: "auto_allow_low_risk", + applied: true, + }); + session.recordDecision({ + proposal: proposal({ issueId: "iss-2" }), + verdict: "auto_allow_remembered", + applied: true, + }); + const log = session.getDecisionLog(); + expect(log).toHaveLength(2); + expect(log[0]).not.toHaveProperty("modifiedAction"); + expect(log[1]).not.toHaveProperty("modifiedAction"); + }); +}); diff --git a/agent-service/src/agent/tools/dataguard/__tests__/decision-log.test.ts b/agent-service/src/agent/tools/dataguard/__tests__/decision-log.test.ts index f31f1adbaa0..da8ddf4e496 100644 --- a/agent-service/src/agent/tools/dataguard/__tests__/decision-log.test.ts +++ b/agent-service/src/agent/tools/dataguard/__tests__/decision-log.test.ts @@ -38,9 +38,12 @@ function entry(overrides: Partial = {}): DecisionLogEntry { 
describe("serializeDecisionLogCsv", () => { test("empty log returns header only", () => { + // The `modified_action` column was removed by #11a (Modify verdict cut); + // the canonical 9-column header lives below — `decision-log-no-modify.test.ts` + // also locks this contract from the opposite direction. const csv = serializeDecisionLogCsv([]); expect(csv.split("\n")).toEqual([ - "decision_id,timestamp,issue_type,target_rows,proposed_action,user_decision,modified_action,reason,confidence,applied_at", + "decision_id,timestamp,issue_type,target_rows,proposed_action,user_decision,reason,confidence,applied_at", ]); }); @@ -65,13 +68,14 @@ describe("serializeDecisionLogCsv", () => { expect(dataRow).toContain('"line1\nline2"'); }); - test("missing appliedAt and modifiedAction render as empty fields", () => { + test("missing appliedAt renders as an empty trailing field", () => { + // Post-#11a the schema is 9 cols; appliedAt is the last and can be blank + // for denied decisions (nothing was applied). const csv = serializeDecisionLogCsv([ entry({ userDecision: "deny", appliedAt: undefined }), ]); const row = csv.split("\n")[1]; - expect(row.endsWith(",")).toBe(true); // appliedAt is the last column and is empty - expect(row).toContain(",,"); // modifiedAction is empty between reason+confidence's neighbors + expect(row.endsWith(",")).toBe(true); }); test("multiple rows preserve insertion order", () => { diff --git a/agent-service/src/agent/tools/dataguard/__tests__/permission-types.test.ts b/agent-service/src/agent/tools/dataguard/__tests__/permission-types.test.ts new file mode 100644 index 00000000000..06402063524 --- /dev/null +++ b/agent-service/src/agent/tools/dataguard/__tests__/permission-types.test.ts @@ -0,0 +1,121 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +// Type-level contract tests for the Modify-verdict cut (task #11a / #15). +// +// `tsc --noEmit` is part of the QA gate (bun run typecheck). Any one of the +// ts-expect-error directives below failing to fire WILL produce a compile +// error of the form "Unused 'ts-expect-error' directive". (Avoiding the @ +// prefix in this comment so tsc does not parse it as a directive itself.) +// Backend's job (in #15) is to narrow `Verdict` to "allow" | "deny" (plus the +// two internal auto_allow_* sentinels) and to drop `modifiedAction` from +// `PermissionDecision`, `DecisionLogEntry`, and `RecordDecisionInput`. + +import { describe, expect, test } from "bun:test"; +import type { + DecisionLogEntry, + PermissionDecision, + Verdict, +} from "../../../../types/dataguard"; +import type { RecordDecisionInput } from "../dataguard-session"; + +describe("Verdict type — Modify is gone (#11a)", () => { + test("a value of \"modify\" is NOT assignable to Verdict", () => { + // @ts-expect-error "modify" is removed from the Verdict union by #11a. + const v: Verdict = "modify"; + // The .toBe argument is also "modify" and that argument is statically + // checked against the Verdict union — silence the same error here too. 
+ // @ts-expect-error + expect(v).toBe("modify"); + }); + + test("\"allow\" and \"deny\" remain valid Verdict members", () => { + const allow: Verdict = "allow"; + const deny: Verdict = "deny"; + expect(allow).toBe("allow"); + expect(deny).toBe("deny"); + }); + + test("the two internal auto_allow_* sentinels remain valid", () => { + const low: Verdict = "auto_allow_low_risk"; + const remembered: Verdict = "auto_allow_remembered"; + expect(low).toBe("auto_allow_low_risk"); + expect(remembered).toBe("auto_allow_remembered"); + }); +}); + +describe("PermissionDecision — modifiedAction is gone (#11a)", () => { + test("constructing a PermissionDecision with `modifiedAction` is a type error", () => { + const d: PermissionDecision = { + stepId: "step-1", + verdict: "allow", + // @ts-expect-error `modifiedAction` is removed from PermissionDecision by #11a. + modifiedAction: "Flag instead of replace", + }; + expect(d.stepId).toBe("step-1"); + }); + + test("a minimal PermissionDecision with only stepId + verdict still type-checks", () => { + const d: PermissionDecision = { stepId: "step-1", verdict: "deny" }; + expect(d.verdict).toBe("deny"); + }); +}); + +describe("DecisionLogEntry — modifiedAction is gone (#11a)", () => { + test("constructing a DecisionLogEntry with `modifiedAction` is a type error", () => { + const e: DecisionLogEntry = { + decisionId: "dec-1", + timestamp: "2026-05-15T00:00:00.000Z", + issueType: "placeholder_value", + targetRowCount: 5, + proposedAction: "Replace age=999 with NULL", + userDecision: "allow", + // @ts-expect-error `modifiedAction` is removed from DecisionLogEntry by #11a. 
+ modifiedAction: "Flag instead of replace", + reason: "test", + confidence: "high", + }; + expect(e.decisionId).toBe("dec-1"); + }); +}); + +describe("RecordDecisionInput — modifiedAction is gone (#11a)", () => { + test("DataGuardSession.recordDecision callers can no longer pass `modifiedAction`", () => { + const proposal = { + issueId: "iss-1", + issueType: "placeholder_value" as const, + action: "Replace age=999 with NULL", + operationKind: "replace_value" as const, + operationParams: {}, + riskTier: "medium" as const, + reason: "test", + evidence: "test", + confidence: "high" as const, + targetRowCount: 5, + }; + const input: RecordDecisionInput = { + proposal, + verdict: "allow", + // @ts-expect-error `modifiedAction` is removed from RecordDecisionInput by #11a. + modifiedAction: "Flag instead of replace", + applied: true, + }; + expect(input.verdict).toBe("allow"); + }); +}); diff --git a/agent-service/src/agent/tools/dataguard/__tests__/profile-dataset.test.ts b/agent-service/src/agent/tools/dataguard/__tests__/profile-dataset.test.ts index 0586e338445..8a22e1d0c23 100644 --- a/agent-service/src/agent/tools/dataguard/__tests__/profile-dataset.test.ts +++ b/agent-service/src/agent/tools/dataguard/__tests__/profile-dataset.test.ts @@ -121,6 +121,59 @@ describe("profileDataset", () => { expect(dup!.affectedRowCount).toBe(2); }); + test("auto-infers idColumn from name patterns (sample_id) when caller omits it", () => { + // The auto-trigger pipeline POSTs /scan with an empty body — without this + // inference, dup-ID detection would never fire on user files. Match must + // be conservative (id-like column names only), not value-based. 
+ const ds: DatasetView = { + columns: ["sample_id", "age"], + rows: [ + { sample_id: "S1", age: 30 }, + { sample_id: "S2", age: 40 }, + { sample_id: "S1", age: 30 }, + ], + }; + const issues = profileDataset(ds); + const dup = issues.find(i => i.issueType === "duplicate_id"); + expect(dup).toBeDefined(); + expect(dup!.column).toBe("sample_id"); + expect(dup!.affectedRowCount).toBe(2); + }); + + test("auto-infer recognizes bare `id`, `*Id`, `id_*` patterns too", () => { + const cases: Array<{ col: string }> = [ + { col: "id" }, + { col: "userId" }, + { col: "id_card" }, + { col: "ID" }, + ]; + for (const { col } of cases) { + const ds: DatasetView = { + columns: [col, "value"], + rows: [ + { [col]: "a", value: 1 }, + { [col]: "a", value: 2 }, + { [col]: "b", value: 3 }, + ], + }; + const issues = profileDataset(ds); + const dup = issues.find(i => i.issueType === "duplicate_id"); + expect(dup).toBeDefined(); + expect(dup!.column).toBe(col); + } + }); + + test("auto-infer does NOT fire when no column name looks like an ID", () => { + // Conservative: just having repeated values isn't enough — the user's + // workflow may legitimately have duplicate categorical labels. 
+ const ds: DatasetView = { + columns: ["color", "qty"], + rows: [{ color: "red", qty: 1 }, { color: "red", qty: 2 }], + }; + const issues = profileDataset(ds); + expect(issues.find(i => i.issueType === "duplicate_id")).toBeUndefined(); + }); + test("no idColumn → no duplicate_id issue even with repeated values", () => { const ds: DatasetView = { columns: ["x"], @@ -130,7 +183,7 @@ describe("profileDataset", () => { expect(issues.find(i => i.issueType === "duplicate_id")).toBeUndefined(); }); - test("validRanges → detects out-of-range values", () => { + test("validRanges → detects outlier values", () => { const ds: DatasetView = { columns: ["bmi"], rows: [ @@ -138,18 +191,18 @@ describe("profileDataset", () => { ], }; const issues = profileDataset(ds, { validRanges: { bmi: { min: 10, max: 60 } } }); - const oor = issues.find(i => i.issueType === "out_of_range"); - expect(oor).toBeDefined(); - expect(oor!.affectedRowCount).toBe(2); + const outlier = issues.find(i => i.issueType === "outlier"); + expect(outlier).toBeDefined(); + expect(outlier!.affectedRowCount).toBe(2); }); - test("placeholder values are not double-counted as out_of_range", () => { + test("placeholder values are not double-counted as outliers", () => { const ds: DatasetView = { columns: ["age"], rows: [{ age: 25 }, { age: 999 }, { age: 30 }], }; const issues = profileDataset(ds, { validRanges: { age: { min: 0, max: 130 } } }); - expect(issues.find(i => i.issueType === "out_of_range")).toBeUndefined(); + expect(issues.find(i => i.issueType === "outlier")).toBeUndefined(); expect(issues.find(i => i.issueType === "placeholder_value")).toBeDefined(); }); @@ -204,6 +257,109 @@ describe("profileDataset", () => { expect(kinds.has("placeholder_value")).toBe(true); expect(kinds.has("missing_value")).toBe(true); expect(kinds.has("duplicate_id")).toBe(true); - expect(kinds.has("out_of_range")).toBe(true); + expect(kinds.has("outlier")).toBe(true); + }); + + describe("outlier detector (validRanges-based)", () 
=> { + test("does not fire without validRanges hint — earlier z-score variant was removed", () => { + // 19 values near 50, one extreme reading at 500. The old z-score + // detector would have flagged the 500 automatically; the validRanges + // detector requires the caller to opt in by supplying a hard range. + const rows: Array> = []; + for (let i = 0; i < 19; i++) rows.push({ v: 50 + (i % 3) }); + rows.push({ v: 500 }); + const issues = profileDataset({ columns: ["v"], rows }); + expect(issues.find(i => i.issueType === "outlier")).toBeUndefined(); + }); + + test("flags values outside the user-supplied range", () => { + const rows: Array> = []; + for (let i = 0; i < 19; i++) rows.push({ age: 30 + (i % 5) }); + rows.push({ age: 500 }); + const issues = profileDataset( + { columns: ["age"], rows }, + { validRanges: { age: { min: 0, max: 120 } } } + ); + const outlier = issues.find(i => i.issueType === "outlier" && i.column === "age"); + expect(outlier).toBeDefined(); + expect(outlier!.affectedRowCount).toBe(1); + expect(outlier!.affectedRowIndices).toEqual([19]); + }); + + test("does NOT flag clusters of consecutive large readings — the whole point of the redesign", () => { + // The earlier z-score detector would have flagged half of this column. + // We deliberately don't: the user owns the definition of "out of range", + // and unless they say so, clustered extremes are real data. 
+ const rows: Array> = []; + for (let i = 0; i < 10; i++) rows.push({ glucose: 100 }); + for (let i = 0; i < 10; i++) rows.push({ glucose: 400 }); // sustained high + const issues = profileDataset({ columns: ["glucose"], rows }); + expect(issues.find(i => i.issueType === "outlier")).toBeUndefined(); + }); + }); + + describe("inconsistent_label detector", () => { + test("flags rows using non-canonical spellings of the same label", () => { + const ds: DatasetView = { + columns: ["gender"], + rows: [ + { gender: "Male" }, + { gender: "Male" }, + { gender: "Male" }, + { gender: "male" }, + { gender: "MALE" }, + { gender: "Female" }, + { gender: "Female" }, + { gender: "female" }, + ], + }; + const issues = profileDataset(ds); + const ilab = issues.find(i => i.issueType === "inconsistent_label" && i.column === "gender"); + expect(ilab).toBeDefined(); + // "male" and "MALE" are non-canonical variants of "Male" (most common); + // "female" is non-canonical variant of "Female". Total: 3 rows. + expect(ilab!.affectedRowCount).toBe(3); + }); + + test("ignores columns that are pure data (every value unique)", () => { + const ds: DatasetView = { + columns: ["note"], + rows: Array.from({ length: 30 }, (_, i) => ({ note: `note-${i}` })), + }; + const issues = profileDataset(ds); + expect(issues.find(i => i.issueType === "inconsistent_label")).toBeUndefined(); + }); + + test("ignores columns above cardinality cap (likely free text)", () => { + const rows: Array> = []; + for (let i = 0; i < 25; i++) rows.push({ tag: `tag${i}` }); + // Add a clear inconsistency for the 26th distinct group — but we exceed the cap. 
+ rows.push({ tag: "tag1 " }); // would collide with "tag1" if checked + const issues = profileDataset( + { columns: ["tag"], rows }, + { inconsistentLabelMaxCardinality: 20 } + ); + expect(issues.find(i => i.issueType === "inconsistent_label")).toBeUndefined(); + }); + + test("disabled when inconsistentLabelMaxCardinality = 0", () => { + const ds: DatasetView = { + columns: ["g"], + rows: [{ g: "A" }, { g: "a" }, { g: "A" }], + }; + const issues = profileDataset(ds, { inconsistentLabelMaxCardinality: 0 }); + expect(issues.find(i => i.issueType === "inconsistent_label")).toBeUndefined(); + }); + + test("does not flag when all spellings agree (genuine low-cardinality categorical)", () => { + const ds: DatasetView = { + columns: ["group"], + rows: [ + { group: "A" }, { group: "A" }, { group: "A" }, { group: "B" }, { group: "B" }, + ], + }; + const issues = profileDataset(ds); + expect(issues.find(i => i.issueType === "inconsistent_label")).toBeUndefined(); + }); }); }); diff --git a/agent-service/src/agent/tools/dataguard/__tests__/with-approval-no-modify.test.ts b/agent-service/src/agent/tools/dataguard/__tests__/with-approval-no-modify.test.ts new file mode 100644 index 00000000000..8161e261ad2 --- /dev/null +++ b/agent-service/src/agent/tools/dataguard/__tests__/with-approval-no-modify.test.ts @@ -0,0 +1,117 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +// Post-cut behavior pin for the with-approval permission gate (#11a). +// The legacy chat-flow gate (`requestApproval` in with-approval.ts) is still +// wired for any future caller (e.g. WorkflowGuard). After Modify is cut, the +// gate must: +// +// • still resolve cleanly on "allow" and "deny" verdicts arriving over WS +// • never receive a "modify" verdict — and the type system enforces this +// (the @ts-expect-error below proves `verdict: "modify"` won't compile) +// +// The pre-existing test in `with-approval.test.ts` named +// "'modify' verdict carries through with the modifiedAction" is expected to +// be removed by backend during #15 (it constructs a PermissionDecision +// literal containing `verdict: "modify"`, which will be a type error after +// the cut). 
+ +import { describe, expect, test } from "bun:test"; +import { requestApproval, type ApprovalGateway } from "../with-approval"; +import type { + FixProposal, + IssueType, + PermissionDecision, +} from "../../../../types/dataguard"; + +function makeProposal(overrides: Partial = {}): FixProposal { + return { + issueId: "iss-1", + issueType: "placeholder_value", + action: "Replace age=999 with NULL", + operationKind: "replace_value", + operationParams: { column: "age", match: 999, replacement: null }, + riskTier: "medium", + reason: "test", + evidence: "test", + confidence: "high", + targetRowCount: 5, + ...overrides, + }; +} + +class MockGateway implements ApprovalGateway { + rules: Set = new Set(); + emitted: Array<{ stepId: string; proposal: FixProposal }> = []; + private decisions: Map = new Map(); + private waiters: Map void> = new Map(); + private counter = 0; + + matchesAutoAllowRule(issueType: IssueType): boolean { + return this.rules.has(issueType); + } + generateStepId(): string { + this.counter += 1; + return `mock-step-${this.counter}`; + } + emitPendingApproval(stepId: string, proposal: FixProposal): void { + this.emitted.push({ stepId, proposal }); + } + awaitDecision(stepId: string): Promise { + if (this.decisions.has(stepId)) return Promise.resolve(this.decisions.get(stepId)!); + return new Promise(resolve => this.waiters.set(stepId, resolve)); + } + resolveLater(stepId: string, decision: PermissionDecision): void { + const w = this.waiters.get(stepId); + if (w) { + this.waiters.delete(stepId); + w(decision); + } else { + this.decisions.set(stepId, decision); + } + } +} + +describe("requestApproval after Modify cut (#11a)", () => { + test("medium-risk allow flow round-trips with no modifiedAction in sight", async () => { + const gw = new MockGateway(); + const promise = requestApproval(gw, makeProposal({ riskTier: "medium" })); + expect(gw.emitted).toHaveLength(1); + gw.resolveLater("mock-step-1", { stepId: "mock-step-1", verdict: "allow" }); + const 
decision = await promise; + expect(decision.verdict).toBe("allow"); + expect(decision).not.toHaveProperty("modifiedAction"); + }); + + test("high-risk deny flow round-trips with no modifiedAction in sight", async () => { + const gw = new MockGateway(); + const promise = requestApproval(gw, makeProposal({ riskTier: "high" })); + expect(gw.emitted).toHaveLength(1); + gw.resolveLater("mock-step-1", { stepId: "mock-step-1", verdict: "deny" }); + const decision = await promise; + expect(decision.verdict).toBe("deny"); + expect(decision).not.toHaveProperty("modifiedAction"); + }); + + test("type system rejects a PermissionDecision literal with verdict: \"modify\"", () => { + // @ts-expect-error "modify" is no longer a Verdict member after #11a. + const bad: PermissionDecision = { stepId: "x", verdict: "modify" }; + expect(bad.stepId).toBe("x"); + }); +}); diff --git a/agent-service/src/agent/tools/dataguard/__tests__/with-approval.test.ts b/agent-service/src/agent/tools/dataguard/__tests__/with-approval.test.ts index 1cfbbb0b67b..c92ffe43d1c 100644 --- a/agent-service/src/agent/tools/dataguard/__tests__/with-approval.test.ts +++ b/agent-service/src/agent/tools/dataguard/__tests__/with-approval.test.ts @@ -112,17 +112,26 @@ describe("requestApproval", () => { expect(decision.verdict).toBe("deny"); }); - test("'modify' verdict carries through with the modifiedAction", async () => { + test("warning tier: prompts every time even with a remembered rule", async () => { + // The whole point of `warning` is "we have a concrete fix but need human + // judgement". A remembered rule from earlier in the session must NOT + // auto-approve it — otherwise the warning tier becomes pointless. 
const gw = new MockGateway(); - const promise = requestApproval(gw, makeProposal({ riskTier: "medium" })); - gw.resolveLater("mock-step-1", { - stepId: "mock-step-1", - verdict: "modify", - modifiedAction: "Flag instead of replace", - }); + gw.rules.add("outlier"); + const promise = requestApproval(gw, makeProposal({ issueType: "outlier", riskTier: "warning" })); + expect(gw.emitted).toHaveLength(1); + gw.resolveLater("mock-step-1", { stepId: "mock-step-1", verdict: "allow" }); + const decision = await promise; + expect(decision.verdict).toBe("allow"); + }); + + test("warning tier without a remembered rule: still prompts (no auto_allow_low_risk)", async () => { + const gw = new MockGateway(); + const promise = requestApproval(gw, makeProposal({ riskTier: "warning" })); + expect(gw.emitted).toHaveLength(1); + gw.resolveLater("mock-step-1", { stepId: "mock-step-1", verdict: "deny" }); const decision = await promise; - expect(decision.verdict).toBe("modify"); - expect(decision.modifiedAction).toBe("Flag instead of replace"); + expect(decision.verdict).toBe("deny"); }); test("a decision that arrives before the tool awaits is buffered and delivered", async () => { diff --git a/agent-service/src/agent/tools/dataguard/apply-fix.ts b/agent-service/src/agent/tools/dataguard/apply-fix.ts index 77d2c0f85f4..fef924e9684 100644 --- a/agent-service/src/agent/tools/dataguard/apply-fix.ts +++ b/agent-service/src/agent/tools/dataguard/apply-fix.ts @@ -24,14 +24,27 @@ import type { FixProposal } from "../../../types/dataguard"; import type { DatasetView } from "./dataset"; +import { isMissing as isCellMissing } from "./missing-detection"; export interface ApplyResult { dataset: DatasetView; rowsAffected: number; - flaggedRows: number[]; } -export function applyFix(dataset: DatasetView, proposal: FixProposal): ApplyResult { +export interface ApplyOptions { + // Extra tokens — beyond the DEFAULT_MISSING_TOKENS — that should also be + // treated as missing. 
Threaded through from the user's /scan call so that + // a session configured with custom `missingTokens` (e.g., ["xyz"]) sees + // those same tokens treated as missing during `impute`. Without this, + // apply-fix and the profiler would silently disagree on what's missing. + missingTokens?: string[]; +} + +export function applyFix( + dataset: DatasetView, + proposal: FixProposal, + options: ApplyOptions = {} +): ApplyResult { const rows = dataset.rows.map(r => ({ ...r })); let columns = [...dataset.columns]; const params = proposal.operationParams; @@ -39,16 +52,36 @@ export function applyFix(dataset: DatasetView, proposal: FixProposal): ApplyResu switch (proposal.operationKind) { case "replace_value": { const column = params.column as string; - const match = params.match; const replacement = params.replacement; + // Two targeting modes: + // - rowIndices: deterministic, used for outlier / placeholder + // where the profiler already knows exactly which rows are wrong. + // - match: value-based, used for cases like "replace every 'unknown' with null". + // rowIndices wins when both are present. Without rowIndices support, + // replace_value silently no-ops whenever the LLM-supplied `match` is + // slightly off (e.g. 950 vs 950.0 vs "950") — that turned every outlier + // proposal into a byte-identical re-export, which then made LakeFS abort + // the version commit with "No changes detected in dataset". 
+ const targetIndices = params.rowIndices as number[] | undefined; let affected = 0; - for (const r of rows) { - if (cellEquals(r[column], match)) { - r[column] = replacement; - affected++; + if (targetIndices && targetIndices.length > 0) { + const indexSet = new Set(targetIndices); + for (let i = 0; i < rows.length; i++) { + if (indexSet.has(i)) { + rows[i][column] = replacement; + affected++; + } + } + } else { + const match = params.match; + for (const r of rows) { + if (cellEquals(r[column], match)) { + r[column] = replacement; + affected++; + } } } - return { dataset: { columns, rows }, rowsAffected: affected, flaggedRows: [] }; + return { dataset: { columns, rows }, rowsAffected: affected }; } case "drop_rows": { @@ -57,31 +90,21 @@ export function applyFix(dataset: DatasetView, proposal: FixProposal): ApplyResu return { dataset: { columns, rows: kept }, rowsAffected: rows.length - kept.length, - flaggedRows: [], }; } case "impute": { const column = params.column as string; const strategy = params.strategy as "mean" | "median" | "mode"; - const fill = computeImputeValue(rows, column, strategy); + const fill = computeImputeValue(rows, column, strategy, options.missingTokens); let affected = 0; for (const r of rows) { - if (isMissing(r[column])) { + if (isCellMissing(r[column], options.missingTokens)) { r[column] = fill; affected++; } } - return { dataset: { columns, rows }, rowsAffected: affected, flaggedRows: [] }; - } - - case "flag": { - const indices = (params.rowIndices as number[]).slice(); - return { - dataset: { columns, rows }, - rowsAffected: indices.length, - flaggedRows: indices, - }; + return { dataset: { columns, rows }, rowsAffected: affected }; } case "trim_whitespace": { @@ -97,7 +120,7 @@ export function applyFix(dataset: DatasetView, proposal: FixProposal): ApplyResu } } } - return { dataset: { columns, rows }, rowsAffected: affected, flaggedRows: [] }; + return { dataset: { columns, rows }, rowsAffected: affected }; } case "standardize": { 
@@ -111,7 +134,7 @@ export function applyFix(dataset: DatasetView, proposal: FixProposal): ApplyResu affected++; } } - return { dataset: { columns, rows }, rowsAffected: affected, flaggedRows: [] }; + return { dataset: { columns, rows }, rowsAffected: affected }; } case "rename_column": { @@ -126,7 +149,7 @@ export function applyFix(dataset: DatasetView, proposal: FixProposal): ApplyResu affected++; } } - return { dataset: { columns, rows }, rowsAffected: affected, flaggedRows: [] }; + return { dataset: { columns, rows }, rowsAffected: affected }; } default: @@ -144,23 +167,20 @@ function cellEquals(a: unknown, b: unknown): boolean { return false; } -function isMissing(v: unknown): boolean { - if (v === null || v === undefined) return true; - if (typeof v === "number" && Number.isNaN(v)) return true; - if (typeof v === "string" && v === "") return true; - return false; -} - function computeImputeValue( rows: Record[], column: string, - strategy: "mean" | "median" | "mode" + strategy: "mean" | "median" | "mode", + missingTokens?: string[] ): unknown { const numericValues: number[] = []; const stringCounts = new Map(); for (const r of rows) { const v = r[column]; - if (isMissing(v)) continue; + // Honor the session's missingTokens override so impute treats the exact + // same set of cells as missing that the profiler flagged. Without this, + // the user sees "NULL"/"N/A" still in the cleaned CSV after Fix-and-run. + if (isCellMissing(v, missingTokens)) continue; if (typeof v === "number" && Number.isFinite(v)) { numericValues.push(v); } else if (typeof v === "string") { diff --git a/agent-service/src/agent/tools/dataguard/dataguard-session.ts b/agent-service/src/agent/tools/dataguard/dataguard-session.ts index 1fc33799f8b..6b7ced8fcff 100644 --- a/agent-service/src/agent/tools/dataguard/dataguard-session.ts +++ b/agent-service/src/agent/tools/dataguard/dataguard-session.ts @@ -19,8 +19,8 @@ // Per-agent DataGuard run state. 
One DataGuardSession lives on each // TexeraAgent (lazy-initialized when the first DataGuard tool fires) and -// holds the working dataset, accumulated issues, decision log, flagged rows, -// and auto-allow rules. Independent of the workflow state so resetting one +// holds the working dataset, accumulated issues, decision log, and +// auto-allow rules. Independent of the workflow state so resetting one // does not affect the other. import type { @@ -36,20 +36,36 @@ import type { DatasetView } from "./dataset"; export interface RecordDecisionInput { proposal: FixProposal; verdict: Verdict; - modifiedAction?: string; applied: boolean; } +// Profiler options the user supplied at /scan time. Stored so the post-apply +// re-scan can use the same configuration the issues were originally found with. +export interface ScanOptions { + idColumn?: string; + validRanges?: Record; + placeholderValues?: Array; + missingTokens?: string[]; +} + export class DataGuardSession { private dataset: DatasetView | undefined; private issues: Map = new Map(); private proposals: Map = new Map(); private decisionLog: DecisionLogEntry[] = []; - private flaggedRows: Set = new Set(); private autoAllowRules: Map = new Map(); + private scanOptions: ScanOptions = {}; private decisionCounter = 0; private ruleCounter = 0; + setScanOptions(opts: ScanOptions): void { + this.scanOptions = { ...opts }; + } + + getScanOptions(): ScanOptions { + return this.scanOptions; + } + setDataset(dataset: DatasetView): void { this.dataset = dataset; // A new dataset means a fresh DataGuard run — clear the per-run state. 
@@ -57,7 +73,6 @@ export class DataGuardSession { this.issues.clear(); this.proposals.clear(); this.decisionLog = []; - this.flaggedRows.clear(); } recordProposal(proposal: FixProposal): void { @@ -98,7 +113,6 @@ export class DataGuardSession { targetRowCount: input.proposal.targetRowCount, proposedAction: input.proposal.action, userDecision: input.verdict, - modifiedAction: input.modifiedAction, reason: input.proposal.reason, confidence: input.proposal.confidence, appliedAt: input.applied ? now : undefined, @@ -111,14 +125,6 @@ export class DataGuardSession { return [...this.decisionLog]; } - addFlaggedRows(indices: number[]): void { - for (const i of indices) this.flaggedRows.add(i); - } - - getFlaggedRows(): number[] { - return Array.from(this.flaggedRows).sort((a, b) => a - b); - } - addAutoAllowRule(issueType: IssueType): AutoAllowRule { // Idempotent: if a rule already exists for this issueType, return it. for (const r of this.autoAllowRules.values()) { diff --git a/agent-service/src/agent/tools/dataguard/dataguard-tools.ts b/agent-service/src/agent/tools/dataguard/dataguard-tools.ts index f58e29df8e4..50bb5ea3adf 100644 --- a/agent-service/src/agent/tools/dataguard/dataguard-tools.ts +++ b/agent-service/src/agent/tools/dataguard/dataguard-tools.ts @@ -59,11 +59,12 @@ function createProfileDatasetTool(ctx: DataGuardToolContext) { return tool({ description: `Scan the loaded dataset for quality issues. Read-only. -Detects four categories: +Detects five categories: - missing_value: null / empty / configured missing tokens - placeholder_value: numeric (999, -1) or string sentinels - duplicate_id: requires idColumn hint -- out_of_range: requires validRanges hint per column +- outlier: requires validRanges hint per column — flags numeric values outside [min, max]. The earlier z-score variant was removed because it flagged legitimate consecutive large readings. 
+- inconsistent_label: low-cardinality string columns where trim+lowercase keys collide on multiple raw spellings (e.g. "Male"/"male"/"MALE") Call this once at the start of a DataGuard run. Returns a JSON array of DataQualityIssue records.`, inputSchema: z.object({ @@ -74,7 +75,7 @@ Call this once at the start of a DataGuard run. Returns a JSON array of DataQual validRanges: z .record(z.string(), z.object({ min: z.number(), max: z.number() })) .optional() - .describe("Per-column valid numeric range. Values outside are flagged as out_of_range."), + .describe("Per-column valid numeric range. Values outside are flagged as outlier."), placeholderValues: z .array(z.union([z.string(), z.number()])) .optional() @@ -135,7 +136,13 @@ function createApplyFixTool(ctx: DataGuardToolContext) { return tool({ description: `Apply a previously-proposed fix to the dataset. MUTATING — gated by user approval. -Pass the issueId. The proposal stored from suggest_fix is looked up automatically. For risk tier "low" the fix is auto-applied with a summary line; for "medium" / "high" the user must approve through the chat panel. The result includes the user's verdict.`, +Pass the issueId. The proposal stored from suggest_fix is looked up automatically. Risk-tier behaviour: +- "low": auto-applied with a summary line (no prompt). +- "medium": user must approve through the chat panel; "Allow & remember" is offered so future similar fixes auto-apply. +- "high": user must approve every time; "remember" is NOT offered (always-ask, e.g. drop_rows). +- "warning": user must approve every time; "remember" is NOT offered. Used for fixes that are concrete but where domain judgement is required (e.g. outlier clamping that might destroy a real extreme value). + +The result includes the user's verdict.`, inputSchema: z.object({ issueId: z.string().describe("The issueId whose proposal should be applied."), }), @@ -160,25 +167,19 @@ Pass the issueId. 
The proposal stored from suggest_fix is looked up automaticall }); } - // For modify, MVP keeps the original operationKind/params but records the - // user's free-text override in the log. Future iteration can parse the - // modifiedAction back into a structured proposal override. - const modifiedAction = decision.verdict === "modify" ? decision.modifiedAction : undefined; - try { - const result = applyFix(dataset, proposal); + const result = applyFix(dataset, proposal, { + missingTokens: ctx.session.getScanOptions().missingTokens, + }); ctx.session.updateDataset(result.dataset); - if (result.flaggedRows.length > 0) ctx.session.addFlaggedRows(result.flaggedRows); ctx.session.recordDecision({ proposal, verdict: decision.verdict, - modifiedAction, applied: true, }); return JSON.stringify({ verdict: decision.verdict, rowsAffected: result.rowsAffected, - flaggedRows: result.flaggedRows, datasetRowCount: result.dataset.rows.length, message: `Applied ${proposal.operationKind}. Rows affected: ${result.rowsAffected}.`, }); diff --git a/agent-service/src/agent/tools/dataguard/decision-log.ts b/agent-service/src/agent/tools/dataguard/decision-log.ts index e9752fdf422..7d2c5bc9003 100644 --- a/agent-service/src/agent/tools/dataguard/decision-log.ts +++ b/agent-service/src/agent/tools/dataguard/decision-log.ts @@ -33,7 +33,6 @@ const HEADER_COLUMNS = [ "target_rows", "proposed_action", "user_decision", - "modified_action", "reason", "confidence", "applied_at", @@ -55,7 +54,6 @@ function rowToCsv(e: DecisionLogEntry): string { csvField(String(e.targetRowCount)), csvField(e.proposedAction), csvField(e.userDecision), - csvField(e.modifiedAction ?? ""), csvField(e.reason), csvField(e.confidence), csvField(e.appliedAt ?? 
""), diff --git a/agent-service/src/agent/tools/dataguard/missing-detection.ts b/agent-service/src/agent/tools/dataguard/missing-detection.ts new file mode 100644 index 00000000000..afa6e463ae9 --- /dev/null +++ b/agent-service/src/agent/tools/dataguard/missing-detection.ts @@ -0,0 +1,86 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +// Single source of truth for "is this cell missing / placeholder?" Used by both +// the profiler (which flags issues) and the applier (which fills/replaces). +// Why: when the two disagree, impute silently leaves cells that the profiler +// flagged as missing — the user sees "NULL"/"N/A" still in the cleaned CSV. + +export const DEFAULT_PLACEHOLDERS: ReadonlyArray = [ + 999, + -1, + "unknown", + "Unknown", +]; + +// Case-insensitive set of tokens that mean "no value was recorded." Compared +// against the *trimmed*, lowercased cell so whitespace and case can't smuggle +// a missing cell past the check. +const MISSING_TOKENS_LOWER: ReadonlySet = new Set([ + "na", + "n/a", + "null", + "none", + "nan", +]); + +// Kept for places that still want the raw token list (e.g., the profiler's +// ProfileOptions API surface). 
+export const DEFAULT_MISSING_TOKENS: ReadonlyArray = [ + "NA", + "N/A", + "n/a", + "null", + "NULL", + "None", +]; + +export function isMissing(value: unknown, extraTokens: ReadonlyArray = []): boolean { + if (value === null || value === undefined) return true; + if (typeof value === "number" && Number.isNaN(value)) return true; + if (typeof value !== "string") return false; + const trimmed = value.trim(); + if (trimmed === "") return true; + if (MISSING_TOKENS_LOWER.has(trimmed.toLowerCase())) return true; + if (extraTokens.includes(value) || extraTokens.includes(trimmed)) return true; + return false; +} + +export function toNumber(value: unknown): number | undefined { + if (typeof value === "number" && Number.isFinite(value)) return value; + if (typeof value === "string" && value.trim() !== "") { + const n = Number(value); + if (Number.isFinite(n)) return n; + } + return undefined; +} + +export function placeholderHit( + value: unknown, + placeholders: ReadonlyArray +): string | number | undefined { + for (const p of placeholders) { + if (typeof p === "string" && typeof value === "string" && p === value) return p; + if (typeof p === "number") { + const n = toNumber(value); + if (n !== undefined && n === p) return p; + } + } + return undefined; +} diff --git a/agent-service/src/agent/tools/dataguard/profile-dataset.ts b/agent-service/src/agent/tools/dataguard/profile-dataset.ts index 08cd8723bd2..a5c55d2167d 100644 --- a/agent-service/src/agent/tools/dataguard/profile-dataset.ts +++ b/agent-service/src/agent/tools/dataguard/profile-dataset.ts @@ -22,6 +22,13 @@ import type { DataQualityIssue } from "../../../types/dataguard"; import type { DatasetView } from "./dataset"; +import { + DEFAULT_MISSING_TOKENS, + DEFAULT_PLACEHOLDERS, + isMissing as isCellMissing, + placeholderHit, + toNumber, +} from "./missing-detection"; export interface ProfileOptions { // Column to treat as a unique identifier; duplicates flagged as duplicate_id. 
@@ -34,29 +41,12 @@ export interface ProfileOptions { missingTokens?: string[]; // Above this row count, affectedRowIndices is omitted from the issue. maxIndicesInIssue?: number; + // Max distinct values a string column may have before inconsistent-label + // detection skips it (free-text columns will hit this and be ignored). + // Default 20. Set to 0 to disable label detection entirely. + inconsistentLabelMaxCardinality?: number; } -// Placeholders are sentinel *values* that look like data but are actually -// "no value." Kept distinct from missing-tokens to avoid double-flagging -// the same cell under two issue types. -const DEFAULT_PLACEHOLDERS: Array = [ - 999, - -1, - "unknown", - "Unknown", -]; - -// Tokens that mean "no data was recorded." Empty string is always treated -// as missing without needing to be listed. -const DEFAULT_MISSING_TOKENS: string[] = [ - "NA", - "N/A", - "n/a", - "null", - "NULL", - "None", -]; - const DEFAULT_MAX_INDICES_IN_ISSUE = 50; let issueCounter = 0; @@ -69,37 +59,8 @@ function nowIso(): string { return new Date().toISOString(); } -function isMissing(value: unknown, missingTokens: string[]): boolean { - if (value === null || value === undefined) return true; - if (typeof value === "number" && Number.isNaN(value)) return true; - if (typeof value === "string") { - if (value === "") return true; - if (missingTokens.includes(value)) return true; - } - return false; -} - -function toNumber(value: unknown): number | undefined { - if (typeof value === "number" && Number.isFinite(value)) return value; - if (typeof value === "string" && value.trim() !== "") { - const n = Number(value); - if (Number.isFinite(n)) return n; - } - return undefined; -} - -function placeholderHit( - value: unknown, - placeholders: Array -): string | number | undefined { - for (const p of placeholders) { - if (typeof p === "string" && typeof value === "string" && p === value) return p; - if (typeof p === "number") { - const n = toNumber(value); - if (n !== 
undefined && n === p) return p; - } - } - return undefined; +function isMissing(value: unknown, missingTokens: ReadonlyArray): boolean { + return isCellMissing(value, missingTokens); } function maybeIndices( @@ -109,6 +70,26 @@ function maybeIndices( return indices.length <= cap ? indices : undefined; } +// Guess which column is the row identifier when the caller didn't specify one. +// Conservative: only matches columns whose names look unambiguously like IDs. +// Tries the cheapest, most-specific patterns first so e.g. `sample_id` wins +// over a generic `id_card` column elsewhere in the schema. Returns undefined +// when nothing matches — the caller skips dup-ID detection in that case. +function inferIdColumn(columns: ReadonlyArray): string | undefined { + const matchers: Array<(name: string) => boolean> = [ + name => /^id$/i.test(name), + name => /_id$/i.test(name), + name => /^id_/i.test(name), + name => /Id$/.test(name), + name => /^.+_uid$/i.test(name) || /^uid$/i.test(name), + ]; + for (const m of matchers) { + const hit = columns.find(c => m(c)); + if (hit) return hit; + } + return undefined; +} + export function profileDataset( dataset: DatasetView, options: ProfileOptions = {} @@ -119,9 +100,9 @@ export function profileDataset( const detectedAt = nowIso(); const issues: DataQualityIssue[] = []; - // Pre-compute placeholder hits per row/column so out_of_range can avoid - // double-counting and missing-value can avoid flagging a row that has a - // string placeholder like "N/A" twice. + // Pre-compute placeholder hits per row/column so outlier-detection can + // skip rows already flagged elsewhere and missing-value can avoid flagging + // a row that has a string placeholder like "N/A" twice. const placeholderHitByColRow = new Map>(); for (const col of dataset.columns) { const map = new Map(); @@ -170,9 +151,14 @@ export function profileDataset( }); } - // Duplicate-ID detector (only when idColumn is configured and exists). 
- if (options.idColumn && dataset.columns.includes(options.idColumn)) { - const idCol = options.idColumn; + // Duplicate-ID detector. Honors options.idColumn when set; otherwise tries + // to infer one from column names (e.g. "sample_id" → use it). Without this + // inference the auto-trigger's empty-body /scan would never find dup IDs in + // user datasets — users don't configure scan options through the checklist UI. + const idCol = options.idColumn && dataset.columns.includes(options.idColumn) + ? options.idColumn + : inferIdColumn(dataset.columns); + if (idCol) { const positions = new Map(); for (let i = 0; i < dataset.rows.length; i++) { const v = dataset.rows[i][idCol]; @@ -205,28 +191,103 @@ export function profileDataset( } } - // Out-of-range detector — skips rows already flagged as placeholders so we - // don't surface the same row under two issue types. + // Inconsistent-label detector. For each low-cardinality string column, + // group raw values by a normalized key (trim + lowercase). If two or more + // raw spellings collapse to the same key, every row using a non-canonical + // spelling is flagged. Example: "Male"/"male"/"M" all map to "m"/"male" + // depending on key choice. We use trim+lowercase so "Yes"/"yes"/" yes " + // collide as expected. + const labelMaxCardinality = options.inconsistentLabelMaxCardinality ?? 20; + if (labelMaxCardinality > 0) { + for (const col of dataset.columns) { + const placeholderHits = placeholderHitByColRow.get(col)!; + // Count distinct non-missing, non-placeholder string values. 
+ const raw = new Map(); + let nonStringSeen = false; + for (let i = 0; i < dataset.rows.length; i++) { + if (placeholderHits.has(i)) continue; + const v = dataset.rows[i][col]; + if (isMissing(v, missingTokens)) continue; + if (typeof v !== "string") { + nonStringSeen = true; + continue; + } + const list = raw.get(v); + if (list) list.push(i); + else raw.set(v, [i]); + } + if (nonStringSeen || raw.size === 0 || raw.size > labelMaxCardinality) continue; + + // Group by normalized key. A key with >1 raw spelling = inconsistent. + const groups = new Map; rows: number[] }>(); + for (const [rawValue, rows] of raw) { + const key = rawValue.trim().toLowerCase(); + const existing = groups.get(key); + if (existing) { + existing.spellings.add(rawValue); + existing.rows.push(...rows); + // Prefer the most common spelling as canonical (heuristic). + if (rows.length > (raw.get(existing.canonical)?.length ?? 0)) { + existing.canonical = rawValue; + } + } else { + groups.set(key, { canonical: rawValue, spellings: new Set([rawValue]), rows: [...rows] }); + } + } + const inconsistentRows: number[] = []; + const examples: string[] = []; + for (const g of groups.values()) { + if (g.spellings.size > 1) { + // Flag every row that uses a non-canonical spelling. 
+ for (const [rawValue, rows] of raw) { + if (rawValue !== g.canonical && g.spellings.has(rawValue)) { + inconsistentRows.push(...rows); + } + } + examples.push(`{${Array.from(g.spellings).join(" / ")}}`); + } + } + if (inconsistentRows.length === 0) continue; + inconsistentRows.sort((a, b) => a - b); + issues.push({ + issueId: nextIssueId(), + issueType: "inconsistent_label", + column: col, + description: `${inconsistentRows.length} row(s) in ${col} use non-canonical label spellings`, + evidence: `Mixed spellings (showing up to 3): ${examples.slice(0, 3).join(", ")}`, + affectedRowCount: inconsistentRows.length, + affectedRowIndices: maybeIndices(inconsistentRows, indexCap), + detectedAt, + }); + } + } + + // Outlier detector — values that fall outside a user-supplied hard range. + // We deliberately do NOT auto-detect outliers via z-score: legitimate + // consecutive large readings (e.g. clusters of high glucose in a clinical + // dataset) would be flagged en masse. Requires the caller to opt in by + // providing validRanges per column. Skips rows already flagged as + // placeholders so the same row doesn't surface under two issue types. 
if (options.validRanges) { for (const [col, range] of Object.entries(options.validRanges)) { if (!dataset.columns.includes(col)) continue; const placeholderHits = placeholderHitByColRow.get(col)!; - const oorIndices: number[] = []; + const outlierIndices: number[] = []; for (let i = 0; i < dataset.rows.length; i++) { if (placeholderHits.has(i)) continue; const v = toNumber(dataset.rows[i][col]); if (v === undefined) continue; - if (v < range.min || v > range.max) oorIndices.push(i); + if (v < range.min || v > range.max) outlierIndices.push(i); } - if (oorIndices.length === 0) continue; + if (outlierIndices.length === 0) continue; issues.push({ issueId: nextIssueId(), - issueType: "out_of_range", + issueType: "outlier", column: col, - description: `${oorIndices.length} row(s) in ${col} fall outside [${range.min}, ${range.max}]`, - evidence: `Valid range: [${range.min}, ${range.max}]; violations: ${oorIndices.length}.`, - affectedRowCount: oorIndices.length, - affectedRowIndices: maybeIndices(oorIndices, indexCap), + description: `${outlierIndices.length} row(s) in ${col} fall outside the valid range [${range.min}, ${range.max}]`, + evidence: `Valid range: [${range.min}, ${range.max}]; violations: ${outlierIndices.length}.`, + affectedRowCount: outlierIndices.length, + affectedRowIndices: maybeIndices(outlierIndices, indexCap), detectedAt, }); } diff --git a/agent-service/src/agent/tools/dataguard/suggest-fix.ts b/agent-service/src/agent/tools/dataguard/suggest-fix.ts index b4e6feda995..01554959c9d 100644 --- a/agent-service/src/agent/tools/dataguard/suggest-fix.ts +++ b/agent-service/src/agent/tools/dataguard/suggest-fix.ts @@ -36,25 +36,29 @@ const fixProposalSchema = z.object({ "replace_value", "drop_rows", "impute", - "flag", "standardize", "trim_whitespace", "rename_column", ]), operationParams: z.record(z.string(), z.unknown()), - riskTier: z.enum(["low", "medium", "high"]), + riskTier: z.enum(["low", "medium", "high", "warning"]), reason: z.string().min(1), 
evidence: z.string().min(1), confidence: z.enum(["low", "medium", "high"]), targetRowCount: z.number().int().nonnegative(), }); +// `warning` here = "we have a concrete fix but recommend a human eyeball it +// first" — the checklist UI defaults these to unchecked. Used for cases where +// auto-applying could destroy meaningful signal (outliers that might be real +// extremes, biologically plausible out-of-range values). const DEFAULT_RISK_TIER_BY_ISSUE: Record = { placeholder_value: "medium", missing_value: "medium", duplicate_id: "high", - out_of_range: "medium", - outlier: "high", + // `outlier` is the validRanges-based detector — see profile-dataset.ts. The + // earlier z-score outlier detector was removed. + outlier: "warning", inconsistent_label: "medium", }; @@ -94,6 +98,10 @@ export async function suggestFix( export function buildPrompt(issue: DataQualityIssue): string { const defaultTier = DEFAULT_RISK_TIER_BY_ISSUE[issue.issueType] ?? "medium"; + const indicesLine = + issue.affectedRowIndices && issue.affectedRowIndices.length > 0 + ? `\n- affectedRowIndices: [${issue.affectedRowIndices.join(", ")}]` + : ""; return `You are a data-cleaning assistant. Propose a single concrete fix for the following data-quality issue. Reply with one JSON object only — no prose, no markdown, no fences. Issue: @@ -101,14 +109,14 @@ Issue: - column: ${issue.column} - description: ${issue.description} - evidence: ${issue.evidence} -- affectedRowCount: ${issue.affectedRowCount} +- affectedRowCount: ${issue.affectedRowCount}${indicesLine} Required JSON shape: { "action": "", - "operationKind": "replace_value | drop_rows | impute | flag | standardize | trim_whitespace | rename_column", + "operationKind": "replace_value | drop_rows | impute | standardize | trim_whitespace | rename_column", "operationParams": { ...operation-specific params... 
}, - "riskTier": "low | medium | high", + "riskTier": "low | medium | high | warning", "reason": "", "evidence": "", "confidence": "low | medium | high", @@ -116,15 +124,25 @@ Required JSON shape: } operationParams by kind: -- replace_value: { "column": string, "match": any, "replacement": any } +- replace_value: { "column": string, "replacement": any, "rowIndices"?: number[], "match"?: any } + Use "rowIndices" when the issue gives you affectedRowIndices — index targeting + is deterministic. Use "match" only for value-based swaps (e.g. replace every + literal "unknown" with null) when you don't have indices. Never invent a + "match" value from aggregate stats — silent no-ops happen when match doesn't + equal the cell exactly (e.g. 950 vs 950.0 vs "950"). - drop_rows: { "rowIndices": number[] } - impute: { "column": string, "strategy": "mean" | "median" | "mode" } -- flag: { "rowIndices": number[] } - standardize: { "column": string, "mapping": { [from: string]: string } } - trim_whitespace: { "column": string } - rename_column: { "from": string, "to": string } -Default risk tier for ${issue.issueType}: ${defaultTier}. Override only with a strong reason. Prefer "flag" or "impute" over destructive "drop_rows".`; +Rules: +- Every proposal must be a concrete, applicable fix — no "flag-and-do-nothing" options. +- For outlier issues (values outside the valid range stated in description / evidence): use replace_value with "rowIndices" set to the affectedRowIndices and "replacement" set to the nearest valid bound (min or max from the range), or to null if no bound is sensible. Never use "match" for outliers — always use rowIndices. +- Set riskTier="warning" when you have a concrete fix but the user really should eyeball it before applying — typical for outlier values that are unusual but possibly real (extreme biological readings, etc.). The UI defaults these to unchecked so the user makes an explicit call. +- Prefer impute over drop_rows for missing values. 
+ +Default risk tier for ${issue.issueType}: ${defaultTier}. Override only with a strong reason.`; } function stripCodeFences(s: string): string { diff --git a/agent-service/src/agent/tools/dataguard/with-approval.ts b/agent-service/src/agent/tools/dataguard/with-approval.ts index 454ea6de901..025a7d101f9 100644 --- a/agent-service/src/agent/tools/dataguard/with-approval.ts +++ b/agent-service/src/agent/tools/dataguard/with-approval.ts @@ -43,9 +43,16 @@ export async function requestApproval( gateway: ApprovalGateway, proposal: FixProposal ): Promise { - // High-risk fixes ALWAYS prompt — the "remember" rule does not apply. + // `high` and `warning` ALWAYS prompt — the "remember" rule does not apply. // This is the same shape Claude Code uses for destructive Bash operations. - if (proposal.riskTier !== "high" && gateway.matchesAutoAllowRule(proposal.issueType)) { + // + // `warning` exists specifically because the agent is *not* confident enough + // to act without a human eyeball (e.g., outliers that might be real extreme + // values). Letting an "Allow & remember placeholder_value" rule from earlier + // in the session auto-approve a warning-tier fix would defeat the whole + // point of the tier. 
+ const alwaysPrompt = proposal.riskTier === "high" || proposal.riskTier === "warning"; + if (!alwaysPrompt && gateway.matchesAutoAllowRule(proposal.issueType)) { return { stepId: "", verdict: "auto_allow_remembered" }; } if (proposal.riskTier === "low") { diff --git a/agent-service/src/server.ts b/agent-service/src/server.ts index 3bfa0df9f7c..bfbaaa3ee7a 100644 --- a/agent-service/src/server.ts +++ b/agent-service/src/server.ts @@ -30,6 +30,7 @@ import { createLogger } from "./logger"; const log = createLogger("Server"); const wsLog = createLogger("WS"); +import type { DataQualityIssue } from "./types/dataguard"; import type { AgentInfo, AgentDelegateConfig, @@ -137,12 +138,22 @@ function getAgent(agentId: string): TexeraAgent { return agent; } -const agentsRouter = new Elysia({ prefix: "/agents" }) +// `normalize: false` keeps unknown fields in the parsed body so additionalProperties:false +// schemas can reject them (Elysia 1.4 strips by default otherwise — see #11a tests). +const agentsRouter = new Elysia({ prefix: "/agents", normalize: false }) // Error handler must live on the same Elysia instance whose routes throw, or // its scope will not see the errors. Elysia 1.x defaults to local scoping for // .onError, so attach here rather than on the outer app. - .onError(({ error, set }) => { + .onError(({ error, code, set }) => { log.error({ err: error }, "request error"); + // Elysia body-schema rejection — surface as 400 so callers can distinguish + // bad input from server bugs. Without this, every typebox validation error + // ends up as a 500 and the modify-cut tests can't tell whether the route + // rejected the bad verdict or crashed. + if (code === "VALIDATION") { + set.status = 400; + return { error: error instanceof Error ? error.message : String(error) }; + } const errorMessage = error instanceof Error ? 
error.message : String(error); if (errorMessage === "Agent not found") { set.status = 404; @@ -345,6 +356,228 @@ const agentsRouter = new Elysia({ prefix: "/agents" }) } ) + // Server-driven DataGuard scan. Runs profile_dataset + suggest_fix entirely + // server-side (no chat / no LLM tool loop), returns a flat list of issues + // each paired with a FixProposal. The checklist UI consumes this directly. + // + // Body (optional): { idColumn?, validRanges?, placeholderValues?, missingTokens? } + // — same profile_dataset options, all optional. + .post( + "/:id/dataguard/scan", + async ({ params: { id }, body }) => { + const agent = getAgent(id); + const session = agent.getDataGuardSession(); + const dataset = session.getDataset(); + if (!dataset) { + return { error: "No dataset loaded. Call /dataguard/dataset first." }; + } + const { profileDataset } = await import("./agent/tools/dataguard/profile-dataset"); + const { suggestFix } = await import("./agent/tools/dataguard/suggest-fix"); + const scanOptions = { + idColumn: body?.idColumn, + validRanges: body?.validRanges, + placeholderValues: body?.placeholderValues, + missingTokens: body?.missingTokens, + }; + session.setScanOptions(scanOptions); + const issues = profileDataset(dataset, scanOptions); + for (const issue of issues) session.recordIssue(issue); + + // Generate a proposal per issue in parallel. Each calls the LLM once. + const llmCall = (prompt: string) => agent.callLlm(prompt); + const proposals = await Promise.all( + issues.map(async issue => { + try { + const p = await suggestFix(issue, { llmCall }); + session.recordProposal(p); + return { issueId: issue.issueId, proposal: p, error: null }; + } catch (e: unknown) { + const msg = e instanceof Error ? 
e.message : String(e); + return { issueId: issue.issueId, proposal: null, error: msg }; + } + }) + ); + return { issueCount: issues.length, issues, proposals }; + }, + { + body: t.Optional( + t.Object({ + idColumn: t.Optional(t.String()), + validRanges: t.Optional(t.Record(t.String(), t.Object({ min: t.Number(), max: t.Number() }))), + placeholderValues: t.Optional(t.Array(t.Union([t.String(), t.Number()]))), + missingTokens: t.Optional(t.Array(t.String())), + }) + ), + } + ) + + // Apply a user-selected batch of FixProposals (the checklist UI sends this + // when the user clicks "Apply Selected"). Each entry in `decisions` is one + // checkbox row: { issueId, verdict, remember? }. + .post( + "/:id/dataguard/apply-batch", + async ({ params: { id }, body, set }) => { + const agent = getAgent(id); + const session = agent.getDataGuardSession(); + const { applyFix } = await import("./agent/tools/dataguard/apply-fix"); + + // `remember` only applies when the user approves a fix — it adds the + // issueType to autoAllowRules so future similar fixes are pre-approved. + // Pairing it with deny is nonsense and would silently teach the agent + // an unintended rule, so we reject the whole batch (#12). + const badRemember = body.decisions.find(d => d.verdict === "deny" && d.remember === true); + if (badRemember) { + set.status = 400; + return { + error: `decision for issueId="${badRemember.issueId}" combines verdict="deny" with remember=true; remember only applies to allow.`, + }; + } + + // Belt-and-suspenders rejection of legacy fields (e.g., #11a's + // `modifiedAction`). The typebox schema sets additionalProperties:false, + // but Elysia's body parser sometimes strips unknown keys before + // validation; this explicit check guarantees an honest 400. 
+ const KNOWN_KEYS = new Set(["issueId", "verdict", "remember"]); + const rawBody = body as unknown as { decisions: Array> }; + for (let i = 0; i < rawBody.decisions.length; i++) { + const entry = rawBody.decisions[i]; + const extras = Object.keys(entry).filter(k => !KNOWN_KEYS.has(k)); + if (extras.length > 0) { + set.status = 400; + return { + error: `decision at index ${i} has unknown field(s): ${extras.join(", ")}. Allowed: issueId, verdict, remember.`, + }; + } + } + + const results: Array<{ + issueId: string; + verdict: string; + applied: boolean; + rowsAffected: number; + error?: string; + }> = []; + + let dataset = session.getDataset(); + if (!dataset) return { error: "No dataset loaded." }; + + for (const decision of body.decisions) { + const proposal = session.getProposal(decision.issueId); + if (!proposal) { + results.push({ + issueId: decision.issueId, + verdict: decision.verdict, + applied: false, + rowsAffected: 0, + error: "no proposal for this issueId — call /scan first", + }); + continue; + } + if (decision.verdict === "deny") { + session.recordDecision({ proposal, verdict: "deny", applied: false }); + results.push({ issueId: decision.issueId, verdict: "deny", applied: false, rowsAffected: 0 }); + continue; + } + try { + // Thread the session's scan-time missingTokens into applyFix so that + // user-configured tokens (e.g. ["xyz"]) are treated as missing by + // impute, matching what the profiler flagged. + const out = applyFix(dataset, proposal, { + missingTokens: session.getScanOptions().missingTokens, + }); + dataset = out.dataset; + session.updateDataset(dataset); + session.recordDecision({ + proposal, + verdict: decision.verdict, + applied: true, + }); + if (decision.remember) { + session.addAutoAllowRule(proposal.issueType); + } + results.push({ + issueId: decision.issueId, + verdict: decision.verdict, + applied: true, + rowsAffected: out.rowsAffected, + }); + } catch (e: unknown) { + const msg = e instanceof Error ? 
e.message : String(e); + results.push({ + issueId: decision.issueId, + verdict: decision.verdict, + applied: false, + rowsAffected: 0, + error: msg, + }); + } + } + // Verification pass: re-run the profiler on the cleaned dataset so the + // UI can surface anything the proposals didn't actually fix. These are + // genuine leftovers (denied fixes, anything impute couldn't compute, + // etc.). The earlier split into "acknowledged" (flag-for-review) is + // gone because the flag op kind is gone — every proposal now produces a + // real concrete change. + let residualIssues: DataQualityIssue[] = []; + if (dataset) { + const { profileDataset } = await import("./agent/tools/dataguard/profile-dataset"); + residualIssues = profileDataset(dataset, session.getScanOptions()); + } + return { + applied: results.filter(r => r.applied).length, + denied: results.filter(r => r.verdict === "deny").length, + failed: results.filter(r => !r.applied && r.verdict !== "deny").length, + datasetRowCount: dataset?.rows.length ?? 0, + results, + residualIssues, + residualCount: residualIssues.length, + }; + }, + { + body: t.Object({ + decisions: t.Array( + t.Object( + { + issueId: t.String(), + // "modify" was cut by #11a — body schema must reject it (the + // legacy handler executed the original proposalParams anyway and + // only logged the user's free-text, which silently lied to users). + verdict: t.Union([t.Literal("allow"), t.Literal("deny")]), + remember: t.Optional(t.Boolean()), + }, + // Reject legacy fields like `modifiedAction` outright instead of + // silently dropping them — callers that still send them are buggy. + { additionalProperties: false } + ) + ), + }), + } + ) + + // Return the in-memory cleaned dataset as a CSV blob. The frontend uses this + // after "Apply selected" to upload the cleaned data back as a new dataset + // version, then auto-runs the workflow. 
+ .get("/:id/dataguard/export-csv", ({ params: { id }, set }) => { + const agent = getAgent(id); + const dataset = agent.getDataGuardSession().getDataset(); + if (!dataset) { + set.status = 404; + return "No dataset loaded."; + } + const escape = (v: unknown): string => { + if (v === null || v === undefined) return ""; + const s = String(v); + return /[",\n\r]/.test(s) ? `"${s.replace(/"/g, '""')}"` : s; + }; + const lines: string[] = []; + lines.push(dataset.columns.map(escape).join(",")); + for (const row of dataset.rows) { + lines.push(dataset.columns.map(c => escape(row[c])).join(",")); + } + set.headers["content-type"] = "text/csv; charset=utf-8"; + return lines.join("\n"); + }) + .get("/:id/dataguard/session", ({ params: { id } }) => { const agent = getAgent(id); const session = agent.getDataGuardSession(); @@ -354,7 +587,6 @@ const agentsRouter = new Elysia({ prefix: "/agents" }) datasetColumnCount: dataset?.columns.length ?? 0, issues: session.getIssues(), decisionLog: session.getDecisionLog(), - flaggedRows: session.getFlaggedRows(), autoAllowRules: session.getAutoAllowRules(), }; }) @@ -442,9 +674,10 @@ interface WsMessage { messageSource?: "chat" | "feedback"; // Fields below carry the user's verdict on a pending-approval step. // Used when type === "decision". See agent/tools/dataguard/with-approval.ts. + // "modify" verdict was cut by #11a (it silently lied — the handler ran the + // original proposalParams and just logged the user's free-text). stepId?: string; - verdict?: "allow" | "deny" | "modify"; - modifiedAction?: string; + verdict?: "allow" | "deny"; remember?: boolean; } @@ -515,7 +748,10 @@ function broadcastToAgent(agentId: string, message: WsOutgoingMessage): void { } export function buildApp() { - return new Elysia() + // `normalize: false` so body schemas with additionalProperties:false can + // reject unknown fields (Elysia 1.4 silently strips them by default — see + // the #11a modify-reject tests). 
+ return new Elysia({ normalize: false }) .use(cors()) .group(env.API_PREFIX, app => app @@ -582,10 +818,27 @@ export function buildApp() { ); return; } + if (msg.verdict !== "allow" && msg.verdict !== "deny") { + ws.send( + JSON.stringify({ + type: "error", + error: `verdict must be "allow" or "deny" (got "${msg.verdict}")`, + }) + ); + return; + } + if (msg.verdict === "deny" && msg.remember === true) { + ws.send( + JSON.stringify({ + type: "error", + error: "remember=true only applies to allow decisions", + }) + ); + return; + } const resolved = agent.resolveDecision(msg.stepId, { stepId: msg.stepId, verdict: msg.verdict, - modifiedAction: msg.modifiedAction, remember: msg.remember, }); wsLog.info( @@ -663,6 +916,12 @@ export function _resetAgentStoreForTests(): void { agentCounter = 0; } +// Reach an agent by id from a test so a test can seed its DataGuardSession +// directly (avoids running the LLM-backed /scan to set up state). +export function _getAgentForTests(id: string): TexeraAgent | undefined { + return agentStore.get(id); +} + function printStartupMessage(app: ReturnType) { const LINE = "=".repeat(60); console.log(LINE); diff --git a/agent-service/src/types/dataguard.test.ts b/agent-service/src/types/dataguard.test.ts index 53047e08879..534e657ad06 100644 --- a/agent-service/src/types/dataguard.test.ts +++ b/agent-service/src/types/dataguard.test.ts @@ -113,6 +113,8 @@ describe("DataGuard type shapes", () => { }; expect(entry.userDecision).toBe("allow"); expect(entry.appliedAt).toBeDefined(); + // @ts-expect-error modifiedAction was cut from DecisionLogEntry by #11a; + // this assertion locks the absence of the property at the type level. 
expect(entry.modifiedAction).toBeUndefined(); }); @@ -131,23 +133,6 @@ describe("DataGuard type shapes", () => { expect(entry.appliedAt).toBeUndefined(); }); - test("DecisionLogEntry: modified — carries modifiedAction", () => { - const entry: DecisionLogEntry = { - decisionId: "dec-3", - timestamp: "2026-05-14T12:02:00.000Z", - issueType: "missing_value", - targetRowCount: 17, - proposedAction: "Impute missing glucose with group median", - userDecision: "modify", - modifiedAction: "Flag for manual review", - reason: "Imbalance across groups makes imputation risky.", - confidence: "medium", - appliedAt: "2026-05-14T12:02:05.000Z", - }; - expect(entry.userDecision).toBe("modify"); - expect(entry.modifiedAction).toBe("Flag for manual review"); - }); - test("AutoAllowRule: per-issue-type policy", () => { const rule: AutoAllowRule = { ruleId: "rule-1", @@ -164,17 +149,6 @@ describe("DataGuard type shapes", () => { remember: true, }; expect(decision.remember).toBe(true); - expect(decision.modifiedAction).toBeUndefined(); - }); - - test("PermissionDecision: modify with modifiedAction", () => { - const decision: PermissionDecision = { - stepId: "step-42", - verdict: "modify", - modifiedAction: "Flag for manual review instead of impute", - }; - expect(decision.verdict).toBe("modify"); - expect(decision.modifiedAction).toBeDefined(); }); test("PermissionDecision: deny", () => { @@ -186,13 +160,12 @@ describe("DataGuard type shapes", () => { }); test("Literal unions accept all documented members", () => { - const risks: RiskTier[] = ["low", "medium", "high"]; + const risks: RiskTier[] = ["low", "medium", "high", "warning"]; const confidences: Confidence[] = ["low", "medium", "high"]; const issueTypes: IssueType[] = [ "placeholder_value", "missing_value", "duplicate_id", - "out_of_range", "outlier", "inconsistent_label", ]; @@ -200,7 +173,6 @@ describe("DataGuard type shapes", () => { "replace_value", "drop_rows", "impute", - "flag", "standardize", "trim_whitespace", 
"rename_column", @@ -208,14 +180,13 @@ describe("DataGuard type shapes", () => { const verdicts: Verdict[] = [ "allow", "deny", - "modify", "auto_allow_low_risk", "auto_allow_remembered", ]; - expect(risks).toHaveLength(3); + expect(risks).toHaveLength(4); expect(confidences).toHaveLength(3); - expect(issueTypes).toHaveLength(6); - expect(opKinds).toHaveLength(7); - expect(verdicts).toHaveLength(5); + expect(issueTypes).toHaveLength(5); + expect(opKinds).toHaveLength(6); + expect(verdicts).toHaveLength(4); }); }); diff --git a/agent-service/src/types/dataguard.ts b/agent-service/src/types/dataguard.ts index d5b17ac1654..6eb113dd080 100644 --- a/agent-service/src/types/dataguard.ts +++ b/agent-service/src/types/dataguard.ts @@ -21,15 +21,21 @@ // DataGuard tools (profile_dataset / suggest_fix / apply_fix / write_decision_log), // the agent's permission-gating layer, and the chat-panel approval UI. -export type RiskTier = "low" | "medium" | "high"; +// `warning` marks a fix the agent thinks is risky enough to recommend manual +// review — UI defaults the checkbox to unchecked and renders an orange badge. +// The fix itself is still concrete and applicable (no more no-op "flag" ops). +export type RiskTier = "low" | "medium" | "high" | "warning"; export type Confidence = "low" | "medium" | "high"; +// `outlier` is the validRanges-based detector. Earlier there was a separate +// z-score "outlier" detector that flagged anything beyond ±3σ — too aggressive +// (it removed legitimately large but consecutive readings), so it was dropped. +// The remaining detector requires the user to supply a hard min/max per column. 
export type IssueType = | "placeholder_value" | "missing_value" | "duplicate_id" - | "out_of_range" | "outlier" | "inconsistent_label"; @@ -37,15 +43,17 @@ export type FixOperationKind = | "replace_value" | "drop_rows" | "impute" - | "flag" | "standardize" | "trim_whitespace" | "rename_column"; +// "modify" was removed for MVP (#11a) to avoid silent fallback — the legacy +// handler recorded a user-supplied action override in the log but always +// executed the original proposal.operationParams. Revisit post-hackathon +// with a real natural-language → operationParams parser. export type Verdict = | "allow" | "deny" - | "modify" | "auto_allow_low_risk" | "auto_allow_remembered"; @@ -78,7 +86,6 @@ export interface FixProposal { export interface PermissionDecision { stepId: string; verdict: Verdict; - modifiedAction?: string; remember?: boolean; } @@ -89,7 +96,6 @@ export interface DecisionLogEntry { targetRowCount: number; proposedAction: string; userDecision: Verdict; - modifiedAction?: string; reason: string; confidence: Confidence; appliedAt?: string; diff --git a/common/config/src/main/resources/gui.conf b/common/config/src/main/resources/gui.conf index d58d94ac7b9..9bef054a404 100644 --- a/common/config/src/main/resources/gui.conf +++ b/common/config/src/main/resources/gui.conf @@ -109,7 +109,7 @@ gui { active-time-in-minutes = ${?GUI_WORKFLOW_WORKSPACE_ACTIVE_TIME_IN_MINUTES} # whether AI copilot feature is enabled - copilot-enabled = false + copilot-enabled = true copilot-enabled = ${?GUI_WORKFLOW_WORKSPACE_COPILOT_ENABLED} # the limit of columns to be displayed in the result table diff --git a/frontend/src/app/workspace/component/agent/agent-panel/agent-panel.component.ts b/frontend/src/app/workspace/component/agent/agent-panel/agent-panel.component.ts index 34f88ced675..914f7306031 100644 --- a/frontend/src/app/workspace/component/agent/agent-panel/agent-panel.component.ts +++ b/frontend/src/app/workspace/component/agent/agent-panel/agent-panel.component.ts 
@@ -21,7 +21,6 @@ import { Component, HostListener, Input, OnDestroy, OnInit, OnChanges, SimpleCha import { UntilDestroy, untilDestroyed } from "@ngneat/until-destroy"; import { NzResizeEvent, NzResizableDirective, NzResizeHandlesComponent } from "ng-zorro-antd/resizable"; import { AgentService, AgentInfo } from "../../../service/agent/agent.service"; -import { DataGuardAutoTriggerService } from "../../../service/agent/data-guard-auto-trigger.service"; import { WorkflowActionService } from "../../../service/workflow-graph/model/workflow-action.service"; import { NotificationService } from "../../../../common/service/notification/notification.service"; import { calculateTotalTranslate3d } from "../../../../common/util/panel-dock"; @@ -99,8 +98,7 @@ export class AgentPanelComponent implements OnInit, OnDestroy, OnChanges { constructor( private agentService: AgentService, private workflowActionService: WorkflowActionService, - private notificationService: NotificationService, - private dataGuardAutoTrigger: DataGuardAutoTriggerService + private notificationService: NotificationService ) {} ngOnInit(): void { @@ -128,20 +126,6 @@ export class AgentPanelComponent implements OnInit, OnDestroy, OnChanges { this.tryActivateAgentFromInput(); }); - // DataGuard auto-trigger: when a dataset-reading operator is added to the - // workflow, notify the user that DataGuard is ready to scan. Full - // integration (create agent + POST /agents/:id/dataguard/dataset + send - // initial "scan this dataset" message) is a follow-up that needs the - // referenced CSV bytes — for now we surface the trigger so the - // §5 storyboard "no manual invocation" beat is in place. - this.dataGuardAutoTrigger - .getDatasetAddedStream() - .pipe(untilDestroyed(this)) - .subscribe(op => { - this.notificationService.info( - `DataGuard ready to scan ${op.operatorType}. 
Open the chat panel and ask "scan this dataset for quality issues".` - ); - }); } ngOnChanges(changes: SimpleChanges): void { diff --git a/frontend/src/app/workspace/component/agent/agent-panel/permission-prompt/permission-prompt.component.html b/frontend/src/app/workspace/component/agent/agent-panel/permission-prompt/permission-prompt.component.html index e2544f8434d..58963b0893a 100644 --- a/frontend/src/app/workspace/component/agent/agent-panel/permission-prompt/permission-prompt.component.html +++ b/frontend/src/app/workspace/component/agent/agent-panel/permission-prompt/permission-prompt.component.html @@ -24,28 +24,15 @@ -
+
- -
- -
- -
- - -
diff --git a/frontend/src/app/workspace/component/agent/agent-panel/permission-prompt/permission-prompt.component.scss b/frontend/src/app/workspace/component/agent/agent-panel/permission-prompt/permission-prompt.component.scss index f0f32539e43..07dc1c545a3 100644 --- a/frontend/src/app/workspace/component/agent/agent-panel/permission-prompt/permission-prompt.component.scss +++ b/frontend/src/app/workspace/component/agent/agent-panel/permission-prompt/permission-prompt.component.scss @@ -45,6 +45,11 @@ color: #cf1322; } +.dg-permission__tier--warning { + background: #ffe7ba; + color: #ad4e00; +} + .dg-permission__body { margin: 8px 0; } @@ -67,12 +72,3 @@ margin-top: 8px; flex-wrap: wrap; } - -.dg-permission__modify { - margin-top: 8px; - - textarea { - width: 100%; - margin-bottom: 8px; - } -} diff --git a/frontend/src/app/workspace/component/agent/agent-panel/permission-prompt/permission-prompt.component.ts b/frontend/src/app/workspace/component/agent/agent-panel/permission-prompt/permission-prompt.component.ts index 7423d9aefb8..b5b0315dd4b 100644 --- a/frontend/src/app/workspace/component/agent/agent-panel/permission-prompt/permission-prompt.component.ts +++ b/frontend/src/app/workspace/component/agent/agent-panel/permission-prompt/permission-prompt.component.ts @@ -19,9 +19,7 @@ import { Component, Input } from "@angular/core"; import { NgIf } from "@angular/common"; -import { FormsModule } from "@angular/forms"; import { NzButtonComponent } from "ng-zorro-antd/button"; -import { NzInputDirective } from "ng-zorro-antd/input"; import { ReActStep } from "../../../../service/agent/agent-types"; import { AgentService } from "../../../../service/agent/agent.service"; @@ -34,7 +32,7 @@ import { AgentService } from "../../../../service/agent/agent.service"; @Component({ selector: "texera-permission-prompt", standalone: true, - imports: [NgIf, FormsModule, NzButtonComponent, NzInputDirective], + imports: [NgIf, NzButtonComponent], templateUrl: 
"./permission-prompt.component.html", styleUrls: ["./permission-prompt.component.scss"], }) @@ -42,8 +40,6 @@ export class PermissionPromptComponent { @Input() step!: ReActStep; @Input() agentId!: string; - public isModifying = false; - public modifiedAction = ""; public submitted = false; constructor(private readonly agentService: AgentService) {} @@ -59,22 +55,4 @@ export class PermissionPromptComponent { this.submitted = true; this.agentService.sendDecision(this.agentId, this.step.id, "deny"); } - - public openModify(): void { - if (this.submitted) return; - this.isModifying = true; - this.modifiedAction = this.step.pendingApproval?.proposal.action ?? ""; - } - - public submitModify(): void { - if (this.submitted) return; - this.submitted = true; - this.agentService.sendDecision(this.agentId, this.step.id, "modify", { - modifiedAction: this.modifiedAction, - }); - } - - public cancelModify(): void { - this.isModifying = false; - } } diff --git a/frontend/src/app/workspace/component/dataguard-checklist/dataguard-checklist.component.html b/frontend/src/app/workspace/component/dataguard-checklist/dataguard-checklist.component.html new file mode 100644 index 00000000000..f50626061a0 --- /dev/null +++ b/frontend/src/app/workspace/component/dataguard-checklist/dataguard-checklist.component.html @@ -0,0 +1,160 @@ + +
+ +
+ + + DataGuard + + {{ statusBadge() }} + +
+ + +
+ + We checked {{ scan.datasetRows }} rows in + {{ scan.datasetSource }}. + +
{{ scan.message }}
+
+ + +
+  Looking for problems in your data… +
+ + +
+ + Your data looks good — nothing to fix. +
+ + +
+ + {{ c.count }} {{ c.label }}{{ c.count === 1 ? "" : "s" }}· + +
+ + +
+ {{ selectedCount }} to fix · {{ deniedCount }} skipped + + + + +
+ + +
    +
  • +
    + + + {{ riskTierLabel(entry) }} + +
    + +
    + +
    {{ entry.proposal.reason }}
    +
    + ⚠ We couldn't suggest a fix for this one. You can skip it. +
    +
    + + +
    + + +
    +
  • +
+ + +
+ +
+ +
+ + +
+
+ + + diff --git a/frontend/src/app/workspace/component/dataguard-checklist/dataguard-checklist.component.scss b/frontend/src/app/workspace/component/dataguard-checklist/dataguard-checklist.component.scss new file mode 100644 index 00000000000..efc9146f712 --- /dev/null +++ b/frontend/src/app/workspace/component/dataguard-checklist/dataguard-checklist.component.scss @@ -0,0 +1,319 @@ +.dg-panel { + position: fixed; + bottom: 100px; + right: 80px; + width: 480px; + max-height: 70vh; + background: #fff; + border: 1px solid #d9d9d9; + border-radius: 8px; + box-shadow: 0 6px 20px rgba(0, 0, 0, 0.15); + z-index: 10; + display: flex; + flex-direction: column; + font-size: 0.85rem; +} + +.dg-panel__header { + display: flex; + align-items: center; + gap: 8px; + padding: 10px 12px; + border-bottom: 1px solid #f0f0f0; + background: #fafafa; + border-radius: 8px 8px 0 0; + // Header is the cdkDragHandle; show a move cursor so it's discoverable. + cursor: move; + + // The close button inside the header should still feel clickable, not draggable. + .dg-panel__close { + cursor: pointer; + } +} + +// While dragging, the CDK adds .cdk-drag-preview to the moving copy. 
+.dg-panel.cdk-drag-preview { + box-shadow: 0 12px 28px rgba(0, 0, 0, 0.25); +} + +.dg-panel__title { + font-weight: 600; + font-size: 0.95rem; + display: flex; + align-items: center; + gap: 6px; + flex: 1; +} + +.dg-panel__status { + font-size: 0.75rem; + color: #8c8c8c; + background: #f0f5ff; + padding: 2px 8px; + border-radius: 12px; +} + +.dg-panel__close { + margin-left: auto; +} + +.dg-panel__sub { + padding: 8px 12px; + border-bottom: 1px solid #f5f5f5; + font-size: 0.78rem; + color: #595959; + + code { + background: #f5f5f5; + padding: 1px 6px; + border-radius: 3px; + font-size: 0.75rem; + } +} + +.dg-panel__message { + margin-top: 4px; + font-style: italic; + color: #1890ff; +} + +.dg-row__category { + display: inline-block; + font-size: 0.65rem; + font-weight: 600; + text-transform: uppercase; + letter-spacing: 0.5px; + padding: 2px 8px; + border-radius: 4px; + background: #e6f4ff; + color: #003eb3; + margin-right: 6px; +} + +.dg-panel__empty { + padding: 24px; + text-align: center; + color: #8c8c8c; + + &--clean { + color: #389e0d; + } +} + +.dg-panel__categories { + padding: 6px 12px; + border-bottom: 1px solid #f5f5f5; + display: flex; + flex-wrap: wrap; + gap: 4px; + font-size: 0.74rem; + color: #003eb3; + background: #f0f8ff; +} + +.dg-panel__category-chip { + display: inline-flex; + align-items: center; + gap: 4px; + font-weight: 500; +} + +.dg-panel__category-sep { + margin: 0 2px; + color: #adc6ff; + font-weight: 400; +} + +.dg-panel__bulk { + padding: 6px 12px; + border-bottom: 1px solid #f5f5f5; + display: flex; + justify-content: space-between; + align-items: center; + font-size: 0.75rem; + color: #595959; +} + +.dg-panel__bulk-actions { + display: flex; + gap: 6px; +} + +.dg-panel__list { + list-style: none; + margin: 0; + padding: 0; + overflow-y: auto; + flex: 1; +} + +.dg-row { + padding: 10px 12px; + border-bottom: 1px solid #f5f5f5; + transition: background 0.15s; + + &:last-child { + border-bottom: none; + } + + &--allow { + background: 
#f6ffed; + } + + &--deny { + background: #fff1f0; + opacity: 0.75; + } + + &--error { + background: #fff7e6; + } +} + +.dg-row__head { + display: flex; + align-items: flex-start; + justify-content: space-between; + gap: 8px; +} + +.dg-row__action { + font-weight: 500; + margin-left: 4px; +} + +.dg-row__tier { + font-size: 0.7rem; + font-weight: 600; + text-transform: uppercase; + padding: 2px 8px; + border-radius: 4px; + letter-spacing: 0.5px; + flex-shrink: 0; + + &--low { + background: #d9f7be; + color: #389e0d; + } + + &--medium { + background: #fff1b8; + color: #d48806; + } + + &--high { + background: #ffccc7; + color: #cf1322; + } + + &--warning { + background: #ffe7ba; + color: #ad4e00; + } +} + +.dg-row__details { + margin: 6px 0 0 26px; + font-size: 0.75rem; + color: #595959; + line-height: 1.4; +} + +.dg-row__field { + margin: 2px 0; +} + +.dg-row__field--error { + color: #cf1322; +} + +// "Locate this issue in the result panel" affordance — visually a flat link +// that sits on the same line as the column/row-count detail, so users don't +// see a second control to think about. 
+.dg-row__locate { + display: inline-flex; + align-items: center; + gap: 4px; + padding: 0; + margin: 2px 0; + background: transparent; + border: none; + font: inherit; + color: #1677ff; + cursor: pointer; + text-align: left; + + i { + font-size: 0.85rem; + } + + &:hover { + text-decoration: underline; + } + + &:focus-visible { + outline: 1px dashed #1677ff; + outline-offset: 2px; + } +} + +.dg-row__foot { + margin-top: 6px; + margin-left: 22px; + display: flex; + gap: 4px; + align-items: center; +} + +.dg-row__remember { + margin-left: 8px; + font-size: 0.72rem; + color: #8c8c8c; +} + +.dg-panel__foot { + padding: 10px 12px; + border-top: 1px solid #f0f0f0; + background: #fafafa; + border-radius: 0 0 8px 8px; + text-align: right; +} + +.dg-panel__foot--split { + display: flex; + justify-content: space-between; + align-items: center; + gap: 8px; +} + +// Floating "open DataGuard" button, sits where the panel used to be. +// Lives on the canvas overlay layer so it doesn't interfere with operators. +.dg-floater { + position: fixed; + bottom: 100px; + right: 80px; + width: 44px; + height: 44px; + border-radius: 50%; + border: 1px solid #91caff; + background: #e6f4ff; + cursor: pointer; + z-index: 10; + display: flex; + align-items: center; + justify-content: center; + box-shadow: 0 4px 12px rgba(0, 0, 0, 0.12); + transition: background 0.15s, transform 0.1s; + + i { + font-size: 1.4rem; + } + + &:hover { + background: #bae0ff; + } + + &:active { + transform: scale(0.95); + } +} diff --git a/frontend/src/app/workspace/component/dataguard-checklist/dataguard-checklist.component.ts b/frontend/src/app/workspace/component/dataguard-checklist/dataguard-checklist.component.ts new file mode 100644 index 00000000000..d55d1eb16d7 --- /dev/null +++ b/frontend/src/app/workspace/component/dataguard-checklist/dataguard-checklist.component.ts @@ -0,0 +1,369 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import { Component, OnDestroy, OnInit } from "@angular/core"; +import { NgFor, NgIf } from "@angular/common"; +import { Subscription } from "rxjs"; +import { CdkDrag, CdkDragHandle } from "@angular/cdk/drag-drop"; +import { NzIconDirective } from "ng-zorro-antd/icon"; +import { NzButtonComponent } from "ng-zorro-antd/button"; +import { NzCheckboxComponent } from "ng-zorro-antd/checkbox"; +import { + DataGuardResultsService, + DataGuardScanResult, + ChecklistEntry, +} from "../../service/agent/data-guard-results.service"; +import { DataGuardAutoTriggerService } from "../../service/agent/data-guard-auto-trigger.service"; +import { DataGuardSettingsService } from "../../service/agent/data-guard-settings.service"; +import { DataGuardRowNavigatorService } from "../../service/agent/data-guard-row-navigator.service"; +import { WorkflowActionService } from "../../service/workflow-graph/model/workflow-action.service"; +import { NotificationService } from "../../../common/service/notification/notification.service"; + +/** + * Standalone DataGuard checklist panel. + * + * UX (per user request): + * - When DataGuard auto-trigger detects a dataset and runs /scan, the + * issues + proposals land in DataGuardResultsService. 
+ * - This component renders them as a checklist with per-row Allow / Deny + * pickers (low-risk pre-checked Allow). + * - Click "Apply Selected" to send the batch decision to + * /api/agents/:id/dataguard/apply-batch. The agent's chat is NOT + * involved — so the LLM can't accidentally call deleteOperator or modify + * the workflow canvas. + * + * Visibility: + * - Floating panel docked bottom-right, slides up when state ≠ "idle". + * - Collapsed via close button → state goes back to "idle". + */ +@Component({ + selector: "texera-dataguard-checklist", + standalone: true, + imports: [NgIf, NgFor, NzIconDirective, NzButtonComponent, NzCheckboxComponent, CdkDrag, CdkDragHandle], + templateUrl: "./dataguard-checklist.component.html", + styleUrls: ["./dataguard-checklist.component.scss"], +}) +export class DataGuardChecklistComponent implements OnInit, OnDestroy { + public scan: DataGuardScanResult = { + agentId: "", + state: "idle", + entries: [], + datasetSource: "", + datasetRows: 0, + datasetColumns: 0, + }; + + // Cached row-verdict tallies. Recomputed once per state push (see ngOnInit) + // rather than three times per Angular change-detection tick. Default-CD + // means every event walks the template, so a get() that filters the entries + // array fires for *each* row in *each* tick — quadratic with a fat list. + public selectedCount = 0; + public deniedCount = 0; + public pendingCount = 0; + + private sub?: Subscription; + // Owns the workflow-graph subscription that powers auto-trigger orchestration. + // Lives here (not in agent-panel) so the gate is "is the checklist mounted?" + // — the only consumer of the auto-trigger output. + private orchestrationSub?: Subscription; + + // Per-row cursor for the "📍" locate-cycle affordance. Keyed by `issueId` + // so each detector row gets an independent cursor. 
Cleared on every fresh + // scan push — different issueIds means stale keys would just become + // garbage, but a hard reset keeps memory bounded and the behaviour + // predictable. The cursor value is the index of the *next* click — + // i.e., on entry it is 0, and after navigating to indices[0] it becomes 1. + private locateCursors = new Map<string, number>(); + + constructor( + private readonly results: DataGuardResultsService, + private readonly autoTrigger: DataGuardAutoTriggerService, + private readonly settings: DataGuardSettingsService, + private readonly workflowActionService: WorkflowActionService, + private readonly rowNavigator: DataGuardRowNavigatorService, + private readonly notificationService: NotificationService + ) {} + + // ---------------- floating reopen button ---------------- + + /** Shield toggle in the toolbar gates the floater. When the user explicitly + * turns DataGuard off for this workflow, the floater must disappear too — + * otherwise the "OFF" toggle would be a lie. */ + public get shieldOn(): boolean { + const wid = this.workflowActionService.getWorkflowMetadata()?.wid; + if (wid === undefined) return true; + return this.settings.isEnabled(wid); + } + + /** Visible iff: panel is closed (idle) AND the shield toggle is on. */ + public get showFloater(): boolean { + return this.scan.state === "idle" && this.shieldOn; + } + + /** User clicked the floating DataGuard icon. Always triggers a fresh scan + * of whatever dataset operator is on the canvas — that's what the user + * picked when we asked "click behavior?". */ + public onFloaterClick(): void { + void this.autoTrigger.rescanAny(); + } + + ngOnInit(): void { + // Track the issueId set of the previous push. `updateEntry` rebuilds the + // entries array on every verdict toggle (`.map(...)`), so identity-compare + // would spuriously reset cursors mid-review. Instead, reset only when the + // *set of issueIds* changes — that's the actual "fresh scan" signal.
+ let lastIssueIdsKey: string | undefined; + this.sub = this.results.getState$().subscribe(s => { + const key = s.entries.map(e => e.issueId).join("|"); + if (key !== lastIssueIdsKey) { + this.locateCursors.clear(); + lastIssueIdsKey = key; + } + this.scan = s; + // Tally once per state push instead of three full-walks per CD tick. + let allow = 0, + deny = 0, + pending = 0; + for (const entry of s.entries) { + if (entry.verdict === "allow") allow++; + else if (entry.verdict === "deny") deny++; + else pending++; + } + this.selectedCount = allow; + this.deniedCount = deny; + this.pendingCount = pending; + }); + // Subscribe to operator-add / property-change so dropping a dataset + // operator on the canvas triggers /scan via the auto-trigger pipeline. + // Without this, the checklist never opens on its own. + this.orchestrationSub = this.autoTrigger.startOrchestration(); + } + + ngOnDestroy(): void { + this.sub?.unsubscribe(); + this.orchestrationSub?.unsubscribe(); + } + + // ---------------- visibility helpers ---------------- + + public get isOpen(): boolean { + return this.scan.state !== "idle"; + } + + // ---------------- row actions ---------------- + + public onToggleAllow(entry: ChecklistEntry, checked: boolean): void { + this.results.updateEntry(entry.issueId, { verdict: checked ? "allow" : "pending" }); + } + + public onDeny(entry: ChecklistEntry): void { + this.results.updateEntry(entry.issueId, { verdict: "deny" }); + } + + public onToggleRemember(entry: ChecklistEntry, checked: boolean): void { + this.results.updateEntry(entry.issueId, { remember: checked }); + } + + // ---------------- panel actions ---------------- + + /** Mark every pending row as Allow. Skips already-denied. */ + public onSelectAll(): void { + for (const e of this.scan.entries) { + if (e.verdict === "pending") this.results.updateEntry(e.issueId, { verdict: "allow" }); + } + } + + /** Mark every pending row as Deny. 
*/ + public onDenyAll(): void { + for (const e of this.scan.entries) { + if (e.verdict === "pending") this.results.updateEntry(e.issueId, { verdict: "deny" }); + } + } + + /** Apply the user's selection. Backend bypasses the LLM / chat entirely. */ + public async onApplySelected(): Promise<void> { + const decisions = this.scan.entries + .filter((e): e is ChecklistEntry & { verdict: "allow" | "deny" } => e.verdict !== "pending") + .map(e => ({ + issueId: e.issueId, + verdict: e.verdict, + remember: e.remember, + })); + await this.autoTrigger.applyBatch(decisions); + } + + public onClose(): void { + this.results.reset(); + } + + public isRescanning = false; + + /** "Scan again": re-runs DataGuard on the current dataset version. After + * a previous Apply created v2, a re-scan + Apply produces v3 — letting the + * user iterate when the AI missed an issue on the first pass. */ + public async onRescan(): Promise<void> { + if (this.isRescanning) return; + this.isRescanning = true; + try { + await this.autoTrigger.rescanCurrent(); + } finally { + this.isRescanning = false; + } + } + + // ---------------- show-in-result-panel ---------------- + + /** + * Tooltip text for the "📍" button. Shows "Show next affected row (i of N)" + * where i is the 1-based index that the *next* click will navigate to. Falls + * back to a static label when row indices are unknown or empty. + */ + public locateTooltip(entry: ChecklistEntry): string { + const rowIndices = entry.issue.affectedRowIndices; + if (!rowIndices || rowIndices.length === 0) return "Show this row in the result panel"; + if (rowIndices.length === 1) return "Show the affected row in the result panel"; + const cursor = this.locateCursors.get(entry.issueId) ??
0; + const next = (cursor % rowIndices.length) + 1; + return `Show next affected row (${next} of ${rowIndices.length})`; + } + + /** + * "📍" affordance: focus the source operator's result panel and jump to the + * next affected row, cycling through `affectedRowIndices` with a per-row + * cursor. Repeated clicks walk every affected row and wrap to the first. + * Length-1 rows re-emit the same navigator event so the result-panel pulse + * re-fires — the user has to *feel* that the click did something. + * + * Operator highlight + result-panel open are dispatched synchronously so the + * ResultTableFrameComponent has time to mount before we publish the row-nav + * event in a microtask — otherwise our subscriber on the table side may not + * exist yet on the first click after a panel close. + */ + public onShowInResultPanel(entry: ChecklistEntry): void { + const opId = this.scan.sourceOperatorId; + if (!opId) { + this.notificationService.warning("DataGuard: no source operator recorded for this scan."); + return; + } + const jointGraph = this.workflowActionService.getJointGraphWrapper(); + const operatorExists = this.workflowActionService + .getTexeraGraph() + .getAllOperators() + .some(op => op.operatorID === opId); + if (!operatorExists) { + this.notificationService.warning("DataGuard: the source operator was removed from the canvas."); + return; + } + + const currentlyHighlighted = jointGraph.getCurrentHighlightedOperatorIDs(); + if (!(currentlyHighlighted.length === 1 && currentlyHighlighted[0] === opId)) { + jointGraph.unhighlightOperators(...currentlyHighlighted); + jointGraph.highlightOperators(opId); + } + this.workflowActionService.openResultPanel(); + + const rowIndices = entry.issue.affectedRowIndices; + if (rowIndices === undefined) { + this.notificationService.info( + "DataGuard: opened the result panel — row indices weren't recorded for this issue." 
+ ); + return; + } + if (rowIndices.length === 0) { + this.notificationService.info( + "DataGuard: opened the result panel — no rows are affected by this issue." + ); + return; + } + // Cycle through affectedRowIndices: each click advances this row's cursor + // by one and wraps modulo length. Length-1 rows still emit so the result + // panel re-pulses on every click. Pure helper for testability. + const cursor = this.locateCursors.get(entry.issueId) ?? 0; + const step = DataGuardRowNavigatorService.nextCycleStep(rowIndices, cursor); + this.locateCursors.set(entry.issueId, step.nextCursor); + // Defer one microtask so the table frame mounts before we ask it to page. + queueMicrotask(() => + this.rowNavigator.navigate({ + operatorId: opId, + rowIndex: step.value, + column: entry.issue.column, + }) + ); + } + + // ---------------- display helpers ---------------- + + public riskTierLabel(entry: ChecklistEntry): string { + return entry.proposal?.riskTier ?? "—"; + } + + /** Human-readable category name for the row title. Maps the raw issueType + * enum to plain English so non-technical users see "Missing value" instead + * of "missing_value". */ + public categoryLabel(entry: ChecklistEntry): string { + return this.categoryLabelForType(entry.issue.issueType); + } + + /** Aggregate the current entries by category — drives the at-a-glance + * "2 Missing values · 3 Placeholder values" summary above the row list. */ + public categorySummary(): Array<{ label: string; count: number }> { + const counts = new Map<string, number>(); + for (const entry of this.scan.entries) { + const label = this.categoryLabel(entry); + counts.set(label, (counts.get(label) ??
0) + 1); + } + return Array.from(counts.entries()).map(([label, count]) => ({ label, count })); + } + + private categoryLabelForType(issueType: string): string { + switch (issueType) { + case "missing_value": + return "Missing value"; + case "placeholder_value": + return "Placeholder value"; + case "duplicate_id": + return "Duplicate row"; + case "outlier": + // The validRanges-based outlier — the earlier z-score variant was + // removed and "out_of_range" was renamed into this one. + return "Outlier"; + case "inconsistent_label": + return "Inconsistent label"; + default: + return issueType; + } + } + + public statusBadge(): string { + switch (this.scan.state) { + case "scanning": + return "Checking…"; + case "applying": + return "Fixing…"; + case "ready": + return `${this.scan.entries.length} to review`; + case "done": + return "Done"; + case "error": + return "Problem"; + default: + return ""; + } + } +} diff --git a/frontend/src/app/workspace/component/menu/menu.component.html b/frontend/src/app/workspace/component/menu/menu.component.html index a21e4d56429..8d0865b8e02 100644 --- a/frontend/src/app/workspace/component/menu/menu.component.html +++ b/frontend/src/app/workspace/component/menu/menu.component.html @@ -360,6 +360,28 @@ nzType="ellipsis"> + + + + - + -
+
Decision sent — waiting for the agent to continue.
diff --git a/frontend/src/app/workspace/component/dataguard-checklist/dataguard-checklist.component.html b/frontend/src/app/workspace/component/dataguard-checklist/dataguard-checklist.component.html index f50626061a0..4aec6974497 100644 --- a/frontend/src/app/workspace/component/dataguard-checklist/dataguard-checklist.component.html +++ b/frontend/src/app/workspace/component/dataguard-checklist/dataguard-checklist.component.html @@ -1,10 +1,38 @@ + -
+
-
+
- + DataGuard {{ statusBadge() }} @@ -15,27 +43,42 @@ class="dg-panel__close" (click)="onClose()" title="Dismiss checklist"> - +
- We checked {{ scan.datasetRows }} rows in - {{ scan.datasetSource }}. + We checked {{ scan.datasetRows }} rows in {{ scan.datasetSource }}. -
{{ scan.message }}
+
+ {{ scan.message }} +
-
-  Looking for problems in your data… +
+  Looking for problems in your data…
-
- +
+ Your data looks good — nothing to fix.
@@ -47,7 +90,11 @@ - {{ c.count }} {{ c.label }}{{ c.count === 1 ? "" : "s" }}· + {{ c.count }} {{ c.label }}{{ c.count === 1 ? "" : "s" }}·
@@ -57,13 +104,25 @@ class="dg-panel__bulk"> {{ selectedCount }} to fix · {{ deniedCount }} skipped - - + +
-
    +
    • {{ categoryLabel(entry) }} {{ entry.proposal?.action || entry.issue.description }} - - {{ riskTierLabel(entry) }} - + {{ riskTierLabel(entry) }}
@@ -90,11 +147,20 @@ class="dg-row__locate" (click)="onShowInResultPanel(entry)" [title]="locateTooltip(entry)"> - + In column {{ entry.issue.column }} · affects {{ entry.issue.affectedRowCount }} row(s) -
{{ entry.proposal.reason }}
-
+
+ {{ entry.proposal.reason }} +
+
⚠ We couldn't suggest a fix for this one. You can skip it.
@@ -107,7 +173,10 @@ nzSize="small" [disabled]="entry.verdict === 'deny' || !!entry.error" (click)="onDeny(entry)"> -  Skip +  Skip