From 5f5326f9371084ba78a39b29008639f8baa1372b Mon Sep 17 00:00:00 2001 From: eugenegujing Date: Thu, 14 May 2026 19:07:29 -0700 Subject: [PATCH 01/14] feat(agent-service): add DataGuard types and profiler First checkpoint for the DataGuard data-cleaning agent: adds the shared types contract (DataQualityIssue, FixProposal, DecisionLogEntry, AutoAllowRule, plus supporting unions) and the read-only profile_dataset scanner with four heuristics (missing, placeholder, duplicate-ID, out-of-range). DataGuard auto-launches when a dataset is added to the workflow and asks Claude-Code-style approval before applying each fix; see README_DataGuard_Texera.md. --- .../src/agent/tools/dataguard/dataset.ts | 26 ++ .../tools/dataguard/profile-dataset.test.ts | 209 ++++++++++++++++ .../agent/tools/dataguard/profile-dataset.ts | 236 ++++++++++++++++++ agent-service/src/types/dataguard.test.ts | 219 ++++++++++++++++ agent-service/src/types/dataguard.ts | 101 ++++++++ agent-service/src/types/index.ts | 1 + 6 files changed, 792 insertions(+) create mode 100644 agent-service/src/agent/tools/dataguard/dataset.ts create mode 100644 agent-service/src/agent/tools/dataguard/profile-dataset.test.ts create mode 100644 agent-service/src/agent/tools/dataguard/profile-dataset.ts create mode 100644 agent-service/src/types/dataguard.test.ts create mode 100644 agent-service/src/types/dataguard.ts diff --git a/agent-service/src/agent/tools/dataguard/dataset.ts b/agent-service/src/agent/tools/dataguard/dataset.ts new file mode 100644 index 00000000000..6d1d5fdb32a --- /dev/null +++ b/agent-service/src/agent/tools/dataguard/dataset.ts @@ -0,0 +1,26 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +// In-memory tabular dataset view shared across the DataGuard tools +// (profile_dataset / suggest_fix / apply_fix). Source-agnostic: rows can come +// from a parsed CSV, a Texera operator result, or a fixture used in tests. +export interface DatasetView { + columns: string[]; + rows: Record[]; +} diff --git a/agent-service/src/agent/tools/dataguard/profile-dataset.test.ts b/agent-service/src/agent/tools/dataguard/profile-dataset.test.ts new file mode 100644 index 00000000000..d48d828c98d --- /dev/null +++ b/agent-service/src/agent/tools/dataguard/profile-dataset.test.ts @@ -0,0 +1,209 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import { describe, expect, test } from "bun:test"; +import { profileDataset } from "./profile-dataset"; +import type { DatasetView } from "./dataset"; + +describe("profileDataset", () => { + test("clean dataset → empty issue list", () => { + const ds: DatasetView = { + columns: ["age", "name"], + rows: [ + { age: 25, name: "Alice" }, + { age: 30, name: "Bob" }, + ], + }; + expect(profileDataset(ds)).toEqual([]); + }); + + test("empty dataset → empty issue list", () => { + expect(profileDataset({ columns: [], rows: [] })).toEqual([]); + }); + + test("detects missing values per column (null + empty string)", () => { + const ds: DatasetView = { + columns: ["age", "name"], + rows: [ + { age: 25, name: "Alice" }, + { age: null, name: "Bob" }, + { age: 30, name: "" }, + ], + }; + const issues = profileDataset(ds); + const ageMiss = issues.find(i => i.issueType === "missing_value" && i.column === "age"); + const nameMiss = issues.find(i => i.issueType === "missing_value" && i.column === "name"); + expect(ageMiss).toBeDefined(); + expect(ageMiss!.affectedRowCount).toBe(1); + expect(nameMiss).toBeDefined(); + expect(nameMiss!.affectedRowCount).toBe(1); + }); + + test("treats configured missing tokens as missing", () => { + const ds: DatasetView = { + columns: ["x"], + rows: [{ x: "ok" }, { x: "N/A" }, { x: "NA" }, { x: "ok" }], + }; + const issues = profileDataset(ds); + const miss = issues.find(i => i.issueType === "missing_value"); + expect(miss).toBeDefined(); + expect(miss!.affectedRowCount).toBe(2); + }); + + test("NaN counts as missing", () => { + const ds: DatasetView = { + columns: ["x"], + rows: [{ x: 1 }, { x: Number.NaN }, { x: 3 }], + }; + const issues = profileDataset(ds); + const miss = issues.find(i => i.issueType === "missing_value"); + expect(miss).toBeDefined(); + expect(miss!.affectedRowCount).toBe(1); + }); + + test("detects 999 as 
placeholder in numeric column (default placeholder list)", () => { + const ds: DatasetView = { + columns: ["age"], + rows: [ + { age: 25 }, { age: 999 }, { age: 30 }, { age: 999 }, { age: 999 }, + ], + }; + const issues = profileDataset(ds); + const ph = issues.find(i => i.issueType === "placeholder_value"); + expect(ph).toBeDefined(); + expect(ph!.column).toBe("age"); + expect(ph!.affectedRowCount).toBe(3); + }); + + test("custom placeholder list overrides default", () => { + const ds: DatasetView = { + columns: ["status"], + rows: [ + { status: "ok" }, { status: "missing" }, { status: "ok" }, { status: "missing" }, + ], + }; + const issues = profileDataset(ds, { placeholderValues: ["missing"] }); + const ph = issues.find(i => i.issueType === "placeholder_value"); + expect(ph).toBeDefined(); + expect(ph!.affectedRowCount).toBe(2); + }); + + test("idColumn → detects duplicate IDs", () => { + const ds: DatasetView = { + columns: ["sample_id", "value"], + rows: [ + { sample_id: "S1", value: 1 }, + { sample_id: "S2", value: 2 }, + { sample_id: "S1", value: 99 }, + { sample_id: "S3", value: 3 }, + ], + }; + const issues = profileDataset(ds, { idColumn: "sample_id" }); + const dup = issues.find(i => i.issueType === "duplicate_id"); + expect(dup).toBeDefined(); + expect(dup!.column).toBe("sample_id"); + expect(dup!.affectedRowCount).toBe(2); + }); + + test("no idColumn → no duplicate_id issue even with repeated values", () => { + const ds: DatasetView = { + columns: ["x"], + rows: [{ x: 1 }, { x: 1 }, { x: 1 }], + }; + const issues = profileDataset(ds); + expect(issues.find(i => i.issueType === "duplicate_id")).toBeUndefined(); + }); + + test("validRanges → detects out-of-range values", () => { + const ds: DatasetView = { + columns: ["bmi"], + rows: [ + { bmi: 25.5 }, { bmi: 65 }, { bmi: 72 }, { bmi: 22 }, + ], + }; + const issues = profileDataset(ds, { validRanges: { bmi: { min: 10, max: 60 } } }); + const oor = issues.find(i => i.issueType === "out_of_range"); + 
expect(oor).toBeDefined(); + expect(oor!.affectedRowCount).toBe(2); + }); + + test("placeholder values are not double-counted as out_of_range", () => { + const ds: DatasetView = { + columns: ["age"], + rows: [{ age: 25 }, { age: 999 }, { age: 30 }], + }; + const issues = profileDataset(ds, { validRanges: { age: { min: 0, max: 130 } } }); + expect(issues.find(i => i.issueType === "out_of_range")).toBeUndefined(); + expect(issues.find(i => i.issueType === "placeholder_value")).toBeDefined(); + }); + + test("affectedRowIndices included when small (≤50)", () => { + const ds: DatasetView = { + columns: ["age"], + rows: [{ age: 999 }, { age: 25 }, { age: 999 }], + }; + const issues = profileDataset(ds); + const ph = issues.find(i => i.issueType === "placeholder_value")!; + expect(ph.affectedRowIndices).toEqual([0, 2]); + }); + + test("affectedRowIndices omitted for large sets (>50)", () => { + const rows = Array.from({ length: 100 }, (_, i) => ({ x: i < 60 ? null : i })); + const ds: DatasetView = { columns: ["x"], rows }; + const issues = profileDataset(ds); + const miss = issues.find(i => i.issueType === "missing_value")!; + expect(miss.affectedRowCount).toBe(60); + expect(miss.affectedRowIndices).toBeUndefined(); + }); + + test("each emitted issue has a distinct issueId", () => { + const ds: DatasetView = { + columns: ["a", "b"], + rows: [{ a: null, b: null }], + }; + const issues = profileDataset(ds); + const ids = new Set(issues.map(i => i.issueId)); + expect(ids.size).toBe(issues.length); + expect(issues.length).toBeGreaterThan(0); + }); + + test("realistic diabetes fixture surfaces 4 issue categories", () => { + // Mirrors the §5 storyboard's diabetes_messy.csv: placeholder ages, + // missing glucose, duplicate sample IDs, and biologically impossible BMI. 
+ const ds: DatasetView = { + columns: ["sample_id", "age", "glucose", "bmi", "group"], + rows: [ + { sample_id: "S1", age: 45, glucose: 110, bmi: 28, group: "A" }, + { sample_id: "S2", age: 999, glucose: null, bmi: 30, group: "A" }, + { sample_id: "S1", age: 45, glucose: 120, bmi: 27, group: "A" }, // dup ID + { sample_id: "S3", age: 50, glucose: null, bmi: 65, group: "B" }, // impossible BMI + { sample_id: "S4", age: 999, glucose: 140, bmi: 31, group: "B" }, + ], + }; + const issues = profileDataset(ds, { + idColumn: "sample_id", + validRanges: { age: { min: 0, max: 120 }, bmi: { min: 10, max: 60 } }, + }); + const kinds = new Set(issues.map(i => i.issueType)); + expect(kinds.has("placeholder_value")).toBe(true); + expect(kinds.has("missing_value")).toBe(true); + expect(kinds.has("duplicate_id")).toBe(true); + expect(kinds.has("out_of_range")).toBe(true); + }); +}); diff --git a/agent-service/src/agent/tools/dataguard/profile-dataset.ts b/agent-service/src/agent/tools/dataguard/profile-dataset.ts new file mode 100644 index 00000000000..08cd8723bd2 --- /dev/null +++ b/agent-service/src/agent/tools/dataguard/profile-dataset.ts @@ -0,0 +1,236 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +// Read-only scanner that returns DataQualityIssue[] for a dataset. +// No LLM calls — pure heuristics. Safe to auto-run on dataset-add. + +import type { DataQualityIssue } from "../../../types/dataguard"; +import type { DatasetView } from "./dataset"; + +export interface ProfileOptions { + // Column to treat as a unique identifier; duplicates flagged as duplicate_id. + idColumn?: string; + // Hard valid range per numeric column (e.g., { age: { min: 0, max: 130 } }). + validRanges?: Record; + // Sentinel values that should be treated as placeholders rather than data. + placeholderValues?: Array; + // String tokens that should be treated as missing in addition to null/undefined/NaN/"". + missingTokens?: string[]; + // Above this row count, affectedRowIndices is omitted from the issue. + maxIndicesInIssue?: number; +} + +// Placeholders are sentinel *values* that look like data but are actually +// "no value." Kept distinct from missing-tokens to avoid double-flagging +// the same cell under two issue types. +const DEFAULT_PLACEHOLDERS: Array = [ + 999, + -1, + "unknown", + "Unknown", +]; + +// Tokens that mean "no data was recorded." Empty string is always treated +// as missing without needing to be listed. 
+const DEFAULT_MISSING_TOKENS: string[] = [ + "NA", + "N/A", + "n/a", + "null", + "NULL", + "None", +]; + +const DEFAULT_MAX_INDICES_IN_ISSUE = 50; + +let issueCounter = 0; +function nextIssueId(): string { + issueCounter += 1; + return `iss-${Date.now()}-${issueCounter}`; +} + +function nowIso(): string { + return new Date().toISOString(); +} + +function isMissing(value: unknown, missingTokens: string[]): boolean { + if (value === null || value === undefined) return true; + if (typeof value === "number" && Number.isNaN(value)) return true; + if (typeof value === "string") { + if (value === "") return true; + if (missingTokens.includes(value)) return true; + } + return false; +} + +function toNumber(value: unknown): number | undefined { + if (typeof value === "number" && Number.isFinite(value)) return value; + if (typeof value === "string" && value.trim() !== "") { + const n = Number(value); + if (Number.isFinite(n)) return n; + } + return undefined; +} + +function placeholderHit( + value: unknown, + placeholders: Array +): string | number | undefined { + for (const p of placeholders) { + if (typeof p === "string" && typeof value === "string" && p === value) return p; + if (typeof p === "number") { + const n = toNumber(value); + if (n !== undefined && n === p) return p; + } + } + return undefined; +} + +function maybeIndices( + indices: number[], + cap: number +): number[] | undefined { + return indices.length <= cap ? indices : undefined; +} + +export function profileDataset( + dataset: DatasetView, + options: ProfileOptions = {} +): DataQualityIssue[] { + const placeholders = options.placeholderValues ?? DEFAULT_PLACEHOLDERS; + const missingTokens = options.missingTokens ?? DEFAULT_MISSING_TOKENS; + const indexCap = options.maxIndicesInIssue ?? 
DEFAULT_MAX_INDICES_IN_ISSUE; + const detectedAt = nowIso(); + const issues: DataQualityIssue[] = []; + + // Pre-compute placeholder hits per row/column so out_of_range can avoid + // double-counting and missing-value can avoid flagging a row that has a + // string placeholder like "N/A" twice. + const placeholderHitByColRow = new Map>(); + for (const col of dataset.columns) { + const map = new Map(); + for (let i = 0; i < dataset.rows.length; i++) { + const hit = placeholderHit(dataset.rows[i][col], placeholders); + if (hit !== undefined) map.set(i, hit); + } + placeholderHitByColRow.set(col, map); + } + + // Missing-value detector. + for (const col of dataset.columns) { + const missingIndices: number[] = []; + for (let i = 0; i < dataset.rows.length; i++) { + if (isMissing(dataset.rows[i][col], missingTokens)) missingIndices.push(i); + } + if (missingIndices.length === 0) continue; + const pct = (missingIndices.length / Math.max(dataset.rows.length, 1)) * 100; + issues.push({ + issueId: nextIssueId(), + issueType: "missing_value", + column: col, + description: `${missingIndices.length} row(s) have missing ${col}`, + evidence: `Missing: ${missingIndices.length} of ${dataset.rows.length} (${pct.toFixed(1)}%)`, + affectedRowCount: missingIndices.length, + affectedRowIndices: maybeIndices(missingIndices, indexCap), + detectedAt, + }); + } + + // Placeholder-value detector. 
+ for (const col of dataset.columns) { + const hits = placeholderHitByColRow.get(col)!; + if (hits.size === 0) continue; + const indices = Array.from(hits.keys()).sort((a, b) => a - b); + const distinctValues = Array.from(new Set(hits.values())); + issues.push({ + issueId: nextIssueId(), + issueType: "placeholder_value", + column: col, + description: `${indices.length} row(s) in ${col} contain placeholder value(s): ${distinctValues.join(", ")}`, + evidence: `Placeholder(s) ${distinctValues.join(", ")} appear ${indices.length} time(s) in ${col}.`, + affectedRowCount: indices.length, + affectedRowIndices: maybeIndices(indices, indexCap), + detectedAt, + }); + } + + // Duplicate-ID detector (only when idColumn is configured and exists). + if (options.idColumn && dataset.columns.includes(options.idColumn)) { + const idCol = options.idColumn; + const positions = new Map(); + for (let i = 0; i < dataset.rows.length; i++) { + const v = dataset.rows[i][idCol]; + if (v === null || v === undefined) continue; + const key = String(v); + const existing = positions.get(key); + if (existing) existing.push(i); + else positions.set(key, [i]); + } + const duplicateIndices: number[] = []; + const duplicateKeys: string[] = []; + for (const [key, rows] of positions) { + if (rows.length > 1) { + duplicateIndices.push(...rows); + duplicateKeys.push(key); + } + } + if (duplicateIndices.length > 0) { + duplicateIndices.sort((a, b) => a - b); + issues.push({ + issueId: nextIssueId(), + issueType: "duplicate_id", + column: idCol, + description: `${duplicateKeys.length} duplicate ID(s) in ${idCol} affecting ${duplicateIndices.length} row(s)`, + evidence: `Duplicate keys (showing up to 5): ${duplicateKeys.slice(0, 5).join(", ")}`, + affectedRowCount: duplicateIndices.length, + affectedRowIndices: maybeIndices(duplicateIndices, indexCap), + detectedAt, + }); + } + } + + // Out-of-range detector — skips rows already flagged as placeholders so we + // don't surface the same row under two issue 
types. + if (options.validRanges) { + for (const [col, range] of Object.entries(options.validRanges)) { + if (!dataset.columns.includes(col)) continue; + const placeholderHits = placeholderHitByColRow.get(col)!; + const oorIndices: number[] = []; + for (let i = 0; i < dataset.rows.length; i++) { + if (placeholderHits.has(i)) continue; + const v = toNumber(dataset.rows[i][col]); + if (v === undefined) continue; + if (v < range.min || v > range.max) oorIndices.push(i); + } + if (oorIndices.length === 0) continue; + issues.push({ + issueId: nextIssueId(), + issueType: "out_of_range", + column: col, + description: `${oorIndices.length} row(s) in ${col} fall outside [${range.min}, ${range.max}]`, + evidence: `Valid range: [${range.min}, ${range.max}]; violations: ${oorIndices.length}.`, + affectedRowCount: oorIndices.length, + affectedRowIndices: maybeIndices(oorIndices, indexCap), + detectedAt, + }); + } + } + + return issues; +} diff --git a/agent-service/src/types/dataguard.test.ts b/agent-service/src/types/dataguard.test.ts new file mode 100644 index 00000000000..c256c7e4709 --- /dev/null +++ b/agent-service/src/types/dataguard.test.ts @@ -0,0 +1,219 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +import { describe, expect, test } from "bun:test"; +import type { + AutoAllowRule, + Confidence, + DataQualityIssue, + DecisionLogEntry, + FixOperationKind, + FixProposal, + IssueType, + PermissionDecision, + RiskTier, + Verdict, +} from "./dataguard"; + +// These tests double as runtime fixtures for downstream tools. Each shape is +// instantiated with realistic data drawn from the design doc's §5 storyboard +// so that any drift in the type definitions surfaces here first. + +describe("DataGuard type shapes", () => { + test("DataQualityIssue: placeholder-value example (high-affinity, small N)", () => { + const issue: DataQualityIssue = { + issueId: "iss-1", + issueType: "placeholder_value", + column: "age", + description: "5 rows have age = 999", + evidence: "5 of 5 rows with age=999 have no other anomalies.", + affectedRowCount: 5, + affectedRowIndices: [10, 42, 77, 199, 412], + detectedAt: "2026-05-14T12:00:00.000Z", + }; + expect(issue.issueType).toBe("placeholder_value"); + expect(issue.affectedRowCount).toBe(5); + expect(issue.affectedRowIndices).toHaveLength(5); + }); + + test("DataQualityIssue: missing-value example without row indices (large N)", () => { + const issue: DataQualityIssue = { + issueId: "iss-2", + issueType: "missing_value", + column: "glucose", + description: "17 missing glucose values, 14 in Group A", + evidence: "Group A: 14 of 200 rows missing; Group B: 3 of 200 rows missing.", + affectedRowCount: 17, + detectedAt: "2026-05-14T12:00:01.000Z", + }; + expect(issue.affectedRowIndices).toBeUndefined(); + }); + + test("FixProposal: replace-value, medium risk, high confidence", () => { + const proposal: FixProposal = { + issueId: "iss-1", + action: "Replace age = 999 with NULL", + operationKind: "replace_value", + operationParams: { column: "age", match: 999, replacement: null }, + riskTier: "medium", + reason: "999 is outside the valid human-age range and appears to be a placeholder.", + evidence: "5 of 5 rows with age=999 have no 
other anomalies.", + confidence: "high", + targetRowCount: 5, + }; + expect(proposal.riskTier).toBe("medium"); + expect(proposal.operationKind).toBe("replace_value"); + expect(proposal.operationParams).toMatchObject({ column: "age", match: 999 }); + }); + + test("FixProposal: drop-rows, high risk (the storyboard 'deny' case)", () => { + const proposal: FixProposal = { + issueId: "iss-3", + action: "Drop 3 rows with BMI > 60", + operationKind: "drop_rows", + operationParams: { rowIndices: [55, 211, 433] }, + riskTier: "high", + reason: "Extreme outliers may be data-entry errors.", + evidence: "3 rows have BMI > 60 (clinical maximum ~70).", + confidence: "low", + targetRowCount: 3, + }; + expect(proposal.riskTier).toBe("high"); + }); + + test("DecisionLogEntry: allowed and applied", () => { + const entry: DecisionLogEntry = { + decisionId: "dec-1", + timestamp: "2026-05-14T12:00:30.000Z", + issueType: "placeholder_value", + targetRowCount: 5, + proposedAction: "Replace age = 999 with NULL", + userDecision: "allow", + reason: "999 outside valid age range.", + confidence: "high", + appliedAt: "2026-05-14T12:00:31.123Z", + }; + expect(entry.userDecision).toBe("allow"); + expect(entry.appliedAt).toBeDefined(); + expect(entry.modifiedAction).toBeUndefined(); + }); + + test("DecisionLogEntry: denied — no appliedAt", () => { + const entry: DecisionLogEntry = { + decisionId: "dec-2", + timestamp: "2026-05-14T12:01:00.000Z", + issueType: "outlier", + targetRowCount: 3, + proposedAction: "Drop 3 rows with BMI > 60", + userDecision: "deny", + reason: "User flagged these as meaningful clinical cases.", + confidence: "low", + }; + expect(entry.userDecision).toBe("deny"); + expect(entry.appliedAt).toBeUndefined(); + }); + + test("DecisionLogEntry: modified — carries modifiedAction", () => { + const entry: DecisionLogEntry = { + decisionId: "dec-3", + timestamp: "2026-05-14T12:02:00.000Z", + issueType: "missing_value", + targetRowCount: 17, + proposedAction: "Impute missing glucose 
with group median", + userDecision: "modify", + modifiedAction: "Flag for manual review", + reason: "Imbalance across groups makes imputation risky.", + confidence: "medium", + appliedAt: "2026-05-14T12:02:05.000Z", + }; + expect(entry.userDecision).toBe("modify"); + expect(entry.modifiedAction).toBe("Flag for manual review"); + }); + + test("AutoAllowRule: per-issue-type policy", () => { + const rule: AutoAllowRule = { + ruleId: "rule-1", + issueType: "placeholder_value", + createdAt: "2026-05-14T12:00:30.000Z", + }; + expect(rule.issueType).toBe("placeholder_value"); + }); + + test("PermissionDecision: allow with remember=true triggers a rule write", () => { + const decision: PermissionDecision = { + stepId: "step-43", + verdict: "allow", + remember: true, + }; + expect(decision.remember).toBe(true); + expect(decision.modifiedAction).toBeUndefined(); + }); + + test("PermissionDecision: modify with modifiedAction", () => { + const decision: PermissionDecision = { + stepId: "step-42", + verdict: "modify", + modifiedAction: "Flag for manual review instead of impute", + }; + expect(decision.verdict).toBe("modify"); + expect(decision.modifiedAction).toBeDefined(); + }); + + test("PermissionDecision: deny", () => { + const decision: PermissionDecision = { + stepId: "step-44", + verdict: "deny", + }; + expect(decision.verdict).toBe("deny"); + }); + + test("Literal unions accept all documented members", () => { + const risks: RiskTier[] = ["low", "medium", "high"]; + const confidences: Confidence[] = ["low", "medium", "high"]; + const issueTypes: IssueType[] = [ + "placeholder_value", + "missing_value", + "duplicate_id", + "out_of_range", + "outlier", + "inconsistent_label", + ]; + const opKinds: FixOperationKind[] = [ + "replace_value", + "drop_rows", + "impute", + "flag", + "standardize", + "trim_whitespace", + "rename_column", + ]; + const verdicts: Verdict[] = [ + "allow", + "deny", + "modify", + "auto_allow_low_risk", + "auto_allow_remembered", + ]; + 
expect(risks).toHaveLength(3); + expect(confidences).toHaveLength(3); + expect(issueTypes).toHaveLength(6); + expect(opKinds).toHaveLength(7); + expect(verdicts).toHaveLength(5); + }); +}); diff --git a/agent-service/src/types/dataguard.ts b/agent-service/src/types/dataguard.ts new file mode 100644 index 00000000000..413e2e37814 --- /dev/null +++ b/agent-service/src/types/dataguard.ts @@ -0,0 +1,101 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +// Shared types for the DataGuard agent: the contract between the four +// DataGuard tools (profile_dataset / suggest_fix / apply_fix / write_decision_log), +// the agent's permission-gating layer, and the chat-panel approval UI. 
+ +export type RiskTier = "low" | "medium" | "high"; + +export type Confidence = "low" | "medium" | "high"; + +export type IssueType = + | "placeholder_value" + | "missing_value" + | "duplicate_id" + | "out_of_range" + | "outlier" + | "inconsistent_label"; + +export type FixOperationKind = + | "replace_value" + | "drop_rows" + | "impute" + | "flag" + | "standardize" + | "trim_whitespace" + | "rename_column"; + +export type Verdict = + | "allow" + | "deny" + | "modify" + | "auto_allow_low_risk" + | "auto_allow_remembered"; + +export interface DataQualityIssue { + issueId: string; + issueType: IssueType; + column: string; + description: string; + evidence: string; + affectedRowCount: number; + // Present only when the affected set is small enough to enumerate; otherwise + // omit and rely on `evidence` for a sample / aggregate description. + affectedRowIndices?: number[]; + detectedAt: string; +} + +export interface FixProposal { + issueId: string; + action: string; + operationKind: FixOperationKind; + operationParams: Record; + riskTier: RiskTier; + reason: string; + evidence: string; + confidence: Confidence; + targetRowCount: number; +} + +export interface PermissionDecision { + stepId: string; + verdict: Verdict; + modifiedAction?: string; + remember?: boolean; +} + +export interface DecisionLogEntry { + decisionId: string; + timestamp: string; + issueType: IssueType; + targetRowCount: number; + proposedAction: string; + userDecision: Verdict; + modifiedAction?: string; + reason: string; + confidence: Confidence; + appliedAt?: string; +} + +export interface AutoAllowRule { + ruleId: string; + issueType: IssueType; + createdAt: string; +} diff --git a/agent-service/src/types/index.ts b/agent-service/src/types/index.ts index c6d7291e51d..1fd76593d69 100644 --- a/agent-service/src/types/index.ts +++ b/agent-service/src/types/index.ts @@ -20,3 +20,4 @@ export * from "./workflow"; export * from "./execution"; export * from "./agent"; +export * from "./dataguard"; From 
076b708bc1eba75fd40d6f9cee28fb2a581fcaa4 Mon Sep 17 00:00:00 2001 From: eugenegujing Date: Thu, 14 May 2026 19:34:15 -0700 Subject: [PATCH 02/14] feat(agent-service): complete DataGuard backend with permission gating Finishes the agent-side DataGuard MVP: LLM-driven suggest_fix, mutating apply_fix, the requestApproval/awaitDecision/resolveDecision gating layer on TexeraAgent (pendingApproval step + WS decision message + auto-allow "remember" rules), write_decision_log (RFC-4180 CSV), bias-check (per-group retention), and a ~50-row polluted diabetes demo CSV. 122 new test cases, all four DataGuard tools registered into createTools(). --- agent-service/demo/README.md | 40 +++ agent-service/demo/diabetes_messy.csv | 51 ++++ agent-service/src/agent/texera-agent.ts | 128 +++++++- .../agent/tools/dataguard/apply-fix.test.ts | 280 ++++++++++++++++++ .../src/agent/tools/dataguard/apply-fix.ts | 205 +++++++++++++ .../agent/tools/dataguard/bias-check.test.ts | 106 +++++++ .../src/agent/tools/dataguard/bias-check.ts | 131 ++++++++ .../tools/dataguard/dataguard-session.test.ts | 119 ++++++++ .../tools/dataguard/dataguard-session.ts | 151 ++++++++++ .../agent/tools/dataguard/dataguard-tools.ts | 190 ++++++++++++ .../tools/dataguard/decision-log.test.ts | 98 ++++++ .../src/agent/tools/dataguard/decision-log.ts | 89 ++++++ .../agent/tools/dataguard/suggest-fix.test.ts | 150 ++++++++++ .../src/agent/tools/dataguard/suggest-fix.ts | 137 +++++++++ .../tools/dataguard/with-approval.test.ts | 137 +++++++++ .../agent/tools/dataguard/with-approval.ts | 57 ++++ agent-service/src/server.ts | 65 +++- agent-service/src/types/agent.ts | 11 + agent-service/src/types/dataguard.test.ts | 2 + agent-service/src/types/dataguard.ts | 1 + 20 files changed, 2145 insertions(+), 3 deletions(-) create mode 100644 agent-service/demo/README.md create mode 100644 agent-service/demo/diabetes_messy.csv create mode 100644 agent-service/src/agent/tools/dataguard/apply-fix.test.ts create mode 100644 
agent-service/src/agent/tools/dataguard/apply-fix.ts create mode 100644 agent-service/src/agent/tools/dataguard/bias-check.test.ts create mode 100644 agent-service/src/agent/tools/dataguard/bias-check.ts create mode 100644 agent-service/src/agent/tools/dataguard/dataguard-session.test.ts create mode 100644 agent-service/src/agent/tools/dataguard/dataguard-session.ts create mode 100644 agent-service/src/agent/tools/dataguard/dataguard-tools.ts create mode 100644 agent-service/src/agent/tools/dataguard/decision-log.test.ts create mode 100644 agent-service/src/agent/tools/dataguard/decision-log.ts create mode 100644 agent-service/src/agent/tools/dataguard/suggest-fix.test.ts create mode 100644 agent-service/src/agent/tools/dataguard/suggest-fix.ts create mode 100644 agent-service/src/agent/tools/dataguard/with-approval.test.ts create mode 100644 agent-service/src/agent/tools/dataguard/with-approval.ts diff --git a/agent-service/demo/README.md b/agent-service/demo/README.md new file mode 100644 index 00000000000..968987a0703 --- /dev/null +++ b/agent-service/demo/README.md @@ -0,0 +1,40 @@ +# DataGuard demo dataset + +`diabetes_messy.csv` — a deliberately polluted ~50-row sample modeled on the +UCI Pima Indians Diabetes dataset. Each row is one subject; the rightmost +column (`diabetic_outcome`) is the label. 
+
+## Injected issues (every issue type DataGuard detects)
+
+| # | Issue type | Rows | What's wrong |
+|---|---|---|---|
+| 1 | `placeholder_value` | S004, S012 | `age = 999` (sentinel) |
+| 2 | `placeholder_value` | S043 | `bmi = -1` (sentinel) |
+| 3 | `placeholder_value` | S047 | `age = "Unknown"` (string sentinel) |
+| 4 | `missing_value` | S005, S007, S009, S014 | empty `glucose` in Group A (imbalanced) |
+| 5 | `missing_value` | S045, S046, S048 | `age = "N/A"`, `glucose = " "`, `glucose = "null"` |
+| 6 | `duplicate_id` | S001, S017 | sample_id repeats with conflicting `diabetic_outcome` |
+| 7 | `out_of_range` | S041, S042, S044 | `bmi > 60` (clinical max ~70 — possibly real, possibly error) |
+
+## How to load into DataGuard
+
+```bash
+# After bun install + bun run dev in agent-service/
+# Replace <agent-id> with the id of a running agent.
+curl -X POST http://localhost:8000/api/agents/<agent-id>/dataguard/dataset \
+  -H "Content-Type: application/json" \
+  -d "$(jq -nR --rawfile c diabetes_messy.csv '
+    ($c | split("\n") | map(select(length > 0))) as $lines
+    | ($lines[0] | split(",")) as $cols
+    | {
+        columns: $cols,
+        rows: ($lines[1:] | map(
+          split(",") as $vals
+          | reduce range(0; $cols | length) as $i ({}; . + { ($cols[$i]): $vals[$i] })
+        ))
+      }')"
+```
+
+Or via the demo script (Step 13's frontend auto-trigger handles this in the
+real flow — once a `CSVFileScan` operator is added that references this file,
+DataGuard auto-launches).
+
+## Bias-check expectation
+
+Group A: 26 rows. Group B: 24 rows. After cleaning, missingness imbalance
+(more empty `glucose` cells in A: 4 of 26 vs 2 of 24) means naively dropping
+incomplete rows loses ~15% of A but only ~8% of B — DataGuard surfaces this
+and recommends `flag` instead of `impute` for the missing-glucose issue (the
+§5 storyboard "Modify" beat).
diff --git a/agent-service/demo/diabetes_messy.csv b/agent-service/demo/diabetes_messy.csv new file mode 100644 index 00000000000..12115816b1a --- /dev/null +++ b/agent-service/demo/diabetes_messy.csv @@ -0,0 +1,51 @@ +sample_id,age,glucose,bmi,blood_pressure,group,diabetic_outcome +S001,45,110,28.1,80,A,0 +S002,52,140,30.5,85,A,1 +S003,38,95,24.0,70,A,0 +S004,999,130,29.8,82,A,1 +S005,41,,27.5,78,A,0 +S006,47,125,31.2,80,A,1 +S007,55,,33.0,90,A,1 +S008,49,118,28.9,82,A,0 +S009,42,,26.7,75,A,0 +S010,53,135,29.1,88,A,1 +S011,46,108,27.3,79,A,0 +S012,999,142,30.8,87,A,1 +S013,43,112,26.4,76,A,0 +S014,51,,29.5,84,A,1 +S015,48,120,28.6,81,A,0 +S016,40,98,25.2,72,A,0 +S017,57,148,32.7,92,A,1 +S018,44,115,27.8,79,A,0 +S019,50,128,30.0,85,A,1 +S020,39,102,24.9,73,A,0 +S021,46,113,28.4,80,B,0 +S022,54,138,31.5,88,B,1 +S023,42,108,27.0,77,B,0 +S024,48,122,29.3,82,B,1 +S025,45,116,28.7,79,B,0 +S026,52,134,30.6,86,B,1 +S027,41,105,25.8,74,B,0 +S028,49,124,29.0,83,B,1 +S029,47,118,28.2,81,B,0 +S030,43,110,26.9,76,B,0 +S031,55,145,32.1,90,B,1 +S032,46,114,28.5,80,B,0 +S033,50,130,30.2,85,B,1 +S034,38,72,23.5,68,B,0 +S035,53,140,31.0,87,B,1 +S036,44,112,27.6,78,B,0 +S037,51,132,30.4,86,B,1 +S038,42,107,26.3,75,B,0 +S039,48,121,28.8,82,B,1 +S040,45,115,27.9,80,B,0 +S001,45,110,28.1,80,A,1 +S017,57,148,32.7,92,A,0 +S041,62,180,67.5,95,A,1 +S042,58,165,65.2,93,B,1 +S043,49,128,-1,82,A,0 +S044,46,112,72.0,84,B,1 +S045,N/A,118,28.7,80,A,0 +S046,44, ,26.9,76,B,0 +S047,Unknown,124,29.2,83,A,1 +S048,50,null,28.8,81,B,0 diff --git a/agent-service/src/agent/texera-agent.ts b/agent-service/src/agent/texera-agent.ts index 37eb12d8688..f70cbadc358 100644 --- a/agent-service/src/agent/texera-agent.ts +++ b/agent-service/src/agent/texera-agent.ts @@ -51,6 +51,16 @@ import { assembleContext } from "./util/context-utils"; import { compileWorkflowAsync, type WorkflowCompilationResponse } from "../api/compile-api"; import { createLogger } from "../logger"; import type { Logger } from "pino"; 
+import type { + FixProposal, + IssueType, + PermissionDecision, +} from "../types/dataguard"; +import { DataGuardSession } from "./tools/dataguard/dataguard-session"; +import type { ApprovalGateway } from "./tools/dataguard/with-approval"; +import type { LlmCallFn } from "./tools/dataguard/suggest-fix"; +import { createDataGuardTools } from "./tools/dataguard/dataguard-tools"; +import type { DatasetView } from "./tools/dataguard/dataset"; const PERSIST_DEBOUNCE_MS = 500; @@ -80,8 +90,12 @@ type ReActStepCallback = (step: ReActStep) => void; * (`WorkflowResultState`), and the tool surface exposed to the LLM. Each call * to `sendMessage` drives one multi-step generation via the Vercel AI SDK, * streaming step updates to subscribed websockets. + * + * Also implements `ApprovalGateway` — DataGuard's mutating tools call + * `requestApproval(this, …)` to pause the ReAct loop until the user clicks + * Allow / Deny / Modify in the chat panel. */ -export class TexeraAgent { +export class TexeraAgent implements ApprovalGateway { readonly agentId: string; readonly agentName: string; readonly modelType: string; @@ -125,6 +139,11 @@ export class TexeraAgent { private log: Logger; + // DataGuard state — see agent/tools/dataguard/ + private readonly dataGuardSession = new DataGuardSession(); + private pendingDecisions: Map void> = new Map(); + private decidedBuffer: Map = new Map(); + constructor(config: TexeraAgentConfig) { this.agentId = config.agentId; this.agentName = config.agentName || `Agent-${config.agentId}`; @@ -228,6 +247,24 @@ export class TexeraAgent { ); } + // DataGuard tools — read-only profile/suggest plus permission-gated apply. 
+ const llmCall: LlmCallFn = async (prompt: string) => { + const { text } = await generateText({ + model: this.model, + prompt, + temperature: 0.2, + }); + return text; + }; + Object.assign( + tools, + createDataGuardTools({ + session: this.dataGuardSession, + gateway: this, + llmCall, + }) + ); + return tools; } @@ -312,7 +349,9 @@ export class TexeraAgent { this.stepCallback = callback; } - private generateStepId(): string { + // Public because DataGuard's permission-gating layer needs a fresh step id + // for a pending-approval step before any AI SDK step has been minted. + public generateStepId(): string { return `step-${this.agentId}-${++this.stepCounter}-${Date.now()}`; } @@ -823,6 +862,83 @@ export class TexeraAgent { return relevantSteps; } + // ============================================================ + // DataGuard / ApprovalGateway + // ============================================================ + + public getDataGuardSession(): DataGuardSession { + return this.dataGuardSession; + } + + public setDataGuardDataset(dataset: DatasetView): void { + this.dataGuardSession.setDataset(dataset); + } + + // ApprovalGateway: does this issueType have a standing "remember" rule? + public matchesAutoAllowRule(issueType: IssueType): boolean { + return this.dataGuardSession.matchesAutoAllowRule(issueType); + } + + // ApprovalGateway: append a pending-approval step into the history and + // broadcast it through the existing stepCallback so the chat panel renders + // the prompt UI. + public emitPendingApproval(stepId: string, proposal: FixProposal): void { + const messageId = this.currentMessageId ?? 
""; + const wf = this.workflowState.getWorkflowContent(); + const step: ReActStep = { + id: stepId, + parentId: this.head, + messageId, + stepId: -1, + timestamp: Date.now(), + role: "agent", + content: proposal.action, + isBegin: false, + isEnd: false, + pendingApproval: { + toolName: "apply_fix", + proposal, + riskTier: proposal.riskTier, + }, + beforeWorkflowContent: wf, + afterWorkflowContent: wf, + }; + this.addStep(step); + this.head = stepId; + } + + // ApprovalGateway: wait for the user's decision. Resolves when the server + // receives a WS {type:"decision", stepId, …} message and calls resolveDecision. + public awaitDecision(stepId: string): Promise { + const buffered = this.decidedBuffer.get(stepId); + if (buffered) { + this.decidedBuffer.delete(stepId); + return Promise.resolve(buffered); + } + return new Promise(resolve => { + this.pendingDecisions.set(stepId, resolve); + }); + } + + // Called from the WS handler when the user clicks Allow / Deny / Modify. + // Returns true if a waiting tool was unblocked, false if buffered for later. + public resolveDecision(stepId: string, decision: PermissionDecision): boolean { + if (decision.remember) { + // The "Allow & don't ask again" verdict also writes an auto-allow rule. + const proposal = this.dataGuardSession.getProposal(extractIssueIdFromStep(this.stepsById, stepId)); + if (proposal) this.dataGuardSession.addAutoAllowRule(proposal.issueType); + } + const resolver = this.pendingDecisions.get(stepId); + if (resolver) { + this.pendingDecisions.delete(stepId); + resolver(decision); + return true; + } + // Decision arrived before the tool started awaiting — buffer it. 
+ this.decidedBuffer.set(stepId, decision); + return false; + } + destroy(): void { if (this.workflowChangeSubscription) { this.workflowChangeSubscription.unsubscribe(); @@ -838,3 +954,11 @@ export class TexeraAgent { this.currentMessageId = undefined; } } + +function extractIssueIdFromStep( + steps: Map, + stepId: string +): string { + const step = steps.get(stepId); + return step?.pendingApproval?.proposal.issueId ?? ""; +} diff --git a/agent-service/src/agent/tools/dataguard/apply-fix.test.ts b/agent-service/src/agent/tools/dataguard/apply-fix.test.ts new file mode 100644 index 00000000000..1546102b4dd --- /dev/null +++ b/agent-service/src/agent/tools/dataguard/apply-fix.test.ts @@ -0,0 +1,280 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +import { describe, expect, test } from "bun:test"; +import { applyFix } from "./apply-fix"; +import type { DatasetView } from "./dataset"; +import type { FixProposal } from "../../../types/dataguard"; + +function makeProposal(overrides: Partial = {}): FixProposal { + return { + issueId: "iss-test", + issueType: "placeholder_value", + action: "test action", + operationKind: "replace_value", + operationParams: {}, + riskTier: "medium", + reason: "test", + evidence: "test", + confidence: "high", + targetRowCount: 0, + ...overrides, + }; +} + +describe("applyFix", () => { + test("replace_value: swaps matching cells, leaves rest", () => { + const ds: DatasetView = { + columns: ["age"], + rows: [{ age: 25 }, { age: 999 }, { age: 30 }, { age: 999 }], + }; + const result = applyFix( + ds, + makeProposal({ + operationKind: "replace_value", + operationParams: { column: "age", match: 999, replacement: null }, + }) + ); + expect(result.rowsAffected).toBe(2); + expect(result.dataset.rows[0].age).toBe(25); + expect(result.dataset.rows[1].age).toBe(null); + expect(result.dataset.rows[3].age).toBe(null); + }); + + test("replace_value: original dataset is not mutated", () => { + const ds: DatasetView = { + columns: ["age"], + rows: [{ age: 999 }, { age: 30 }], + }; + const before = JSON.stringify(ds); + applyFix( + ds, + makeProposal({ + operationKind: "replace_value", + operationParams: { column: "age", match: 999, replacement: null }, + }) + ); + expect(JSON.stringify(ds)).toBe(before); + }); + + test("drop_rows: removes rows at given indices", () => { + const ds: DatasetView = { + columns: ["x"], + rows: [{ x: 0 }, { x: 1 }, { x: 2 }, { x: 3 }], + }; + const result = applyFix( + ds, + makeProposal({ + operationKind: "drop_rows", + operationParams: { rowIndices: [1, 3] }, + }) + ); + expect(result.rowsAffected).toBe(2); + expect(result.dataset.rows).toHaveLength(2); + expect(result.dataset.rows[0].x).toBe(0); + expect(result.dataset.rows[1].x).toBe(2); + }); + + 
test("impute mean: fills missing with column mean", () => { + const ds: DatasetView = { + columns: ["v"], + rows: [{ v: 10 }, { v: 20 }, { v: null }, { v: 30 }, { v: null }], + }; + const result = applyFix( + ds, + makeProposal({ + operationKind: "impute", + operationParams: { column: "v", strategy: "mean" }, + }) + ); + expect(result.rowsAffected).toBe(2); + expect(result.dataset.rows[2].v).toBe(20); + expect(result.dataset.rows[4].v).toBe(20); + }); + + test("impute median (odd count): fills missing with middle value", () => { + const ds: DatasetView = { + columns: ["v"], + rows: [{ v: 1 }, { v: 3 }, { v: null }, { v: 100 }], + }; + // Non-missing values [1, 3, 100], sorted → median = 3 + const result = applyFix( + ds, + makeProposal({ + operationKind: "impute", + operationParams: { column: "v", strategy: "median" }, + }) + ); + expect(result.dataset.rows[2].v).toBe(3); + }); + + test("impute median (even count): fills missing with mean of two middle", () => { + const ds: DatasetView = { + columns: ["v"], + rows: [{ v: 1 }, { v: 3 }, { v: null }, { v: 5 }, { v: 100 }], + }; + // Non-missing values [1, 3, 5, 100], sorted → (3 + 5) / 2 = 4 + const result = applyFix( + ds, + makeProposal({ + operationKind: "impute", + operationParams: { column: "v", strategy: "median" }, + }) + ); + expect(result.dataset.rows[2].v).toBe(4); + }); + + test("impute mode: fills missing with most common string", () => { + const ds: DatasetView = { + columns: ["c"], + rows: [ + { c: "A" }, { c: "A" }, { c: "B" }, { c: null }, { c: "" }, + ], + }; + const result = applyFix( + ds, + makeProposal({ + operationKind: "impute", + operationParams: { column: "c", strategy: "mode" }, + }) + ); + expect(result.rowsAffected).toBe(2); + expect(result.dataset.rows[3].c).toBe("A"); + expect(result.dataset.rows[4].c).toBe("A"); + }); + + test("flag: does not mutate rows, populates flaggedRows", () => { + const ds: DatasetView = { + columns: ["x"], + rows: [{ x: 1 }, { x: 2 }, { x: 3 }], + }; + const 
result = applyFix( + ds, + makeProposal({ + operationKind: "flag", + operationParams: { rowIndices: [0, 2] }, + }) + ); + expect(result.rowsAffected).toBe(2); + expect(result.flaggedRows).toEqual([0, 2]); + expect(result.dataset.rows[0].x).toBe(1); + expect(result.dataset.rows[2].x).toBe(3); + }); + + test("trim_whitespace: trims string cells in target column", () => { + const ds: DatasetView = { + columns: ["name"], + rows: [{ name: " Alice " }, { name: "Bob" }, { name: "\tCharlie\n" }], + }; + const result = applyFix( + ds, + makeProposal({ + operationKind: "trim_whitespace", + operationParams: { column: "name" }, + }) + ); + expect(result.rowsAffected).toBe(2); + expect(result.dataset.rows[0].name).toBe("Alice"); + expect(result.dataset.rows[1].name).toBe("Bob"); + expect(result.dataset.rows[2].name).toBe("Charlie"); + }); + + test("standardize: maps values per mapping dict", () => { + const ds: DatasetView = { + columns: ["yn"], + rows: [{ yn: "Y" }, { yn: "yes" }, { yn: "n" }, { yn: "N" }, { yn: "unknown" }], + }; + const result = applyFix( + ds, + makeProposal({ + operationKind: "standardize", + operationParams: { + column: "yn", + mapping: { Y: "yes", N: "no", n: "no" }, + }, + }) + ); + expect(result.rowsAffected).toBe(3); + expect(result.dataset.rows[0].yn).toBe("yes"); + expect(result.dataset.rows[1].yn).toBe("yes"); // unchanged (no mapping) + expect(result.dataset.rows[2].yn).toBe("no"); + expect(result.dataset.rows[3].yn).toBe("no"); + expect(result.dataset.rows[4].yn).toBe("unknown"); + }); + + test("rename_column: updates columns array and per-row keys", () => { + const ds: DatasetView = { + columns: ["sample_id", "value"], + rows: [{ sample_id: "S1", value: 1 }, { sample_id: "S2", value: 2 }], + }; + const result = applyFix( + ds, + makeProposal({ + operationKind: "rename_column", + operationParams: { from: "sample_id", to: "subjectId" }, + }) + ); + expect(result.dataset.columns).toEqual(["subjectId", "value"]); + 
expect(result.dataset.rows[0].subjectId).toBe("S1"); + expect(result.dataset.rows[0].sample_id).toBeUndefined(); + }); + + test("empty dataset: returns empty dataset and zero rowsAffected", () => { + const ds: DatasetView = { columns: [], rows: [] }; + const result = applyFix( + ds, + makeProposal({ + operationKind: "replace_value", + operationParams: { column: "x", match: 1, replacement: 0 }, + }) + ); + expect(result.rowsAffected).toBe(0); + expect(result.dataset.rows).toEqual([]); + }); + + test("unknown operationKind: throws", () => { + const bad = makeProposal({ + operationKind: "nuke_database" as unknown as FixProposal["operationKind"], + operationParams: {}, + }); + expect(() => applyFix({ columns: [], rows: [] }, bad)).toThrow(/unknown operationKind/); + }); + + test("realistic diabetes flow: replace age=999 with NULL leaves other columns intact", () => { + const ds: DatasetView = { + columns: ["sample_id", "age", "glucose"], + rows: [ + { sample_id: "S1", age: 45, glucose: 110 }, + { sample_id: "S2", age: 999, glucose: 130 }, + { sample_id: "S3", age: 999, glucose: 140 }, + ], + }; + const result = applyFix( + ds, + makeProposal({ + operationKind: "replace_value", + operationParams: { column: "age", match: 999, replacement: null }, + }) + ); + expect(result.rowsAffected).toBe(2); + expect(result.dataset.rows[1].age).toBeNull(); + expect(result.dataset.rows[1].glucose).toBe(130); + expect(result.dataset.rows[1].sample_id).toBe("S2"); + }); +}); diff --git a/agent-service/src/agent/tools/dataguard/apply-fix.ts b/agent-service/src/agent/tools/dataguard/apply-fix.ts new file mode 100644 index 00000000000..77d2c0f85f4 --- /dev/null +++ b/agent-service/src/agent/tools/dataguard/apply-fix.ts @@ -0,0 +1,205 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +// Applies one approved FixProposal to an in-memory DatasetView. +// Pure function: returns a new dataset; never mutates the input. +// This is the mutating boundary of DataGuard — every call here must have been +// authorized through the permission scaffolding (see with-approval.ts). + +import type { FixProposal } from "../../../types/dataguard"; +import type { DatasetView } from "./dataset"; + +export interface ApplyResult { + dataset: DatasetView; + rowsAffected: number; + flaggedRows: number[]; +} + +export function applyFix(dataset: DatasetView, proposal: FixProposal): ApplyResult { + const rows = dataset.rows.map(r => ({ ...r })); + let columns = [...dataset.columns]; + const params = proposal.operationParams; + + switch (proposal.operationKind) { + case "replace_value": { + const column = params.column as string; + const match = params.match; + const replacement = params.replacement; + let affected = 0; + for (const r of rows) { + if (cellEquals(r[column], match)) { + r[column] = replacement; + affected++; + } + } + return { dataset: { columns, rows }, rowsAffected: affected, flaggedRows: [] }; + } + + case "drop_rows": { + const drop = new Set(params.rowIndices as number[]); + const kept = rows.filter((_, i) => !drop.has(i)); + return { + dataset: { columns, rows: kept }, + rowsAffected: rows.length - kept.length, + flaggedRows: [], + }; + } + + case "impute": { + const 
column = params.column as string; + const strategy = params.strategy as "mean" | "median" | "mode"; + const fill = computeImputeValue(rows, column, strategy); + let affected = 0; + for (const r of rows) { + if (isMissing(r[column])) { + r[column] = fill; + affected++; + } + } + return { dataset: { columns, rows }, rowsAffected: affected, flaggedRows: [] }; + } + + case "flag": { + const indices = (params.rowIndices as number[]).slice(); + return { + dataset: { columns, rows }, + rowsAffected: indices.length, + flaggedRows: indices, + }; + } + + case "trim_whitespace": { + const column = params.column as string; + let affected = 0; + for (const r of rows) { + const v = r[column]; + if (typeof v === "string") { + const trimmed = v.trim(); + if (trimmed !== v) { + r[column] = trimmed; + affected++; + } + } + } + return { dataset: { columns, rows }, rowsAffected: affected, flaggedRows: [] }; + } + + case "standardize": { + const column = params.column as string; + const mapping = params.mapping as Record; + let affected = 0; + for (const r of rows) { + const v = r[column]; + if (typeof v === "string" && Object.prototype.hasOwnProperty.call(mapping, v)) { + r[column] = mapping[v]; + affected++; + } + } + return { dataset: { columns, rows }, rowsAffected: affected, flaggedRows: [] }; + } + + case "rename_column": { + const from = params.from as string; + const to = params.to as string; + columns = columns.map(c => (c === from ? 
to : c)); + let affected = 0; + for (const r of rows) { + if (Object.prototype.hasOwnProperty.call(r, from)) { + r[to] = r[from]; + delete r[from]; + affected++; + } + } + return { dataset: { columns, rows }, rowsAffected: affected, flaggedRows: [] }; + } + + default: + throw new Error( + `apply_fix: unknown operationKind: ${(proposal as unknown as { operationKind: string }).operationKind}` + ); + } +} + +function cellEquals(a: unknown, b: unknown): boolean { + if (a === b) return true; + if (typeof a === "number" && typeof b === "number" && Number.isNaN(a) && Number.isNaN(b)) { + return true; + } + return false; +} + +function isMissing(v: unknown): boolean { + if (v === null || v === undefined) return true; + if (typeof v === "number" && Number.isNaN(v)) return true; + if (typeof v === "string" && v === "") return true; + return false; +} + +function computeImputeValue( + rows: Record[], + column: string, + strategy: "mean" | "median" | "mode" +): unknown { + const numericValues: number[] = []; + const stringCounts = new Map(); + for (const r of rows) { + const v = r[column]; + if (isMissing(v)) continue; + if (typeof v === "number" && Number.isFinite(v)) { + numericValues.push(v); + } else if (typeof v === "string") { + stringCounts.set(v, (stringCounts.get(v) ?? 0) + 1); + } + } + + if (strategy === "mean") { + if (numericValues.length === 0) return null; + return numericValues.reduce((s, n) => s + n, 0) / numericValues.length; + } + if (strategy === "median") { + if (numericValues.length === 0) return null; + const sorted = [...numericValues].sort((a, b) => a - b); + const mid = Math.floor(sorted.length / 2); + return sorted.length % 2 === 0 ? 
(sorted[mid - 1] + sorted[mid]) / 2 : sorted[mid]; + } + // mode: prefer strings if any non-missing strings exist; else fall back to numbers + if (stringCounts.size > 0) { + let mode = ""; + let max = -1; + for (const [k, count] of stringCounts) { + if (count > max) { + max = count; + mode = k; + } + } + return mode; + } + if (numericValues.length === 0) return null; + const numCounts = new Map(); + for (const n of numericValues) numCounts.set(n, (numCounts.get(n) ?? 0) + 1); + let mode = numericValues[0]; + let max = -1; + for (const [k, count] of numCounts) { + if (count > max) { + max = count; + mode = k; + } + } + return mode; +} diff --git a/agent-service/src/agent/tools/dataguard/bias-check.test.ts b/agent-service/src/agent/tools/dataguard/bias-check.test.ts new file mode 100644 index 00000000000..04755ba8558 --- /dev/null +++ b/agent-service/src/agent/tools/dataguard/bias-check.test.ts @@ -0,0 +1,106 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +import { describe, expect, test } from "bun:test"; +import { computeBiasCheck } from "./bias-check"; +import type { DatasetView } from "./dataset"; + +describe("computeBiasCheck", () => { + test("identical before/after → 100% retention, no skew", () => { + const ds: DatasetView = { + columns: ["group"], + rows: [{ group: "A" }, { group: "A" }, { group: "B" }, { group: "B" }], + }; + const r = computeBiasCheck(ds, ds, "group"); + expect(r.skewDetected).toBe(false); + expect(r.perGroup.A.retentionPct).toBe(100); + expect(r.perGroup.B.retentionPct).toBe(100); + }); + + test("flags skew when one group loses much more than another", () => { + const before: DatasetView = { + columns: ["group"], + rows: [ + { group: "A" }, { group: "A" }, { group: "A" }, { group: "A" }, { group: "A" }, + { group: "B" }, { group: "B" }, { group: "B" }, { group: "B" }, { group: "B" }, + ], + }; + const after: DatasetView = { + columns: ["group"], + rows: [ + { group: "A" }, + { group: "B" }, { group: "B" }, { group: "B" }, { group: "B" }, { group: "B" }, + ], + }; + // A retains 20%, B retains 100% → 80-point gap → skew + const r = computeBiasCheck(before, after, "group"); + expect(r.skewDetected).toBe(true); + expect(r.perGroup.A.retentionPct).toBe(20); + expect(r.perGroup.B.retentionPct).toBe(100); + }); + + test("balanced cleanup (5%/4% loss across groups) → no skew", () => { + // Mirrors the §5 storyboard closing beat — 4-5% loss per group. + const before: DatasetView = { + columns: ["group"], + rows: Array.from({ length: 200 }, (_, i) => ({ group: i < 100 ? 
"A" : "B" })), + }; + const after: DatasetView = { + columns: ["group"], + rows: [ + ...Array.from({ length: 96 }, () => ({ group: "A" })), + ...Array.from({ length: 95 }, () => ({ group: "B" })), + ], + }; + const r = computeBiasCheck(before, after, "group"); + expect(r.skewDetected).toBe(false); + expect(Math.round(r.perGroup.A.retentionPct)).toBe(96); + expect(Math.round(r.perGroup.B.retentionPct)).toBe(95); + }); + + test("groupColumn missing from dataset: returns empty perGroup, no crash", () => { + const ds: DatasetView = { columns: ["x"], rows: [{ x: 1 }] }; + const r = computeBiasCheck(ds, ds, "group"); + expect(r.perGroup).toEqual({}); + expect(r.skewDetected).toBe(false); + }); + + test("custom skewThreshold widens / narrows the trigger", () => { + const before: DatasetView = { + columns: ["g"], + rows: Array.from({ length: 100 }, (_, i) => ({ g: i < 50 ? "A" : "B" })), + }; + const after: DatasetView = { + columns: ["g"], + rows: [ + ...Array.from({ length: 45 }, () => ({ g: "A" })), // 90% + ...Array.from({ length: 40 }, () => ({ g: "B" })), // 80% + ], + }; + // 10-point gap: skew with threshold=5, no skew with threshold=15 + expect(computeBiasCheck(before, after, "g", { skewThresholdPct: 5 }).skewDetected).toBe(true); + expect(computeBiasCheck(before, after, "g", { skewThresholdPct: 15 }).skewDetected).toBe(false); + }); + + test("empty before → no groups, no skew", () => { + const r = computeBiasCheck({ columns: [], rows: [] }, { columns: [], rows: [] }, "g"); + expect(r.perGroup).toEqual({}); + expect(r.skewDetected).toBe(false); + }); +}); diff --git a/agent-service/src/agent/tools/dataguard/bias-check.ts b/agent-service/src/agent/tools/dataguard/bias-check.ts new file mode 100644 index 00000000000..0a7b9213a9b --- /dev/null +++ b/agent-service/src/agent/tools/dataguard/bias-check.ts @@ -0,0 +1,131 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +// Compares per-group row counts in the before / after dataset and flags skew +// — the closing demo beat: "Group A retention: 96%. Group B retention: 95%. +// ✓ No skew introduced." + +import { z } from "zod"; +import { tool } from "ai"; +import type { DatasetView } from "./dataset"; +import type { DataGuardSession } from "./dataguard-session"; + +export interface BiasCheckResult { + groupColumn: string; + perGroup: Record; + maxRetentionGapPct: number; + skewThresholdPct: number; + skewDetected: boolean; +} + +export interface BiasCheckOptions { + skewThresholdPct?: number; +} + +const DEFAULT_SKEW_THRESHOLD = 10; + +export function computeBiasCheck( + before: DatasetView, + after: DatasetView, + groupColumn: string, + options: BiasCheckOptions = {} +): BiasCheckResult { + const threshold = options.skewThresholdPct ?? 
DEFAULT_SKEW_THRESHOLD; + const perGroup: BiasCheckResult["perGroup"] = {}; + + if (!before.columns.includes(groupColumn) || before.rows.length === 0) { + return { + groupColumn, + perGroup, + maxRetentionGapPct: 0, + skewThresholdPct: threshold, + skewDetected: false, + }; + } + + const beforeCounts = countByGroup(before, groupColumn); + const afterCounts = countByGroup(after, groupColumn); + + for (const [group, beforeN] of beforeCounts) { + const afterN = afterCounts.get(group) ?? 0; + perGroup[group] = { + before: beforeN, + after: afterN, + retentionPct: beforeN > 0 ? (afterN / beforeN) * 100 : 0, + }; + } + + const retentions = Object.values(perGroup).map(g => g.retentionPct); + const maxGap = retentions.length > 0 ? Math.max(...retentions) - Math.min(...retentions) : 0; + + return { + groupColumn, + perGroup, + maxRetentionGapPct: maxGap, + skewThresholdPct: threshold, + skewDetected: maxGap > threshold, + }; +} + +function countByGroup(ds: DatasetView, col: string): Map { + const counts = new Map(); + for (const r of ds.rows) { + const v = r[col]; + if (v === undefined || v === null) continue; + const key = String(v); + counts.set(key, (counts.get(key) ?? 0) + 1); + } + return counts; +} + +// ----- AI SDK tool ----- + +export const TOOL_NAME_BIAS_CHECK = "bias_check"; + +export function createBiasCheckTool(session: DataGuardSession) { + return tool({ + description: `Compare row counts per group in the current dataset vs the pre-cleanup snapshot. Returns retention% per group plus a skew flag. Read-only.`, + inputSchema: z.object({ + groupColumn: z.string().describe("Column whose distinct values define the groups (e.g., 'group', 'cohort')."), + skewThresholdPct: z + .number() + .optional() + .describe("If max(retention%) - min(retention%) exceeds this, skewDetected = true. 
Default 10."), + beforeDataset: z + .object({ + columns: z.array(z.string()), + rows: z.array(z.record(z.string(), z.unknown())), + }) + .optional() + .describe("Optional explicit 'before' dataset; if omitted, the tool cannot compute bias and returns an error."), + }), + execute: async (input) => { + const after = session.getDataset(); + if (!after) return "[ERROR] No dataset in session; load one before calling bias_check."; + const before = input.beforeDataset ?? null; + if (!before) { + return "[ERROR] bias_check requires a beforeDataset (the pre-cleanup snapshot). Pass it explicitly."; + } + const result = computeBiasCheck(before, after, input.groupColumn, { + skewThresholdPct: input.skewThresholdPct, + }); + return JSON.stringify(result); + }, + }); +} diff --git a/agent-service/src/agent/tools/dataguard/dataguard-session.test.ts b/agent-service/src/agent/tools/dataguard/dataguard-session.test.ts new file mode 100644 index 00000000000..96f7c2fa43c --- /dev/null +++ b/agent-service/src/agent/tools/dataguard/dataguard-session.test.ts @@ -0,0 +1,119 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +import { describe, expect, test } from "bun:test"; +import { DataGuardSession } from "./dataguard-session"; +import type { DatasetView } from "./dataset"; +import type { DataQualityIssue, FixProposal } from "../../../types/dataguard"; + +function makeIssue(): DataQualityIssue { + return { + issueId: "iss-1", + issueType: "placeholder_value", + column: "age", + description: "5 rows have age=999", + evidence: "5 of 5 placeholder-only.", + affectedRowCount: 5, + detectedAt: "2026-05-14T12:00:00.000Z", + }; +} + +function makeProposal(): FixProposal { + return { + issueId: "iss-1", + issueType: "placeholder_value", + action: "Replace age=999 with NULL", + operationKind: "replace_value", + operationParams: { column: "age", match: 999, replacement: null }, + riskTier: "medium", + reason: "out of range", + evidence: "5 rows", + confidence: "high", + targetRowCount: 5, + }; +} + +describe("DataGuardSession", () => { + test("setDataset stores the dataset and resets per-run state", () => { + const s = new DataGuardSession(); + const ds: DatasetView = { columns: ["a"], rows: [{ a: 1 }] }; + s.recordIssue(makeIssue()); + s.setDataset(ds); + expect(s.getDataset()).toBe(ds); + expect(s.getIssues()).toEqual([]); + expect(s.getDecisionLog()).toEqual([]); + expect(s.getFlaggedRows()).toEqual([]); + }); + + test("recordIssue accumulates and dedupes by issueId", () => { + const s = new DataGuardSession(); + s.recordIssue(makeIssue()); + s.recordIssue(makeIssue()); // same issueId — should not duplicate + expect(s.getIssues()).toHaveLength(1); + }); + + test("recordDecision appends a DecisionLogEntry", () => { + const s = new DataGuardSession(); + s.setDataset({ columns: [], rows: [] }); + s.recordDecision({ + proposal: makeProposal(), + verdict: "allow", + applied: true, + }); + const log = s.getDecisionLog(); + expect(log).toHaveLength(1); + expect(log[0].userDecision).toBe("allow"); + expect(log[0].issueType).toBe("placeholder_value"); + 
expect(log[0].appliedAt).toBeDefined(); + }); + + test("recordDecision with denied: no appliedAt", () => { + const s = new DataGuardSession(); + s.recordDecision({ proposal: makeProposal(), verdict: "deny", applied: false }); + expect(s.getDecisionLog()[0].appliedAt).toBeUndefined(); + }); + + test("addAutoAllowRule registers, matchesAutoAllowRule returns true", () => { + const s = new DataGuardSession(); + s.addAutoAllowRule("placeholder_value"); + expect(s.matchesAutoAllowRule("placeholder_value")).toBe(true); + expect(s.matchesAutoAllowRule("outlier")).toBe(false); + }); + + test("addAutoAllowRule is idempotent (does not duplicate)", () => { + const s = new DataGuardSession(); + s.addAutoAllowRule("placeholder_value"); + s.addAutoAllowRule("placeholder_value"); + expect(s.getAutoAllowRules()).toHaveLength(1); + }); + + test("removeAutoAllowRule clears the rule by id", () => { + const s = new DataGuardSession(); + const rule = s.addAutoAllowRule("placeholder_value"); + expect(s.removeAutoAllowRule(rule.ruleId)).toBe(true); + expect(s.matchesAutoAllowRule("placeholder_value")).toBe(false); + }); + + test("addFlaggedRows merges + dedupes + sorts", () => { + const s = new DataGuardSession(); + s.addFlaggedRows([3, 1, 2]); + s.addFlaggedRows([2, 5]); + expect(s.getFlaggedRows()).toEqual([1, 2, 3, 5]); + }); +}); diff --git a/agent-service/src/agent/tools/dataguard/dataguard-session.ts b/agent-service/src/agent/tools/dataguard/dataguard-session.ts new file mode 100644 index 00000000000..1fc33799f8b --- /dev/null +++ b/agent-service/src/agent/tools/dataguard/dataguard-session.ts @@ -0,0 +1,151 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +// Per-agent DataGuard run state. One DataGuardSession lives on each +// TexeraAgent (lazy-initialized when the first DataGuard tool fires) and +// holds the working dataset, accumulated issues, decision log, flagged rows, +// and auto-allow rules. Independent of the workflow state so resetting one +// does not affect the other. + +import type { + AutoAllowRule, + DataQualityIssue, + DecisionLogEntry, + FixProposal, + IssueType, + Verdict, +} from "../../../types/dataguard"; +import type { DatasetView } from "./dataset"; + +export interface RecordDecisionInput { + proposal: FixProposal; + verdict: Verdict; + modifiedAction?: string; + applied: boolean; +} + +export class DataGuardSession { + private dataset: DatasetView | undefined; + private issues: Map = new Map(); + private proposals: Map = new Map(); + private decisionLog: DecisionLogEntry[] = []; + private flaggedRows: Set = new Set(); + private autoAllowRules: Map = new Map(); + private decisionCounter = 0; + private ruleCounter = 0; + + setDataset(dataset: DatasetView): void { + this.dataset = dataset; + // A new dataset means a fresh DataGuard run — clear the per-run state. + // Auto-allow rules persist (they're a user preference, not run state). 
+ this.issues.clear(); + this.proposals.clear(); + this.decisionLog = []; + this.flaggedRows.clear(); + } + + recordProposal(proposal: FixProposal): void { + this.proposals.set(proposal.issueId, proposal); + } + + getProposal(issueId: string): FixProposal | undefined { + return this.proposals.get(issueId); + } + + getIssue(issueId: string): DataQualityIssue | undefined { + return this.issues.get(issueId); + } + + getDataset(): DatasetView | undefined { + return this.dataset; + } + + updateDataset(dataset: DatasetView): void { + this.dataset = dataset; + } + + recordIssue(issue: DataQualityIssue): void { + this.issues.set(issue.issueId, issue); + } + + getIssues(): DataQualityIssue[] { + return Array.from(this.issues.values()); + } + + recordDecision(input: RecordDecisionInput): DecisionLogEntry { + this.decisionCounter += 1; + const now = new Date().toISOString(); + const entry: DecisionLogEntry = { + decisionId: `dec-${this.decisionCounter}`, + timestamp: now, + issueType: input.proposal.issueType, + targetRowCount: input.proposal.targetRowCount, + proposedAction: input.proposal.action, + userDecision: input.verdict, + modifiedAction: input.modifiedAction, + reason: input.proposal.reason, + confidence: input.proposal.confidence, + appliedAt: input.applied ? now : undefined, + }; + this.decisionLog.push(entry); + return entry; + } + + getDecisionLog(): DecisionLogEntry[] { + return [...this.decisionLog]; + } + + addFlaggedRows(indices: number[]): void { + for (const i of indices) this.flaggedRows.add(i); + } + + getFlaggedRows(): number[] { + return Array.from(this.flaggedRows).sort((a, b) => a - b); + } + + addAutoAllowRule(issueType: IssueType): AutoAllowRule { + // Idempotent: if a rule already exists for this issueType, return it. 
+ for (const r of this.autoAllowRules.values()) { + if (r.issueType === issueType) return r; + } + this.ruleCounter += 1; + const rule: AutoAllowRule = { + ruleId: `rule-${this.ruleCounter}`, + issueType, + createdAt: new Date().toISOString(), + }; + this.autoAllowRules.set(rule.ruleId, rule); + return rule; + } + + removeAutoAllowRule(ruleId: string): boolean { + return this.autoAllowRules.delete(ruleId); + } + + matchesAutoAllowRule(issueType: IssueType): boolean { + for (const r of this.autoAllowRules.values()) { + if (r.issueType === issueType) return true; + } + return false; + } + + getAutoAllowRules(): AutoAllowRule[] { + return Array.from(this.autoAllowRules.values()); + } +} diff --git a/agent-service/src/agent/tools/dataguard/dataguard-tools.ts b/agent-service/src/agent/tools/dataguard/dataguard-tools.ts new file mode 100644 index 00000000000..f58e29df8e4 --- /dev/null +++ b/agent-service/src/agent/tools/dataguard/dataguard-tools.ts @@ -0,0 +1,190 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +// Vercel AI SDK tool definitions for DataGuard. 
+// Three tools exposed to the LLM: +// - profile_dataset (read-only) +// - suggest_fix (read-only) +// - apply_fix (mutating — gated by requestApproval) +// The decision log is written automatically inside apply_fix; an explicit +// write_decision_log tool (Step 10) exports the log to CSV at session end. + +import { z } from "zod"; +import { tool } from "ai"; +import { profileDataset, type ProfileOptions } from "./profile-dataset"; +import { suggestFix, type LlmCallFn } from "./suggest-fix"; +import { applyFix } from "./apply-fix"; +import { requestApproval, type ApprovalGateway } from "./with-approval"; +import type { DataGuardSession } from "./dataguard-session"; +import { createWriteDecisionLogTool, TOOL_NAME_WRITE_DECISION_LOG } from "./decision-log"; +import { createBiasCheckTool, TOOL_NAME_BIAS_CHECK } from "./bias-check"; + +export const TOOL_NAME_PROFILE_DATASET = "profile_dataset"; +export const TOOL_NAME_SUGGEST_FIX = "suggest_fix"; +export const TOOL_NAME_APPLY_FIX = "apply_fix"; + +export interface DataGuardToolContext { + session: DataGuardSession; + gateway: ApprovalGateway; + llmCall: LlmCallFn; +} + +export function createDataGuardTools(ctx: DataGuardToolContext): Record { + return { + [TOOL_NAME_PROFILE_DATASET]: createProfileDatasetTool(ctx), + [TOOL_NAME_SUGGEST_FIX]: createSuggestFixTool(ctx), + [TOOL_NAME_APPLY_FIX]: createApplyFixTool(ctx), + [TOOL_NAME_WRITE_DECISION_LOG]: createWriteDecisionLogTool(ctx.session), + [TOOL_NAME_BIAS_CHECK]: createBiasCheckTool(ctx.session), + }; +} + +function createProfileDatasetTool(ctx: DataGuardToolContext) { + return tool({ + description: `Scan the loaded dataset for quality issues. Read-only. + +Detects four categories: +- missing_value: null / empty / configured missing tokens +- placeholder_value: numeric (999, -1) or string sentinels +- duplicate_id: requires idColumn hint +- out_of_range: requires validRanges hint per column + +Call this once at the start of a DataGuard run. 
Returns a JSON array of DataQualityIssue records.`, + inputSchema: z.object({ + idColumn: z + .string() + .optional() + .describe("Column name to treat as the unique row identifier. If omitted, no duplicate_id detection runs."), + validRanges: z + .record(z.string(), z.object({ min: z.number(), max: z.number() })) + .optional() + .describe("Per-column valid numeric range. Values outside are flagged as out_of_range."), + placeholderValues: z + .array(z.union([z.string(), z.number()])) + .optional() + .describe("Override the default placeholder list (default: [999, -1, 'unknown', 'Unknown'])."), + missingTokens: z + .array(z.string()) + .optional() + .describe("Override the default missing-token list (default: ['NA', 'N/A', 'n/a', 'null', 'NULL', 'None'])."), + }), + execute: async (input) => { + const dataset = ctx.session.getDataset(); + if (!dataset) { + return "[ERROR] No dataset loaded into DataGuard session. The frontend must call setDataset before invoking profile_dataset."; + } + const options: ProfileOptions = { + idColumn: input.idColumn, + validRanges: input.validRanges, + placeholderValues: input.placeholderValues, + missingTokens: input.missingTokens, + }; + const issues = profileDataset(dataset, options); + for (const issue of issues) ctx.session.recordIssue(issue); + return JSON.stringify({ + datasetRowCount: dataset.rows.length, + datasetColumnCount: dataset.columns.length, + issueCount: issues.length, + issues, + }); + }, + }); +} + +function createSuggestFixTool(ctx: DataGuardToolContext) { + return tool({ + description: `Propose a single concrete fix for a previously-detected issue. Read-only. + +Call after profile_dataset. Pass the issueId from one of the returned issues. 
Returns a FixProposal that you can then pass to apply_fix.`, + inputSchema: z.object({ + issueId: z.string().describe("The issueId of a DataQualityIssue returned by profile_dataset."), + }), + execute: async (input) => { + const issue = ctx.session.getIssue(input.issueId); + if (!issue) { + return `[ERROR] No issue with id "${input.issueId}". Call profile_dataset first.`; + } + try { + const proposal = await suggestFix(issue, { llmCall: ctx.llmCall }); + ctx.session.recordProposal(proposal); + return JSON.stringify(proposal); + } catch (e) { + return `[ERROR] suggest_fix failed: ${(e as Error).message}`; + } + }, + }); +} + +function createApplyFixTool(ctx: DataGuardToolContext) { + return tool({ + description: `Apply a previously-proposed fix to the dataset. MUTATING — gated by user approval. + +Pass the issueId. The proposal stored from suggest_fix is looked up automatically. For risk tier "low" the fix is auto-applied with a summary line; for "medium" / "high" the user must approve through the chat panel. The result includes the user's verdict.`, + inputSchema: z.object({ + issueId: z.string().describe("The issueId whose proposal should be applied."), + }), + execute: async (input) => { + const proposal = ctx.session.getProposal(input.issueId); + if (!proposal) { + return `[ERROR] No proposal for issueId "${input.issueId}". Call suggest_fix first.`; + } + const dataset = ctx.session.getDataset(); + if (!dataset) { + return `[ERROR] No dataset loaded.`; + } + + const decision = await requestApproval(ctx.gateway, proposal); + + if (decision.verdict === "deny") { + ctx.session.recordDecision({ proposal, verdict: "deny", applied: false }); + return JSON.stringify({ + verdict: "deny", + rowsAffected: 0, + message: "User denied the fix. No changes made.", + }); + } + + // For modify, MVP keeps the original operationKind/params but records the + // user's free-text override in the log. 
Future iteration can parse the + // modifiedAction back into a structured proposal override. + const modifiedAction = decision.verdict === "modify" ? decision.modifiedAction : undefined; + + try { + const result = applyFix(dataset, proposal); + ctx.session.updateDataset(result.dataset); + if (result.flaggedRows.length > 0) ctx.session.addFlaggedRows(result.flaggedRows); + ctx.session.recordDecision({ + proposal, + verdict: decision.verdict, + modifiedAction, + applied: true, + }); + return JSON.stringify({ + verdict: decision.verdict, + rowsAffected: result.rowsAffected, + flaggedRows: result.flaggedRows, + datasetRowCount: result.dataset.rows.length, + message: `Applied ${proposal.operationKind}. Rows affected: ${result.rowsAffected}.`, + }); + } catch (e) { + return `[ERROR] apply_fix failed: ${(e as Error).message}`; + } + }, + }); +} diff --git a/agent-service/src/agent/tools/dataguard/decision-log.test.ts b/agent-service/src/agent/tools/dataguard/decision-log.test.ts new file mode 100644 index 00000000000..242259d2ecf --- /dev/null +++ b/agent-service/src/agent/tools/dataguard/decision-log.test.ts @@ -0,0 +1,98 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +import { describe, expect, test } from "bun:test"; +import { serializeDecisionLogCsv } from "./decision-log"; +import type { DecisionLogEntry } from "../../../types/dataguard"; + +function entry(overrides: Partial = {}): DecisionLogEntry { + return { + decisionId: "dec-1", + timestamp: "2026-05-14T12:00:30.000Z", + issueType: "placeholder_value", + targetRowCount: 5, + proposedAction: "Replace age=999 with NULL", + userDecision: "allow", + reason: "out of valid range", + confidence: "high", + appliedAt: "2026-05-14T12:00:31.000Z", + ...overrides, + }; +} + +describe("serializeDecisionLogCsv", () => { + test("empty log returns header only", () => { + const csv = serializeDecisionLogCsv([]); + expect(csv.split("\n")).toEqual([ + "decision_id,timestamp,issue_type,target_rows,proposed_action,user_decision,modified_action,reason,confidence,applied_at", + ]); + }); + + test("single row: header + one data row", () => { + const csv = serializeDecisionLogCsv([entry()]); + const lines = csv.split("\n"); + expect(lines).toHaveLength(2); + expect(lines[1]).toContain("dec-1"); + expect(lines[1]).toContain("placeholder_value"); + expect(lines[1]).toContain("allow"); + }); + + test("escapes commas, quotes, and newlines in fields per RFC 4180", () => { + const csv = serializeDecisionLogCsv([ + entry({ + proposedAction: 'Replace "999" with NULL, including row 3', + reason: "line1\nline2", + }), + ]); + const dataRow = csv.split("\n").slice(1).join("\n"); + expect(dataRow).toContain('"Replace ""999"" with NULL, including row 3"'); + expect(dataRow).toContain('"line1\nline2"'); + }); + + test("missing appliedAt and modifiedAction render as empty fields", () => { + const csv = serializeDecisionLogCsv([ + entry({ userDecision: "deny", appliedAt: undefined }), + ]); + const row = csv.split("\n")[1]; + expect(row.endsWith(",")).toBe(true); // appliedAt is the last column and is empty + expect(row).toContain(",,"); // modifiedAction is empty between reason+confidence's neighbors + 
}); + + test("multiple rows preserve insertion order", () => { + const csv = serializeDecisionLogCsv([ + entry({ decisionId: "dec-1", issueType: "placeholder_value" }), + entry({ decisionId: "dec-2", issueType: "missing_value" }), + entry({ decisionId: "dec-3", issueType: "outlier", userDecision: "deny" }), + ]); + const lines = csv.split("\n").slice(1); + expect(lines[0]).toContain("dec-1"); + expect(lines[1]).toContain("dec-2"); + expect(lines[2]).toContain("dec-3"); + expect(lines[2]).toContain("deny"); + }); + + test("auto_allow_low_risk and auto_allow_remembered survive the round trip", () => { + const csv = serializeDecisionLogCsv([ + entry({ userDecision: "auto_allow_low_risk" }), + entry({ userDecision: "auto_allow_remembered" }), + ]); + expect(csv).toContain("auto_allow_low_risk"); + expect(csv).toContain("auto_allow_remembered"); + }); +}); diff --git a/agent-service/src/agent/tools/dataguard/decision-log.ts b/agent-service/src/agent/tools/dataguard/decision-log.ts new file mode 100644 index 00000000000..e9752fdf422 --- /dev/null +++ b/agent-service/src/agent/tools/dataguard/decision-log.ts @@ -0,0 +1,89 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +// RFC-4180 CSV serializer for the DataGuard decision log. Schema matches +// §4.4 of README_DataGuard_Texera.md exactly so a reviewer can open the +// downloaded CSV and trace every applied/denied/modified fix. + +import { z } from "zod"; +import { tool } from "ai"; +import type { DecisionLogEntry } from "../../../types/dataguard"; +import type { DataGuardSession } from "./dataguard-session"; + +const HEADER_COLUMNS = [ + "decision_id", + "timestamp", + "issue_type", + "target_rows", + "proposed_action", + "user_decision", + "modified_action", + "reason", + "confidence", + "applied_at", +] as const; + +export const TOOL_NAME_WRITE_DECISION_LOG = "write_decision_log"; + +export function serializeDecisionLogCsv(entries: DecisionLogEntry[]): string { + const header = HEADER_COLUMNS.join(","); + const rows = entries.map(rowToCsv); + return [header, ...rows].join("\n"); +} + +function rowToCsv(e: DecisionLogEntry): string { + return [ + csvField(e.decisionId), + csvField(e.timestamp), + csvField(e.issueType), + csvField(String(e.targetRowCount)), + csvField(e.proposedAction), + csvField(e.userDecision), + csvField(e.modifiedAction ?? ""), + csvField(e.reason), + csvField(e.confidence), + csvField(e.appliedAt ?? ""), + ].join(","); +} + +// RFC 4180: a field MUST be quoted if it contains a comma, double-quote, or +// line break. Quotes within a quoted field are escaped by doubling. +function csvField(value: string): string { + if (value === "") return ""; + const needsQuoting = /[",\r\n]/.test(value); + if (!needsQuoting) return value; + return `"${value.replace(/"/g, '""')}"`; +} + +// ----- AI SDK tool (exposed to the LLM) ----- + +export function createWriteDecisionLogTool(session: DataGuardSession) { + return tool({ + description: `Export the DataGuard decision log to CSV. Returns the CSV text. 
Call this at the end of a DataGuard run to give the user an audit trail of every Allow / Deny / Modify they made.`, + inputSchema: z.object({}), + execute: async () => { + const csv = serializeDecisionLogCsv(session.getDecisionLog()); + return JSON.stringify({ + rows: session.getDecisionLog().length, + bytes: csv.length, + csv, + }); + }, + }); +} diff --git a/agent-service/src/agent/tools/dataguard/suggest-fix.test.ts b/agent-service/src/agent/tools/dataguard/suggest-fix.test.ts new file mode 100644 index 00000000000..3d3ab4dc632 --- /dev/null +++ b/agent-service/src/agent/tools/dataguard/suggest-fix.test.ts @@ -0,0 +1,150 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +import { describe, expect, test } from "bun:test"; +import { suggestFix, type LlmCallFn } from "./suggest-fix"; +import type { DataQualityIssue } from "../../../types/dataguard"; + +function makeIssue(overrides: Partial = {}): DataQualityIssue { + return { + issueId: "iss-test-1", + issueType: "placeholder_value", + column: "age", + description: "5 rows have age = 999", + evidence: "5 of 5 rows with age=999 have no other anomalies.", + affectedRowCount: 5, + affectedRowIndices: [10, 42, 77, 199, 412], + detectedAt: "2026-05-14T12:00:00.000Z", + ...overrides, + }; +} + +const VALID_RAW_JSON = JSON.stringify({ + action: "Replace age = 999 with NULL", + operationKind: "replace_value", + operationParams: { column: "age", match: 999, replacement: null }, + riskTier: "medium", + reason: "999 is outside the valid human-age range and appears to be a placeholder.", + evidence: "5 of 5 rows with age=999 have no other anomalies.", + confidence: "high", + targetRowCount: 5, +}); + +function constantLlm(payload: string): LlmCallFn { + return async () => payload; +} + +describe("suggestFix", () => { + test("parses a valid LLM JSON payload into a FixProposal", async () => { + const issue = makeIssue(); + const proposal = await suggestFix(issue, { llmCall: constantLlm(VALID_RAW_JSON) }); + expect(proposal.issueId).toBe(issue.issueId); + expect(proposal.issueType).toBe("placeholder_value"); + expect(proposal.operationKind).toBe("replace_value"); + expect(proposal.riskTier).toBe("medium"); + expect(proposal.confidence).toBe("high"); + expect(proposal.targetRowCount).toBe(5); + }); + + test("strips ```json``` code fences before parsing", async () => { + const fenced = "```json\n" + VALID_RAW_JSON + "\n```"; + const proposal = await suggestFix(makeIssue(), { llmCall: constantLlm(fenced) }); + expect(proposal.operationKind).toBe("replace_value"); + }); + + test("strips bare ``` fences before parsing", async () => { + const fenced = "```\n" + VALID_RAW_JSON + "\n```"; + const 
proposal = await suggestFix(makeIssue(), { llmCall: constantLlm(fenced) }); + expect(proposal.riskTier).toBe("medium"); + }); + + test("issueId and issueType are set from the issue, not the LLM", async () => { + // The LLM payload claims a different issueType — we ignore it and use the + // server-side issue's type to keep the contract honest. + const proposalIgnoredFields = { + ...JSON.parse(VALID_RAW_JSON), + issueId: "wrong-id-from-llm", + issueType: "outlier", + }; + const issue = makeIssue({ issueId: "iss-real-7", issueType: "missing_value" }); + const proposal = await suggestFix(issue, { + llmCall: constantLlm(JSON.stringify(proposalIgnoredFields)), + }); + expect(proposal.issueId).toBe("iss-real-7"); + expect(proposal.issueType).toBe("missing_value"); + }); + + test("throws on invalid JSON", async () => { + await expect( + suggestFix(makeIssue(), { llmCall: constantLlm("not json at all") }) + ).rejects.toThrow(/invalid JSON/); + }); + + test("throws when required field is missing", async () => { + const bad = { ...JSON.parse(VALID_RAW_JSON) }; + delete bad.operationKind; + await expect( + suggestFix(makeIssue(), { llmCall: constantLlm(JSON.stringify(bad)) }) + ).rejects.toThrow(/schema validation/); + }); + + test("throws when operationKind is not a known enum member", async () => { + const bad = { ...JSON.parse(VALID_RAW_JSON), operationKind: "delete_database" }; + await expect( + suggestFix(makeIssue(), { llmCall: constantLlm(JSON.stringify(bad)) }) + ).rejects.toThrow(/schema validation/); + }); + + test("throws when riskTier is not low|medium|high", async () => { + const bad = { ...JSON.parse(VALID_RAW_JSON), riskTier: "critical" }; + await expect( + suggestFix(makeIssue(), { llmCall: constantLlm(JSON.stringify(bad)) }) + ).rejects.toThrow(/schema validation/); + }); + + test("passes issue details into the prompt for the LLM", async () => { + let captured = ""; + const issue = makeIssue({ + issueType: "duplicate_id", + column: "sample_id", + description: 
"3 duplicate sample IDs", + }); + const proposal = await suggestFix(issue, { + llmCall: async (prompt) => { + captured = prompt; + return VALID_RAW_JSON; + }, + }); + expect(captured).toContain("duplicate_id"); + expect(captured).toContain("sample_id"); + expect(captured).toContain("3 duplicate sample IDs"); + expect(proposal).toBeDefined(); + }); + + test("propagates LLM transport errors", async () => { + const issue = makeIssue(); + await expect( + suggestFix(issue, { + llmCall: async () => { + throw new Error("connection refused"); + }, + }) + ).rejects.toThrow(/connection refused/); + }); +}); diff --git a/agent-service/src/agent/tools/dataguard/suggest-fix.ts b/agent-service/src/agent/tools/dataguard/suggest-fix.ts new file mode 100644 index 00000000000..b4e6feda995 --- /dev/null +++ b/agent-service/src/agent/tools/dataguard/suggest-fix.ts @@ -0,0 +1,137 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +// Takes a DataQualityIssue from profile_dataset and asks an LLM for a single +// concrete FixProposal. Read-only with respect to the dataset: it only +// proposes, never applies. 
+ +import { z } from "zod"; +import type { DataQualityIssue, FixProposal, RiskTier } from "../../../types/dataguard"; + +export type LlmCallFn = (prompt: string) => Promise; + +export interface SuggestFixOptions { + llmCall: LlmCallFn; +} + +const fixProposalSchema = z.object({ + action: z.string().min(1), + operationKind: z.enum([ + "replace_value", + "drop_rows", + "impute", + "flag", + "standardize", + "trim_whitespace", + "rename_column", + ]), + operationParams: z.record(z.string(), z.unknown()), + riskTier: z.enum(["low", "medium", "high"]), + reason: z.string().min(1), + evidence: z.string().min(1), + confidence: z.enum(["low", "medium", "high"]), + targetRowCount: z.number().int().nonnegative(), +}); + +const DEFAULT_RISK_TIER_BY_ISSUE: Record = { + placeholder_value: "medium", + missing_value: "medium", + duplicate_id: "high", + out_of_range: "medium", + outlier: "high", + inconsistent_label: "medium", +}; + +export async function suggestFix( + issue: DataQualityIssue, + options: SuggestFixOptions +): Promise { + const prompt = buildPrompt(issue); + const rawResponse = await options.llmCall(prompt); + const cleaned = stripCodeFences(rawResponse); + + let parsed: unknown; + try { + parsed = JSON.parse(cleaned); + } catch (e) { + throw new Error( + `suggest_fix: LLM returned invalid JSON for issue ${issue.issueId}: ${(e as Error).message}` + ); + } + + const validated = fixProposalSchema.safeParse(parsed); + if (!validated.success) { + throw new Error( + `suggest_fix: LLM proposal failed schema validation for issue ${issue.issueId}: ${validated.error.message}` + ); + } + + // Override LLM-supplied issueId/issueType with the server-side values to + // keep the contract honest: the LLM can suggest *what* to do, but it does + // not control *which* issue this proposal is bound to. 
+ return { + issueId: issue.issueId, + issueType: issue.issueType, + ...validated.data, + }; +} + +export function buildPrompt(issue: DataQualityIssue): string { + const defaultTier = DEFAULT_RISK_TIER_BY_ISSUE[issue.issueType] ?? "medium"; + return `You are a data-cleaning assistant. Propose a single concrete fix for the following data-quality issue. Reply with one JSON object only — no prose, no markdown, no fences. + +Issue: +- type: ${issue.issueType} +- column: ${issue.column} +- description: ${issue.description} +- evidence: ${issue.evidence} +- affectedRowCount: ${issue.affectedRowCount} + +Required JSON shape: +{ + "action": "", + "operationKind": "replace_value | drop_rows | impute | flag | standardize | trim_whitespace | rename_column", + "operationParams": { ...operation-specific params... }, + "riskTier": "low | medium | high", + "reason": "", + "evidence": "", + "confidence": "low | medium | high", + "targetRowCount": ${issue.affectedRowCount} +} + +operationParams by kind: +- replace_value: { "column": string, "match": any, "replacement": any } +- drop_rows: { "rowIndices": number[] } +- impute: { "column": string, "strategy": "mean" | "median" | "mode" } +- flag: { "rowIndices": number[] } +- standardize: { "column": string, "mapping": { [from: string]: string } } +- trim_whitespace: { "column": string } +- rename_column: { "from": string, "to": string } + +Default risk tier for ${issue.issueType}: ${defaultTier}. Override only with a strong reason. Prefer "flag" or "impute" over destructive "drop_rows".`; +} + +function stripCodeFences(s: string): string { + const trimmed = s.trim(); + if (!trimmed.startsWith("```")) return trimmed; + const lines = trimmed.split("\n"); + const last = lines[lines.length - 1]?.trim() ?? ""; + const sliced = last === "```" ? 
lines.slice(1, -1) : lines.slice(1); + return sliced.join("\n").trim(); +} diff --git a/agent-service/src/agent/tools/dataguard/with-approval.test.ts b/agent-service/src/agent/tools/dataguard/with-approval.test.ts new file mode 100644 index 00000000000..1fff89cb9fd --- /dev/null +++ b/agent-service/src/agent/tools/dataguard/with-approval.test.ts @@ -0,0 +1,137 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +import { describe, expect, test } from "bun:test"; +import { requestApproval, type ApprovalGateway } from "./with-approval"; +import type { FixProposal, IssueType, PermissionDecision, RiskTier } from "../../../types/dataguard"; + +function makeProposal(overrides: Partial = {}): FixProposal { + return { + issueId: "iss-1", + issueType: "placeholder_value", + action: "Replace age=999 with NULL", + operationKind: "replace_value", + operationParams: { column: "age", match: 999, replacement: null }, + riskTier: "medium", + reason: "test", + evidence: "test", + confidence: "high", + targetRowCount: 5, + ...overrides, + }; +} + +class MockGateway implements ApprovalGateway { + rules: Set = new Set(); + emitted: Array<{ stepId: string; proposal: FixProposal }> = []; + decisions: Map = new Map(); + private waiters: Map void> = new Map(); + private counter = 0; + + matchesAutoAllowRule(issueType: IssueType): boolean { + return this.rules.has(issueType); + } + generateStepId(): string { + this.counter += 1; + return `mock-step-${this.counter}`; + } + emitPendingApproval(stepId: string, proposal: FixProposal): void { + this.emitted.push({ stepId, proposal }); + } + awaitDecision(stepId: string): Promise { + if (this.decisions.has(stepId)) { + return Promise.resolve(this.decisions.get(stepId)!); + } + return new Promise(resolve => this.waiters.set(stepId, resolve)); + } + resolveLater(stepId: string, decision: PermissionDecision): void { + const w = this.waiters.get(stepId); + if (w) { + this.waiters.delete(stepId); + w(decision); + } else { + this.decisions.set(stepId, decision); + } + } +} + +describe("requestApproval", () => { + test("auto-allows low-risk fixes without prompting", async () => { + const gw = new MockGateway(); + const decision = await requestApproval(gw, makeProposal({ riskTier: "low" })); + expect(decision.verdict).toBe("auto_allow_low_risk"); + expect(gw.emitted).toHaveLength(0); + }); + + test("auto-allows when the issueType matches a remembered 
rule", async () => { + const gw = new MockGateway(); + gw.rules.add("placeholder_value"); + const decision = await requestApproval(gw, makeProposal({ riskTier: "medium" })); + expect(decision.verdict).toBe("auto_allow_remembered"); + expect(gw.emitted).toHaveLength(0); + }); + + test("medium risk without remembered rule → emits pending and waits", async () => { + const gw = new MockGateway(); + const promise = requestApproval(gw, makeProposal({ riskTier: "medium" })); + // Pending emitted synchronously before the promise resolves. + expect(gw.emitted).toHaveLength(1); + expect(gw.emitted[0].stepId).toBe("mock-step-1"); + + // Simulate user clicking Allow. + gw.resolveLater("mock-step-1", { stepId: "mock-step-1", verdict: "allow" }); + + const decision = await promise; + expect(decision.verdict).toBe("allow"); + expect(decision.stepId).toBe("mock-step-1"); + }); + + test("high risk: prompts every time even with a remembered rule", async () => { + const gw = new MockGateway(); + gw.rules.add("outlier"); + const promise = requestApproval(gw, makeProposal({ issueType: "outlier", riskTier: "high" })); + expect(gw.emitted).toHaveLength(1); + gw.resolveLater("mock-step-1", { stepId: "mock-step-1", verdict: "deny" }); + const decision = await promise; + expect(decision.verdict).toBe("deny"); + }); + + test("'modify' verdict carries through with the modifiedAction", async () => { + const gw = new MockGateway(); + const promise = requestApproval(gw, makeProposal({ riskTier: "medium" })); + gw.resolveLater("mock-step-1", { + stepId: "mock-step-1", + verdict: "modify", + modifiedAction: "Flag instead of replace", + }); + const decision = await promise; + expect(decision.verdict).toBe("modify"); + expect(decision.modifiedAction).toBe("Flag instead of replace"); + }); + + test("a decision that arrives before the tool awaits is buffered and delivered", async () => { + const gw = new MockGateway(); + // The decision is pre-recorded BEFORE the tool starts awaiting. 
This + // matches a race where the user clicks before the agent has finished + // emitting the pending step on this side. + gw.resolveLater("mock-step-1", { stepId: "mock-step-1", verdict: "allow" }); + const decision = await requestApproval(gw, makeProposal({ riskTier: "medium" })); + expect(decision.verdict).toBe("allow"); + }); +}); diff --git a/agent-service/src/agent/tools/dataguard/with-approval.ts b/agent-service/src/agent/tools/dataguard/with-approval.ts new file mode 100644 index 00000000000..454ea6de901 --- /dev/null +++ b/agent-service/src/agent/tools/dataguard/with-approval.ts @@ -0,0 +1,57 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +// The permission gate: every mutating DataGuard tool calls requestApproval() +// before doing anything. The function returns a PermissionDecision that the +// tool inspects to know whether to apply, skip, or transform its input. + +import type { FixProposal, IssueType, PermissionDecision } from "../../../types/dataguard"; + +// The set of operations the gate needs from its host. Implemented by +// TexeraAgent in production and by a mock in tests, so the gating logic +// itself can be unit-tested without a full agent or a websocket. 
+export interface ApprovalGateway { + // Does this issueType have a standing "auto-allow" rule? + matchesAutoAllowRule(issueType: IssueType): boolean; + // Mint a fresh step id used to correlate the emitted pending step with the + // user's eventual decision. + generateStepId(): string; + // Add a "pending approval" step into the conversation history and broadcast + // it to subscribed websocket clients. The frontend renders the prompt UI. + emitPendingApproval(stepId: string, proposal: FixProposal): void; + // Resolve when the decision for this step arrives via a websocket message. + awaitDecision(stepId: string): Promise; +} + +export async function requestApproval( + gateway: ApprovalGateway, + proposal: FixProposal +): Promise { + // High-risk fixes ALWAYS prompt — the "remember" rule does not apply. + // This is the same shape Claude Code uses for destructive Bash operations. + if (proposal.riskTier !== "high" && gateway.matchesAutoAllowRule(proposal.issueType)) { + return { stepId: "", verdict: "auto_allow_remembered" }; + } + if (proposal.riskTier === "low") { + return { stepId: "", verdict: "auto_allow_low_risk" }; + } + const stepId = gateway.generateStepId(); + gateway.emitPendingApproval(stepId, proposal); + return gateway.awaitDecision(stepId); +} diff --git a/agent-service/src/server.ts b/agent-service/src/server.ts index a31f9ede115..3bfa0df9f7c 100644 --- a/agent-service/src/server.ts +++ b/agent-service/src/server.ts @@ -325,6 +325,40 @@ const agentsRouter = new Elysia({ prefix: "/agents" }) }; }) + // ---------- DataGuard endpoints ---------- + + .post( + "/:id/dataguard/dataset", + ({ params: { id }, body }) => { + const agent = getAgent(id); + agent.setDataGuardDataset({ + columns: body.columns, + rows: body.rows, + }); + return { ok: true, columns: body.columns.length, rows: body.rows.length }; + }, + { + body: t.Object({ + columns: t.Array(t.String()), + rows: t.Array(t.Record(t.String(), t.Any())), + }), + } + ) + + 
.get("/:id/dataguard/session", ({ params: { id } }) => { + const agent = getAgent(id); + const session = agent.getDataGuardSession(); + const dataset = session.getDataset(); + return { + datasetRowCount: dataset?.rows.length ?? 0, + datasetColumnCount: dataset?.columns.length ?? 0, + issues: session.getIssues(), + decisionLog: session.getDecisionLog(), + flaggedRows: session.getFlaggedRows(), + autoAllowRules: session.getAutoAllowRules(), + }; + }) + .get("/:id/operator-types", ({ params: { id } }) => { const agent = getAgent(id); const metadataStore = agent.getMetadataStore(); @@ -403,9 +437,15 @@ const agentsRouter = new Elysia({ prefix: "/agents" }) ); interface WsMessage { - type: "message" | "stop"; + type: "message" | "stop" | "decision"; content?: string; messageSource?: "chat" | "feedback"; + // Fields below carry the user's verdict on a pending-approval step. + // Used when type === "decision". See agent/tools/dataguard/with-approval.ts. + stepId?: string; + verdict?: "allow" | "deny" | "modify"; + modifiedAction?: string; + remember?: boolean; } interface OperatorResultSummaryWs { @@ -532,6 +572,29 @@ export function buildApp() { return; } + if (msg.type === "decision") { + if (!msg.stepId || !msg.verdict) { + ws.send( + JSON.stringify({ + type: "error", + error: "decision requires stepId and verdict", + }) + ); + return; + } + const resolved = agent.resolveDecision(msg.stepId, { + stepId: msg.stepId, + verdict: msg.verdict, + modifiedAction: msg.modifiedAction, + remember: msg.remember, + }); + wsLog.info( + { agentId, stepId: msg.stepId, verdict: msg.verdict, resolved }, + "received user decision" + ); + return; + } + if (msg.type === "message") { if (!msg.content || typeof msg.content !== "string") { ws.send(JSON.stringify({ type: "error", error: "Message content is required" })); diff --git a/agent-service/src/types/agent.ts b/agent-service/src/types/agent.ts index 765f5a7cb46..cef2db3e618 100644 --- a/agent-service/src/types/agent.ts +++ 
b/agent-service/src/types/agent.ts @@ -18,6 +18,7 @@ */ import type { WorkflowContent } from "./workflow"; +import type { FixProposal, RiskTier } from "./dataguard"; export enum AgentState { UNAVAILABLE = "UNAVAILABLE", @@ -35,6 +36,12 @@ export interface TokenUsage { export const INITIAL_STEP_ID = "step-initial"; +export interface PendingApproval { + toolName: string; + proposal: FixProposal; + riskTier: RiskTier; +} + export interface ReActStep { id: string; parentId?: string; @@ -60,6 +67,10 @@ export interface ReActStep { messageSource?: "chat" | "feedback"; beforeWorkflowContent?: WorkflowContent; afterWorkflowContent?: WorkflowContent; + // Present on a step that is awaiting user approval. The agent's ReAct loop + // pauses (inside the mutating tool's execute fn) until a decision WS message + // arrives. See agent/tools/dataguard/with-approval.ts. + pendingApproval?: PendingApproval; } export enum OperatorResultSerializationMode { diff --git a/agent-service/src/types/dataguard.test.ts b/agent-service/src/types/dataguard.test.ts index c256c7e4709..53047e08879 100644 --- a/agent-service/src/types/dataguard.test.ts +++ b/agent-service/src/types/dataguard.test.ts @@ -68,6 +68,7 @@ describe("DataGuard type shapes", () => { test("FixProposal: replace-value, medium risk, high confidence", () => { const proposal: FixProposal = { issueId: "iss-1", + issueType: "placeholder_value", action: "Replace age = 999 with NULL", operationKind: "replace_value", operationParams: { column: "age", match: 999, replacement: null }, @@ -85,6 +86,7 @@ describe("DataGuard type shapes", () => { test("FixProposal: drop-rows, high risk (the storyboard 'deny' case)", () => { const proposal: FixProposal = { issueId: "iss-3", + issueType: "outlier", action: "Drop 3 rows with BMI > 60", operationKind: "drop_rows", operationParams: { rowIndices: [55, 211, 433] }, diff --git a/agent-service/src/types/dataguard.ts b/agent-service/src/types/dataguard.ts index 413e2e37814..d5b17ac1654 100644 --- 
a/agent-service/src/types/dataguard.ts +++ b/agent-service/src/types/dataguard.ts @@ -64,6 +64,7 @@ export interface DataQualityIssue { export interface FixProposal { issueId: string; + issueType: IssueType; action: string; operationKind: FixOperationKind; operationParams: Record; From 13290ac839354022a97f57183eea865178444e46 Mon Sep 17 00:00:00 2001 From: eugenegujing Date: Thu, 14 May 2026 19:34:27 -0700 Subject: [PATCH 03/14] feat(frontend): DataGuard permission prompt + auto-trigger hook MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the chat-panel UX: a standalone PermissionPromptComponent that renders inline on any ReActStep with pendingApproval (Allow / Deny / Modify / Allow & remember), AgentService.sendDecision wiring the new WS message, and DataGuardAutoTriggerService that fires when a dataset-reading operator (CSVFileScan, TableFileScan, JSONFileScan, ParallelCSVFileScan) is added to the workflow. AgentPanelComponent subscribes and surfaces a notification — full agent-creation flow remains a follow-up. 
--- .../agent-chat/agent-chat.component.html | 7 ++ .../agent-chat/agent-chat.component.ts | 2 + .../agent-panel/agent-panel.component.ts | 19 ++++- .../permission-prompt.component.html | 54 +++++++++++++ .../permission-prompt.component.scss | 78 ++++++++++++++++++ .../permission-prompt.component.ts | 80 ++++++++++++++++++ .../workspace/service/agent/agent-types.ts | 21 +++++ .../workspace/service/agent/agent.service.ts | 32 ++++++++ .../agent/data-guard-auto-trigger.service.ts | 81 +++++++++++++++++++ 9 files changed, 373 insertions(+), 1 deletion(-) create mode 100644 frontend/src/app/workspace/component/agent/agent-panel/permission-prompt/permission-prompt.component.html create mode 100644 frontend/src/app/workspace/component/agent/agent-panel/permission-prompt/permission-prompt.component.scss create mode 100644 frontend/src/app/workspace/component/agent/agent-panel/permission-prompt/permission-prompt.component.ts create mode 100644 frontend/src/app/workspace/service/agent/data-guard-auto-trigger.service.ts diff --git a/frontend/src/app/workspace/component/agent/agent-panel/agent-chat/agent-chat.component.html b/frontend/src/app/workspace/component/agent/agent-panel/agent-chat/agent-chat.component.html index d650a0a146b..262da7db2d3 100644 --- a/frontend/src/app/workspace/component/agent/agent-panel/agent-chat/agent-chat.component.html +++ b/frontend/src/app/workspace/component/agent/agent-panel/agent-chat/agent-chat.component.html @@ -123,6 +123,13 @@ style="color: #8c8c8c; font-style: italic"> Execute {{ response.toolCalls.length }} tool{{ response.toolCalls.length > 1 ? 's' : '' }} + + + + + + + + +
+ +
+ + +
+
+ + +
+ Decision sent — waiting for the agent to continue. +
diff --git a/frontend/src/app/workspace/component/agent/agent-panel/permission-prompt/permission-prompt.component.scss b/frontend/src/app/workspace/component/agent/agent-panel/permission-prompt/permission-prompt.component.scss new file mode 100644 index 00000000000..f0f32539e43 --- /dev/null +++ b/frontend/src/app/workspace/component/agent/agent-panel/permission-prompt/permission-prompt.component.scss @@ -0,0 +1,78 @@ +.dg-permission { + border: 1px solid #e5b800; + border-radius: 8px; + padding: 12px; + margin: 8px 0; + background: #fffbe6; + font-size: 0.9rem; +} + +.dg-permission--resolved { + background: #f0f8ff; + border-color: #91d5ff; + color: #1890ff; + font-style: italic; +} + +.dg-permission__header { + display: flex; + justify-content: space-between; + align-items: center; + margin-bottom: 8px; +} + +.dg-permission__tier { + font-size: 0.75rem; + font-weight: 600; + padding: 2px 8px; + border-radius: 4px; + text-transform: uppercase; + letter-spacing: 0.5px; +} + +.dg-permission__tier--low { + background: #d9f7be; + color: #389e0d; +} + +.dg-permission__tier--medium { + background: #fff1b8; + color: #d48806; +} + +.dg-permission__tier--high { + background: #ffccc7; + color: #cf1322; +} + +.dg-permission__body { + margin: 8px 0; +} + +.dg-permission__field { + margin: 4px 0; + line-height: 1.4; +} + +.dg-permission__label { + font-weight: 600; + display: inline-block; + min-width: 90px; + color: #595959; +} + +.dg-permission__actions { + display: flex; + gap: 8px; + margin-top: 8px; + flex-wrap: wrap; +} + +.dg-permission__modify { + margin-top: 8px; + + textarea { + width: 100%; + margin-bottom: 8px; + } +} diff --git a/frontend/src/app/workspace/component/agent/agent-panel/permission-prompt/permission-prompt.component.ts b/frontend/src/app/workspace/component/agent/agent-panel/permission-prompt/permission-prompt.component.ts new file mode 100644 index 00000000000..7423d9aefb8 --- /dev/null +++ 
b/frontend/src/app/workspace/component/agent/agent-panel/permission-prompt/permission-prompt.component.ts @@ -0,0 +1,80 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import { Component, Input } from "@angular/core"; +import { NgIf } from "@angular/common"; +import { FormsModule } from "@angular/forms"; +import { NzButtonComponent } from "ng-zorro-antd/button"; +import { NzInputDirective } from "ng-zorro-antd/input"; +import { ReActStep } from "../../../../service/agent/agent-types"; +import { AgentService } from "../../../../service/agent/agent.service"; + +/** + * DataGuard permission prompt. Rendered inline in the agent chat panel for + * any ReActStep whose `pendingApproval` field is set. The user's click sends + * a {type:"decision", stepId, verdict, ...} message over the agent WS; the + * server-side ReAct loop resumes once the awaiting tool promise resolves. 
+ */ +@Component({ + selector: "texera-permission-prompt", + standalone: true, + imports: [NgIf, FormsModule, NzButtonComponent, NzInputDirective], + templateUrl: "./permission-prompt.component.html", + styleUrls: ["./permission-prompt.component.scss"], +}) +export class PermissionPromptComponent { + @Input() step!: ReActStep; + @Input() agentId!: string; + + public isModifying = false; + public modifiedAction = ""; + public submitted = false; + + constructor(private readonly agentService: AgentService) {} + + public onAllow(remember: boolean): void { + if (this.submitted) return; + this.submitted = true; + this.agentService.sendDecision(this.agentId, this.step.id, "allow", { remember }); + } + + public onDeny(): void { + if (this.submitted) return; + this.submitted = true; + this.agentService.sendDecision(this.agentId, this.step.id, "deny"); + } + + public openModify(): void { + if (this.submitted) return; + this.isModifying = true; + this.modifiedAction = this.step.pendingApproval?.proposal.action ?? ""; + } + + public submitModify(): void { + if (this.submitted) return; + this.submitted = true; + this.agentService.sendDecision(this.agentId, this.step.id, "modify", { + modifiedAction: this.modifiedAction, + }); + } + + public cancelModify(): void { + this.isModifying = false; + } +} diff --git a/frontend/src/app/workspace/service/agent/agent-types.ts b/frontend/src/app/workspace/service/agent/agent-types.ts index c687de472a2..c9b3ed29cab 100644 --- a/frontend/src/app/workspace/service/agent/agent-types.ts +++ b/frontend/src/app/workspace/service/agent/agent-types.ts @@ -78,4 +78,25 @@ export interface ReActStep { beforeWorkflowContent?: any; /** Workflow state after this step executed */ afterWorkflowContent?: any; + /** + * DataGuard: a mutating tool is awaiting user approval. When this field is + * set, the chat panel renders the permission-prompt UI (Allow / Deny / + * Modify / Allow & remember). 
The agent's ReAct loop is paused server-side + * until a WS {type:"decision", stepId, verdict} message resolves it. + */ + pendingApproval?: { + toolName: string; + riskTier: "low" | "medium" | "high"; + proposal: { + issueId: string; + issueType: string; + action: string; + operationKind: string; + operationParams: Record; + reason: string; + evidence: string; + confidence: "low" | "medium" | "high"; + targetRowCount: number; + }; + }; } diff --git a/frontend/src/app/workspace/service/agent/agent.service.ts b/frontend/src/app/workspace/service/agent/agent.service.ts index 2009734030b..01c10b8ea00 100644 --- a/frontend/src/app/workspace/service/agent/agent.service.ts +++ b/frontend/src/app/workspace/service/agent/agent.service.ts @@ -965,6 +965,38 @@ export class AgentService { }); } + /** + * DataGuard: send the user's verdict on a pending-approval step. + * Resolves the awaiting tool execution server-side and lets the ReAct loop + * continue. `remember` (when verdict === "allow") registers an auto-allow + * rule for the issueType so subsequent matching issues skip the prompt. + */ + public sendDecision( + agentId: string, + stepId: string, + verdict: "allow" | "deny" | "modify", + options: { modifiedAction?: string; remember?: boolean } = {} + ): void { + const tracking = this.agentStateTracking.get(agentId); + if (!tracking?.websocket || tracking.websocket.readyState !== WebSocket.OPEN) { + console.error(`Agent ${agentId}: cannot send decision — WebSocket not open`); + return; + } + try { + tracking.websocket.send( + JSON.stringify({ + type: "decision", + stepId, + verdict, + modifiedAction: options.modifiedAction, + remember: options.remember, + }) + ); + } catch (error) { + console.error("Failed to send DataGuard decision:", error); + } + } + /** * Stop generation for an agent via WebSocket. 
*/ diff --git a/frontend/src/app/workspace/service/agent/data-guard-auto-trigger.service.ts b/frontend/src/app/workspace/service/agent/data-guard-auto-trigger.service.ts new file mode 100644 index 00000000000..fc6a7242f1b --- /dev/null +++ b/frontend/src/app/workspace/service/agent/data-guard-auto-trigger.service.ts @@ -0,0 +1,81 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import { Injectable } from "@angular/core"; +import { Observable, filter, map } from "rxjs"; +import { OperatorPredicate } from "../../types/workflow-common.interface"; +import { WorkflowActionService } from "../workflow-graph/model/workflow-action.service"; + +/** + * DataGuard auto-trigger. + * + * Watches the texera-graph for newly-added dataset-reading operators + * (CSVFileScan, TableFileScan, JSONFileScan, …) and emits a hint that the + * agent-panel should auto-launch a DataGuard agent and open the chat panel. + * + * The actual agent-creation flow (POST /agents → POST /agents/:id/dataguard/dataset + * → activate websocket → send "scan this dataset") is owned by the panel + * component. This service is the *event source*, not the orchestrator. 
+ * + * Wire from a panel: + * + * constructor(private trigger: DataGuardAutoTriggerService) {} + * ngOnInit() { + * this.trigger.getDatasetAddedStream().pipe(untilDestroyed(this)).subscribe(op => { + * // create agent, load dataset, send message... + * }); + * } + */ +@Injectable({ providedIn: "root" }) +export class DataGuardAutoTriggerService { + // Operator types that imply "the user just brought a tabular dataset onto + // the canvas." Extend cautiously — every type here triggers DataGuard. + private static readonly DATASET_OPERATOR_TYPES = new Set([ + "CSVFileScan", + "TableFileScan", + "JSONFileScan", + "ParallelCSVFileScan", + ]); + + constructor(private readonly workflowActionService: WorkflowActionService) {} + + /** + * Emits an OperatorPredicate every time a dataset-reading operator is + * added to the workflow. Subscribers should react by auto-launching a + * DataGuard agent and loading the referenced dataset. + */ + public getDatasetAddedStream(): Observable { + return this.workflowActionService + .getTexeraGraph() + .getOperatorAddStream() + .pipe( + filter((op: OperatorPredicate) => + DataGuardAutoTriggerService.DATASET_OPERATOR_TYPES.has(op.operatorType) + ), + map((op: OperatorPredicate) => op) + ); + } + + /** + * For tests / debugging: is a given operatorType one we'd auto-trigger on? + */ + public isDatasetOperatorType(operatorType: string): boolean { + return DataGuardAutoTriggerService.DATASET_OPERATOR_TYPES.has(operatorType); + } +} From b84a457402649a72dc8859f2d752807190c5bcbf Mon Sep 17 00:00:00 2001 From: eugenegujing Date: Thu, 14 May 2026 19:41:11 -0700 Subject: [PATCH 04/14] refactor(agent-service): group DataGuard tests under __tests__/ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dataguard/ folder had 8 source + 8 test files in one directory and was getting hard to scan. 
Move the seven DataGuard-specific test files into a __tests__/ subdirectory and update their relative imports (one extra ../). The types/dataguard.test.ts stays put — it's in src/types/, not under dataguard/, and that folder isn't crowded. bun test still auto-discovers; all 159 tests still pass. --- .../agent/tools/dataguard/{ => __tests__}/apply-fix.test.ts | 6 +++--- .../tools/dataguard/{ => __tests__}/bias-check.test.ts | 4 ++-- .../dataguard/{ => __tests__}/dataguard-session.test.ts | 6 +++--- .../tools/dataguard/{ => __tests__}/decision-log.test.ts | 4 ++-- .../tools/dataguard/{ => __tests__}/profile-dataset.test.ts | 4 ++-- .../tools/dataguard/{ => __tests__}/suggest-fix.test.ts | 4 ++-- .../tools/dataguard/{ => __tests__}/with-approval.test.ts | 4 ++-- 7 files changed, 16 insertions(+), 16 deletions(-) rename agent-service/src/agent/tools/dataguard/{ => __tests__}/apply-fix.test.ts (98%) rename agent-service/src/agent/tools/dataguard/{ => __tests__}/bias-check.test.ts (97%) rename agent-service/src/agent/tools/dataguard/{ => __tests__}/dataguard-session.test.ts (95%) rename agent-service/src/agent/tools/dataguard/{ => __tests__}/decision-log.test.ts (96%) rename agent-service/src/agent/tools/dataguard/{ => __tests__}/profile-dataset.test.ts (98%) rename agent-service/src/agent/tools/dataguard/{ => __tests__}/suggest-fix.test.ts (97%) rename agent-service/src/agent/tools/dataguard/{ => __tests__}/with-approval.test.ts (97%) diff --git a/agent-service/src/agent/tools/dataguard/apply-fix.test.ts b/agent-service/src/agent/tools/dataguard/__tests__/apply-fix.test.ts similarity index 98% rename from agent-service/src/agent/tools/dataguard/apply-fix.test.ts rename to agent-service/src/agent/tools/dataguard/__tests__/apply-fix.test.ts index 1546102b4dd..2f7e706b380 100644 --- a/agent-service/src/agent/tools/dataguard/apply-fix.test.ts +++ b/agent-service/src/agent/tools/dataguard/__tests__/apply-fix.test.ts @@ -18,9 +18,9 @@ */ import { describe, expect, test } from 
"bun:test"; -import { applyFix } from "./apply-fix"; -import type { DatasetView } from "./dataset"; -import type { FixProposal } from "../../../types/dataguard"; +import { applyFix } from "../apply-fix"; +import type { DatasetView } from "../dataset"; +import type { FixProposal } from "../../../../types/dataguard"; function makeProposal(overrides: Partial = {}): FixProposal { return { diff --git a/agent-service/src/agent/tools/dataguard/bias-check.test.ts b/agent-service/src/agent/tools/dataguard/__tests__/bias-check.test.ts similarity index 97% rename from agent-service/src/agent/tools/dataguard/bias-check.test.ts rename to agent-service/src/agent/tools/dataguard/__tests__/bias-check.test.ts index 04755ba8558..791637396c1 100644 --- a/agent-service/src/agent/tools/dataguard/bias-check.test.ts +++ b/agent-service/src/agent/tools/dataguard/__tests__/bias-check.test.ts @@ -18,8 +18,8 @@ */ import { describe, expect, test } from "bun:test"; -import { computeBiasCheck } from "./bias-check"; -import type { DatasetView } from "./dataset"; +import { computeBiasCheck } from "../bias-check"; +import type { DatasetView } from "../dataset"; describe("computeBiasCheck", () => { test("identical before/after → 100% retention, no skew", () => { diff --git a/agent-service/src/agent/tools/dataguard/dataguard-session.test.ts b/agent-service/src/agent/tools/dataguard/__tests__/dataguard-session.test.ts similarity index 95% rename from agent-service/src/agent/tools/dataguard/dataguard-session.test.ts rename to agent-service/src/agent/tools/dataguard/__tests__/dataguard-session.test.ts index 96f7c2fa43c..27afdb8346f 100644 --- a/agent-service/src/agent/tools/dataguard/dataguard-session.test.ts +++ b/agent-service/src/agent/tools/dataguard/__tests__/dataguard-session.test.ts @@ -18,9 +18,9 @@ */ import { describe, expect, test } from "bun:test"; -import { DataGuardSession } from "./dataguard-session"; -import type { DatasetView } from "./dataset"; -import type { DataQualityIssue, 
FixProposal } from "../../../types/dataguard"; +import { DataGuardSession } from "../dataguard-session"; +import type { DatasetView } from "../dataset"; +import type { DataQualityIssue, FixProposal } from "../../../../types/dataguard"; function makeIssue(): DataQualityIssue { return { diff --git a/agent-service/src/agent/tools/dataguard/decision-log.test.ts b/agent-service/src/agent/tools/dataguard/__tests__/decision-log.test.ts similarity index 96% rename from agent-service/src/agent/tools/dataguard/decision-log.test.ts rename to agent-service/src/agent/tools/dataguard/__tests__/decision-log.test.ts index 242259d2ecf..f31f1adbaa0 100644 --- a/agent-service/src/agent/tools/dataguard/decision-log.test.ts +++ b/agent-service/src/agent/tools/dataguard/__tests__/decision-log.test.ts @@ -18,8 +18,8 @@ */ import { describe, expect, test } from "bun:test"; -import { serializeDecisionLogCsv } from "./decision-log"; -import type { DecisionLogEntry } from "../../../types/dataguard"; +import { serializeDecisionLogCsv } from "../decision-log"; +import type { DecisionLogEntry } from "../../../../types/dataguard"; function entry(overrides: Partial = {}): DecisionLogEntry { return { diff --git a/agent-service/src/agent/tools/dataguard/profile-dataset.test.ts b/agent-service/src/agent/tools/dataguard/__tests__/profile-dataset.test.ts similarity index 98% rename from agent-service/src/agent/tools/dataguard/profile-dataset.test.ts rename to agent-service/src/agent/tools/dataguard/__tests__/profile-dataset.test.ts index d48d828c98d..0586e338445 100644 --- a/agent-service/src/agent/tools/dataguard/profile-dataset.test.ts +++ b/agent-service/src/agent/tools/dataguard/__tests__/profile-dataset.test.ts @@ -18,8 +18,8 @@ */ import { describe, expect, test } from "bun:test"; -import { profileDataset } from "./profile-dataset"; -import type { DatasetView } from "./dataset"; +import { profileDataset } from "../profile-dataset"; +import type { DatasetView } from "../dataset"; 
describe("profileDataset", () => { test("clean dataset → empty issue list", () => { diff --git a/agent-service/src/agent/tools/dataguard/suggest-fix.test.ts b/agent-service/src/agent/tools/dataguard/__tests__/suggest-fix.test.ts similarity index 97% rename from agent-service/src/agent/tools/dataguard/suggest-fix.test.ts rename to agent-service/src/agent/tools/dataguard/__tests__/suggest-fix.test.ts index 3d3ab4dc632..99edc46304e 100644 --- a/agent-service/src/agent/tools/dataguard/suggest-fix.test.ts +++ b/agent-service/src/agent/tools/dataguard/__tests__/suggest-fix.test.ts @@ -18,8 +18,8 @@ */ import { describe, expect, test } from "bun:test"; -import { suggestFix, type LlmCallFn } from "./suggest-fix"; -import type { DataQualityIssue } from "../../../types/dataguard"; +import { suggestFix, type LlmCallFn } from "../suggest-fix"; +import type { DataQualityIssue } from "../../../../types/dataguard"; function makeIssue(overrides: Partial = {}): DataQualityIssue { return { diff --git a/agent-service/src/agent/tools/dataguard/with-approval.test.ts b/agent-service/src/agent/tools/dataguard/__tests__/with-approval.test.ts similarity index 97% rename from agent-service/src/agent/tools/dataguard/with-approval.test.ts rename to agent-service/src/agent/tools/dataguard/__tests__/with-approval.test.ts index 1fff89cb9fd..1cfbbb0b67b 100644 --- a/agent-service/src/agent/tools/dataguard/with-approval.test.ts +++ b/agent-service/src/agent/tools/dataguard/__tests__/with-approval.test.ts @@ -18,8 +18,8 @@ */ import { describe, expect, test } from "bun:test"; -import { requestApproval, type ApprovalGateway } from "./with-approval"; -import type { FixProposal, IssueType, PermissionDecision, RiskTier } from "../../../types/dataguard"; +import { requestApproval, type ApprovalGateway } from "../with-approval"; +import type { FixProposal, IssueType, PermissionDecision, RiskTier } from "../../../../types/dataguard"; function makeProposal(overrides: Partial = {}): FixProposal { return { 
From 2ad7bfb79781212502a1a0b008419f23dbeaf01a Mon Sep 17 00:00:00 2001 From: eugenegujing Date: Sat, 16 May 2026 00:41:51 -0700 Subject: [PATCH 05/14] feat(dataguard): checklist UI, contract hardening, outlier reshape, end-to-end MVP polish MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Major iteration on top of the four committed DataGuard MVP commits. Ships the auto-trigger checklist as the primary UX (chat flow stays wired for any future DataGuard-via-LLM path but isn't used by the user-facing flow). Detector model: five categories (was six). The z-score outlier detector was dropped — clustered legitimate extremes (e.g. sustained high glucose readings) were being flagged en masse with no good way for the user to say "those are real." The old `out_of_range` was renamed `outlier` and keeps its validRanges-based definition. `duplicate_id` now auto-infers the ID column from name patterns (`id`, `*_id`, `*Id`, `id_*`, `uid`) so the auto-trigger's empty-body `/scan` still catches duplicates without UI configuration. Fix-operation model: `flag` removed entirely (was a no-op against the data; caused LakeFS "No changes detected" errors after Apply). `warning` added to RiskTier — concrete fix, always prompts, no "remember." `replace_value` now supports `rowIndices` targeting (deterministic; wins over value-based `match`), which fixes a class of silent no-ops where LLM-rounded match values didn't equal the actual cells. Suggest-fix prompt explicitly passes `affectedRowIndices` and instructs `rowIndices`-based replace for outliers. Permission contract: `/apply-batch` body schema is `verdict: "allow" | "deny"` with `additionalProperties: false`. The Elysia app is built with `normalize: false` so unknown legacy fields aren't silently stripped before validation. Global onError converts `code === "VALIDATION"` to HTTP 400. Runtime check rejects `{verdict: "deny", remember: true}`. WS decision handler narrowed the same way. 
`modifiedAction` removed everywhere, including the decision-log CSV header (9 columns now). Checklist UX: - Auto-trigger CSV-only (`CSVFileScan`, `ParallelCSVFileScan`). JSON/Parquet were trigger-set members but `loadFromOperatorFile` blindly Papa-parsed, producing garbage. - Floating, draggable panel via Angular CDK; cdkDragBoundary="body" so it can't get lost behind the toolbar. - 📍 locate button per row: cycles through `affectedRowIndices` on repeat clicks (per-row cursor in a `Map`, wraps at length). Tooltip previews the next click position. - Result-panel integration via `DataGuardRowNavigatorService` (ReplaySubject with 500ms TTL for cold-mount). `ResultTableFrameComponent` adds a private `pageRendered$ Subject` so the highlight only fires after the new page actually renders, not on an arbitrary 100ms timer. Cross-operator race, viewport-resize during page swap, columnDef-vs-header drift, destroyed-view NG0911 — all guarded. - After-Apply auto-rescan: the panel re-runs `/scan` against the new dataset version so users see real residue instead of stale entries. - Floating reopen icon when panel is closed & shield is ON. Click → fresh scan. Pipeline concurrency is gated by `currentPipeline: Promise | null`: auto-trigger drops silently on slot conflict (spam suppression); user-initiated awaits the in-flight pipeline (with a "queuing your scan…" toast) — at most one `/scan` POST in flight at a time. - Toolbar 🛡 shield (per-workflow ON/OFF, localStorage). Backend: server normalize:false; `/dataguard/load-demo-dataset` deleted as dead code (frontend never called it). DataGuardSession's `flaggedRows` field plus the post-apply `acknowledgedIssues` split removed alongside `flag`. `missing-detection.ts` is the single source of truth for missing/placeholder checks; `applyFix` threads `ScanOptions.missingTokens` through to `impute`. With-approval gates `warning` identically to `high` (never auto-allows, even with a remembered rule). 
Frontend service ownership: the auto-trigger orchestration subscription moved off `AgentPanelComponent` onto `DataGuardChecklistComponent` (the natural consumer of its output). `selectedCount`/`deniedCount`/`pendingCount` are cached on each state push instead of being three filter walks per CD tick. README_DataGuard.md is the consolidated feature spec, kept in repo root. Tests: 199 pass / 0 fail (419 expect calls), agent-service typecheck clean, frontend `tsc --noEmit` clean. New regression-locks include: `replace_value` with `rowIndices` (no more byte-identical export), `inferIdColumn` for all id-name patterns, "clustered large readings are NOT auto-outliers", warning tier prompts even with remembered rule, modify-verdict + modifiedAction + `{deny, remember:true}` rejection, pipeline serialization (two user-initiated rescans never invoke `/scan` concurrently), per-row locate cycle math. Co-Authored-By: Claude Opus 4.7 --- README_DataGuard.md | 556 ++++++++++++++++ agent-service/demo/README.md | 42 +- agent-service/demo/duplicate_rows_demo.csv | 31 + .../demo/inconsistent_labels_demo.csv | 31 + agent-service/demo/missing_values_demo.csv | 31 + agent-service/demo/outliers_demo.csv | 31 + .../demo/placeholder_values_demo.csv | 31 + agent-service/src/agent/texera-agent.ts | 15 + .../apply-batch-modify-reject.test.ts | 156 +++++ .../__tests__/apply-batch-rescan.test.ts | 111 ++++ .../dataguard/__tests__/apply-fix.test.ts | 140 +++- .../__tests__/dataguard-session.test.ts | 7 - .../__tests__/decision-log-no-modify.test.ts | 128 ++++ .../dataguard/__tests__/decision-log.test.ts | 12 +- .../__tests__/permission-types.test.ts | 121 ++++ .../__tests__/profile-dataset.test.ts | 170 ++++- .../__tests__/with-approval-no-modify.test.ts | 117 ++++ .../dataguard/__tests__/with-approval.test.ts | 27 +- .../src/agent/tools/dataguard/apply-fix.ts | 86 ++- .../tools/dataguard/dataguard-session.ts | 34 +- .../agent/tools/dataguard/dataguard-tools.ts | 27 +- 
.../src/agent/tools/dataguard/decision-log.ts | 2 - .../tools/dataguard/missing-detection.ts | 86 +++ .../agent/tools/dataguard/profile-dataset.ts | 197 ++++-- .../src/agent/tools/dataguard/suggest-fix.ts | 38 +- .../agent/tools/dataguard/with-approval.ts | 11 +- agent-service/src/server.ts | 273 +++++++- agent-service/src/types/dataguard.test.ts | 43 +- agent-service/src/types/dataguard.ts | 18 +- common/config/src/main/resources/gui.conf | 2 +- .../agent-panel/agent-panel.component.ts | 18 +- .../permission-prompt.component.html | 17 +- .../permission-prompt.component.scss | 14 +- .../permission-prompt.component.ts | 24 +- .../dataguard-checklist.component.html | 160 +++++ .../dataguard-checklist.component.scss | 319 +++++++++ .../dataguard-checklist.component.ts | 369 +++++++++++ .../component/menu/menu.component.html | 22 + .../component/menu/menu.component.scss | 18 + .../component/menu/menu.component.ts | 33 +- .../result-table-frame.component.html | 4 +- .../result-table-frame.component.scss | 26 + .../result-table-frame.component.ts | 96 ++- .../component/workspace.component.html | 4 + .../component/workspace.component.ts | 2 + .../workspace/service/agent/agent-types.ts | 2 +- .../workspace/service/agent/agent.service.ts | 5 +- .../data-guard-auto-trigger.service.spec.ts | 259 ++++++++ .../agent/data-guard-auto-trigger.service.ts | 622 ++++++++++++++++-- .../agent/data-guard-results.service.spec.ts | 178 +++++ .../agent/data-guard-results.service.ts | 132 ++++ .../data-guard-row-navigator.service.spec.ts | 122 ++++ .../agent/data-guard-row-navigator.service.ts | 95 +++ .../agent/data-guard-settings.service.ts | 88 +++ 54 files changed, 4854 insertions(+), 349 deletions(-) create mode 100644 README_DataGuard.md create mode 100644 agent-service/demo/duplicate_rows_demo.csv create mode 100644 agent-service/demo/inconsistent_labels_demo.csv create mode 100644 agent-service/demo/missing_values_demo.csv create mode 100644 agent-service/demo/outliers_demo.csv 
create mode 100644 agent-service/demo/placeholder_values_demo.csv create mode 100644 agent-service/src/agent/tools/dataguard/__tests__/apply-batch-modify-reject.test.ts create mode 100644 agent-service/src/agent/tools/dataguard/__tests__/apply-batch-rescan.test.ts create mode 100644 agent-service/src/agent/tools/dataguard/__tests__/decision-log-no-modify.test.ts create mode 100644 agent-service/src/agent/tools/dataguard/__tests__/permission-types.test.ts create mode 100644 agent-service/src/agent/tools/dataguard/__tests__/with-approval-no-modify.test.ts create mode 100644 agent-service/src/agent/tools/dataguard/missing-detection.ts create mode 100644 frontend/src/app/workspace/component/dataguard-checklist/dataguard-checklist.component.html create mode 100644 frontend/src/app/workspace/component/dataguard-checklist/dataguard-checklist.component.scss create mode 100644 frontend/src/app/workspace/component/dataguard-checklist/dataguard-checklist.component.ts create mode 100644 frontend/src/app/workspace/service/agent/data-guard-auto-trigger.service.spec.ts create mode 100644 frontend/src/app/workspace/service/agent/data-guard-results.service.spec.ts create mode 100644 frontend/src/app/workspace/service/agent/data-guard-results.service.ts create mode 100644 frontend/src/app/workspace/service/agent/data-guard-row-navigator.service.spec.ts create mode 100644 frontend/src/app/workspace/service/agent/data-guard-row-navigator.service.ts create mode 100644 frontend/src/app/workspace/service/agent/data-guard-settings.service.ts diff --git a/README_DataGuard.md b/README_DataGuard.md new file mode 100644 index 00000000000..baf0dafbf2a --- /dev/null +++ b/README_DataGuard.md @@ -0,0 +1,556 @@ +# DataGuard — Permission-Gated Data Cleaning for Texera + +> **Tagline:** AI suggests. Humans authorize. Texera records. 
+> +> **One-sentence pitch:** DataGuard is a conversational agent inside Texera that proposes data-cleaning actions one at a time and asks the user's permission before applying each — the Claude Code experience, but for data instead of code. + +--- + +## 1. Problem + +Data cleaning is rarely "just technical." Cleaning decisions can introduce bias, remove rare-but-meaningful cases, or silently change the meaning of a dataset — especially dangerous in scientific or high-stakes data. + +Typical pain points: +- A missing glucose value may not be random. +- An `age = 999` value may be a placeholder, not a real age. +- A duplicate sample ID with conflicting labels may need expert review. +- A statistical outlier may be a meaningful rare case. + +Today, two common workflows both fail in different ways: + +| Approach | Failure mode | +|---|---| +| Manual scripts (pandas in a notebook) | Opaque, hard to audit, no provenance, doesn't scale beyond one person | +| Auto-clean tools | Black-box decisions, no explanation, no human control over high-impact actions | + +**DataGuard's claim:** the *interaction model* is the missing piece, not the algorithms. Treat data-cleaning decisions the way Claude Code treats file edits — **ask permission, explain reasoning, log every decision.** + +--- + +## 2. Why this design + +Texera's execution engine (Amber) is a streaming, actor-based system. It does **not** natively support pausing a workflow mid-execution to wait for a user click on an in-canvas approval table. Building such a "pause-and-await-user" operator would require changes in three places (Amber engine, gRPC control protocol, Angular result panel). 
+
+DataGuard sidesteps all of this by living in `agent-service/` rather than in the workflow graph:
+
+| Layer | Reused |
+|---|---|
+| Conversation state | Existing agent-service session management |
+| Permission UX | Same UI pattern Claude Code uses for tool authorization |
+| LLM gateway | The existing `LLM_ENDPOINT` (OpenAI-compatible) wired into `agent-service` |
+| Data processing | TypeScript pure functions in agent-service — no new operators required |
+| Workflow execution | Existing Texera workflow API (no new gRPC, no Amber changes) |
+
+**No Amber changes, no new operators, no new protocols.**
+
+---
+
+## 3. User experience
+
+### 3.1 Trigger
+
+The user does **nothing special**. When a dataset-reading operator is added to the workflow canvas — currently `CSVFileScan` or `ParallelCSVFileScan` — the auto-trigger fires:
+
+1. Resolves the workflow context (id, per-workflow shield setting).
+2. Finds or creates a per-workflow agent on agent-service.
+3. Reads the operator's `fileName`, fetches the bytes via `DatasetService`, parses with `papaparse`.
+4. POSTs `{columns, rows}` to `/dataguard/dataset` so the agent has the data in memory.
+5. POSTs `/dataguard/scan` — the server runs `profile_dataset` then `suggest_fix` per issue, **bypassing the LLM ReAct loop** (so the LLM can't decide to call `deleteOperator` and vaporize the user's workflow).
+6. Publishes the scan result to `DataGuardResultsService`.
+
+The dedicated `DataGuardChecklistComponent` floating panel slides in. The chat panel is not involved.
+
+### 3.2 The checklist panel
+
+- **Floating, draggable** — `cdkDrag` on the panel, header is the `cdkDragHandle`. Position is session-only (no localStorage persistence). `cdkDragBoundary="body"` keeps the panel inside the viewport so it can't get lost behind the toolbar.
+- **Risk-tier chip** per row (LOW / MEDIUM / HIGH / WARNING) — color coded.
+- **Default verdict per row** — `low` is pre-checked Allow; everything else (`medium` / `high` / `warning`) starts `pending` so the user makes an explicit call. +- **Per-row controls**: checkbox to allow, "Skip" button to deny, "Always do this" remember toggle (hidden for `high` and `warning`, and hidden whenever there's no proposal at all). +- **Bulk actions**: "Fix all" / "Skip all" buttons in the row header. +- **Apply button**: `Fix N & run` posts the batch to `/dataguard/apply-batch`, writes the cleaned data back as a new dataset version, repoints the operator, and auto-runs the workflow. **After Apply succeeds, the panel automatically re-scans the new dataset version** so the user sees real residue (or "all clean") instead of stale entries from before the fix. +- **"Scan again"** footer button (visible when state is `done` or `error`) re-runs DataGuard on the *current* dataset version — supports iterative cleanup (v1 → Apply → v2 → Scan again → Apply → v3 → …). + +### 3.3 Locate this issue → result panel + +Each row's `In column X · affects N row(s)` line is a clickable **📍 locate** button. Clicking it: + +1. Highlights the source `CSVFileScan` operator on the graph. +2. Opens / focuses the Result Panel for that operator. +3. Navigates to the page containing the next affected row and flashes the cell. + +The button **cycles** — every click advances a per-row cursor through `affectedRowIndices`, wrapping back to the first after the last. Each row owns its own cursor (a `Map` on the checklist component), so toggling verdicts elsewhere doesn't reset it. The tooltip previews the next position (`Show next affected row (i of N)`). + +Plumbing: `DataGuardRowNavigatorService` is a `ReplaySubject(1, 500ms)`. The 500 ms buffer covers the cold-mount case where the user clicks locate while the Result Panel is collapsed and `ResultTableFrameComponent` is async-instantiated. 
`ResultTableFrameComponent` subscribes, paginates (via an internal `pageRendered$ Subject` so the flash lands AFTER the new page renders — not after a 100 ms guess), and applies `dg-row-highlight` + `dg-cell-highlight` for 2 s (matched to the SCSS pulse animation). Cross-operator races, viewport resize during page swap, columnDef-vs-header naming drift, and stale closures on a destroyed view are all explicitly guarded. + +### 3.4 Toolbar shield + floating icon + +A `🛡` button in the workflow toolbar toggles DataGuard per-workflow (persisted in localStorage). When the panel is closed (`state === "idle"`) but the shield is ON, a small floating DataGuard icon appears on the canvas. Clicking it always triggers a fresh scan of whatever dataset operator is on the canvas (via `DataGuardAutoTriggerService.rescanAny()`). + +Resolution order for "what to scan?": +1. The previously-scanned operator if it still exists on the canvas. +2. Otherwise the first dataset-reading operator on the canvas. +3. Otherwise warn the user: "drop a dataset operator first." + +**Concurrency control.** The pipeline is gated by `currentPipeline: Promise | null` instead of a boolean. Two regimes: +- **Auto-trigger** (`userInitiated: false`, driven by operator-add / property-change debounce): if the slot is occupied, **drop silently** to preserve the original spam-suppression semantics. +- **User-initiated** (`userInitiated: true`, the floater click or the panel's Scan-again button): if the slot is occupied, **await** the in-flight pipeline (with a "queuing your scan…" toast) then start a fresh one. The panel state flips to `"scanning"` immediately so the user never sees a dead click. At most one `/scan` POST is in flight at a time. + +This means rescan works at any time, even after the user explicitly closed the panel — `state.sourceOperatorId` being gone after `reset()` doesn't break the flow, and a click during a slow scan doesn't double-fire LLM cost. + +--- + +## 4. 
System architecture + +``` +┌───────────────────────────┐ ┌──────────────────────────────────────┐ +│ Texera frontend (Angular) │ chat │ agent-service (Bun / TS) │ +│ │◄─── ws ──┤ │ +│ ┌─────────────────────┐ │ │ TexeraAgent (DataGuard host) │ +│ │ DataGuard checklist │ │ │ ├── DataGuardSession │ +│ │ + floating reopen │ │ REST │ │ • dataset (in-memory) │ +│ │ + toolbar shield │◄──┼──────────┤ │ • issues / proposals / log │ +│ └─────────────────────┘ │ │ │ • auto-allow rules │ +│ ┌─────────────────────┐ │ │ ├── Tools: │ +│ │ Auto-trigger │ │ │ │ • profile_dataset (read-only) │ +│ │ service │ │ │ │ • suggest_fix (read-only) │ +│ │ + DataGuard results │ │ │ │ • apply_fix (mutating) │ +│ │ state │ │ │ │ • write_decision_log │ +│ └─────────────────────┘ │ │ │ • bias_check │ +└───────────────────────────┘ │ └── LLM gateway (LiteLLM) │ + └──────────────┬───────────────────────┘ + │ + ┌───────▼────────┐ + │ Texera storage │ + │ (dataset/ │ + │ file-service) │ + └────────────────┘ +``` + +**Two entry points to the same backend:** + +- **REST (the primary path):** `/scan` and `/apply-batch` are server-driven — no LLM in the loop during the user's interaction. The LLM is invoked exactly once per issue inside `/scan`, to render a structured `FixProposal` from the raw `DataQualityIssue`. This keeps the user's flow deterministic and fast. +- **WS (the chat path):** `` still works if the agent is prompted via chat and the ReAct loop reaches `apply_fix` for a mutating action. The shape is `{type: "decision", stepId, verdict, remember?}`. This path is not used by the auto-trigger but is kept for any future chat-driven DataGuard flow. + +--- + +## 5. 
Detectors (five categories) + +`profile_dataset` runs entirely without an LLM and emits `DataQualityIssue` records of five types: + +| Type | Detection | +|---|---| +| `missing_value` | `null` / empty / configured missing tokens (default: `na`, `n/a`, `null`, `none`, `nan` — case-insensitive, whitespace-trimmed) | +| `placeholder_value` | Numeric sentinels (`999`, `-1`) or string sentinels (`unknown`, `Unknown`) — overridable via `placeholderValues` | +| `duplicate_id` | Honors `idColumn` hint; **falls back to a column-name heuristic** (`id`, `*_id`, `*Id`, `id_*`, `uid`) when no hint is supplied so the auto-trigger's empty-body `/scan` still catches duplicates. Flags repeated IDs (with or without conflicting labels) | +| `outlier` | Requires `validRanges` hint per column (`{min, max}`); flags numeric values outside the range. Skips rows already flagged as placeholders to avoid double-counting | +| `inconsistent_label` | Low-cardinality string columns where `trim+lowercase` keys collide on multiple raw spellings (e.g., `Male` / `male` / `MALE`); picks the most-frequent spelling as canonical | + +> **Note on outlier semantics.** An earlier z-score based detector (flag anything with `|z| > 3`) was deliberately removed: clustered legitimate extremes (e.g. sustained high glucose readings in a clinical dataset) were being flagged en masse and the user had no good way to tell the agent "those are real". The current `outlier` detector requires the caller to opt in by stating a hard range per column — the user owns the definition. + +The `missing` detector is centralized in `missing-detection.ts` — `profile_dataset` and `apply_fix` (impute) share it, so what the profiler flags is exactly what imputation treats as missing. + +--- + +## 6. Risk tiers (four levels) + +`suggest_fix` annotates every proposal with a `RiskTier`. 
The tier governs both the UI (pre-check / badge color / "remember" availability) and the permission gate: + +| Tier | Color | Default checkbox | "Allow & remember"? | Use case | +|---|---|---|---|---| +| `low` | green | pre-checked Allow | yes | Trim whitespace, standardize column names, drop fully empty rows | +| `medium` | yellow | pending (unchecked) | yes | Impute missing values, standardize inconsistent labels | +| `high` | red | pending (unchecked) | **no** | Drop rows, resolve conflicting duplicate IDs | +| `warning` | orange | pending (unchecked) | **no** | Concrete fix exists but the agent specifically wants a human to eyeball it — e.g. clamping an outlier that might be a real extreme value | + +The `warning` tier was introduced to replace the earlier `flag` operation kind (which was a no-op against the data and just recorded row indices on the session). Every proposal now produces a real concrete change to the data; "please review this one manually" is conveyed through the warning tier instead of a no-op operation. This also fixes a downstream LakeFS bug: when every "applied" fix is a real mutation, the exported CSV genuinely differs from the source and the version-create commit succeeds. + +`profile_dataset` default tiers: + +| Issue type | Default tier | +|---|---| +| `missing_value` | medium | +| `placeholder_value` | medium | +| `inconsistent_label` | medium | +| `duplicate_id` | high | +| `outlier` | **warning** | + +`suggest_fix` is allowed to override the default with a strong reason; the LLM prompt explicitly instructs it to prefer clamping via `replace_value` over destructive `drop_rows`, and to set `riskTier="warning"` when the user really should eyeball the fix. + +--- + +## 7. Fix operation kinds (six) + +`FixOperationKind` is the closed enum of mutations `apply_fix` knows how to execute: + +| Kind | Params | Effect | +|---|---|---| +| `replace_value` | `{column, replacement, rowIndices?, match?}` | Swap cells. 
Two targeting modes: `rowIndices` (deterministic, used by outlier proposals) wins when both are present. `match` (value-based) is for cases like "replace every `unknown` with null". `rowIndices` was added because LLM-generated `match` values for numeric outliers silently no-op'd when the LLM rounded the cell value (e.g. `match: 950` vs cell `949.7`), producing byte-identical exports that LakeFS rejected. | +| `drop_rows` | `{rowIndices: number[]}` | Remove rows by index | +| `impute` | `{column, strategy: "mean" \| "median" \| "mode"}` | Fill missing cells; honors session `missingTokens` override | +| `standardize` | `{column, mapping: {from: to}}` | Replace cell values via explicit mapping | +| `trim_whitespace` | `{column}` | Strip leading/trailing whitespace | +| `rename_column` | `{from, to}` | Rename column and rewrite per-row keys | + +`apply_fix` is a **pure function** — it never mutates the input `DatasetView`, just returns a new one. Optional `ApplyOptions.missingTokens` is threaded through from the session's scan options so `impute` treats the same set of cells as missing that the profiler flagged. + +--- + +## 8. Permission model + +Every mutating tool call passes through `requestApproval(gateway, proposal)`: + +``` +verdict resolution: + if riskTier is "high" or "warning": + always prompt the user + else if issueType has an autoAllowRule in the session: + return auto_allow_remembered + else if riskTier is "low": + return auto_allow_low_risk + else: + emit pendingApproval step, wait for user decision via WS +``` + +The `Verdict` union: `"allow" | "deny" | "auto_allow_low_risk" | "auto_allow_remembered"`. **`"modify"` was deliberately cut** — the legacy handler recorded a user-supplied free-text override but still executed the original `operationParams`, which silently lied to users. Modify will return only with a real natural-language → operationParams parser (post-MVP). 
+
+### 8.1 Contract enforcement
+
+The HTTP and WS handlers strictly enforce this contract:
+
+- `/apply-batch` body schema: `verdict: t.Union([t.Literal("allow"), t.Literal("deny")])` with `additionalProperties: false` on each decision object. Unknown fields (e.g., legacy `modifiedAction`) cause a typebox validation rejection.
+- The Elysia app is built with `normalize: false` so unknown fields aren't silently stripped before validation hits.
+- A global `onError` handler converts `code === "VALIDATION"` to HTTP 400 instead of the default 500.
+- A runtime check on `/apply-batch` rejects `{verdict: "deny", remember: true}` with a friendly 400 — `remember` only applies when the user approves a fix.
+- The WS `decision` handler narrows the same way: `verdict?: "allow" | "deny"` only, `modifiedAction` removed, and the handler explicitly rejects an invalid verdict and `deny+remember=true` with an error message.
+
+---
+
+## 9. Decision log
+
+Every approved or denied action is appended to a structured per-session log. The log serializes to RFC-4180 CSV with the 9-column schema:
+
+```
+decision_id, timestamp, issue_type, target_rows, proposed_action,
+user_decision, reason, confidence, applied_at
+```
+
+`applied_at` is empty for denied entries. The CSV is exported by the `write_decision_log` tool (LLM-invocable) or read directly from `session.getDecisionLog()`.
+
+The `modified_action` column was cut alongside the `"modify"` verdict.
+
+---
+
+## 10. Backend API surface
+
+All under `${API_PREFIX}/agents/:id`:
+
+| Method + Path | Purpose |
+|---|---|
+| `POST /dataguard/dataset` | Load `{columns: string[], rows: Record<string, unknown>[]}` into the agent's `DataGuardSession`. Resets per-run state (issues, proposals, decision log). |
+| `POST /dataguard/scan` | Run `profile_dataset` then `suggest_fix` per issue. Body: optional `{idColumn?, validRanges?, placeholderValues?, missingTokens?}`. Persists scan options on the session for the verification re-scan. 
Returns `{issueCount, issues, proposals}`. | +| `POST /dataguard/apply-batch` | Apply the user's checked decisions. Body: `{decisions: [{issueId, verdict: "allow"|"deny", remember?}]}`. Runs `apply_fix` per allowed proposal, records every decision (including denies). Re-profiles the cleaned dataset and returns `{applied, denied, failed, datasetRowCount, results, residualIssues, residualCount}`. | +| `GET /dataguard/export-csv` | Return the in-memory cleaned dataset as a CSV blob. Used by the frontend to push the new version back to the source dataset. | +| `GET /dataguard/session` | Inspect session state (issue list, decision log, auto-allow rules). | +| `WS /agents/:id/react` `{type:"decision",…}` | Resolve a pending-approval step. Used by the chat flow only; the checklist path uses `/apply-batch`. | + +--- + +## 11. Frontend components + +### 11.1 Services (DI singletons, `providedIn: 'root'`) + +| Service | Responsibility | +|---|---| +| `DataGuardAutoTriggerService` | Owns the operator-add / property-change subscription, runs the orchestration pipeline (resolve workflow → load dataset → scan → publish). Exposes `startOrchestration()`, `applyBatch(decisions)`, `rescanCurrent()`, `rescanAny()`. Concurrency-gated by `currentPipeline: Promise \| null` (see §3.4). | +| `DataGuardResultsService` | `BehaviorSubject` that drives the checklist UI. States: `idle → scanning → ready → applying → done / error`. Exposes `setState(patch)`, `updateEntry(issueId, patch)`, `reset()`. | +| `DataGuardSettingsService` | Per-workflow shield ON/OFF, persisted in `localStorage` (`dataguard.enabled.wid.`). Default ON. | +| `DataGuardRowNavigatorService` | `ReplaySubject(1, 500ms)` driving the 📍 locate flow. Includes pure helpers `pageIndexFor(rowIndex, pageSize)` and `nextCycleStep(indices, cursor)`. | +| `AgentService.sendDecision(agentId, stepId, verdict, options)` | WS sender for the chat flow's `{type:"decision",…}` message. 
|
+
+### 11.2 Components
+
+| Component | Role |
+|---|---|
+| `DataGuardChecklistComponent` (standalone) | The floating, draggable checklist panel. Subscribes to `DataGuardResultsService`, owns the orchestration subscription (so it lives whenever the checklist itself can render), renders rows / risk-tier badges / Apply button / Scan-again footer / floating reopen icon. Holds the per-row `locateCursors: Map<string, number>` driving cyclic 📍 navigation. |
+| `PermissionPromptComponent` (standalone) | The inline chat-bubble approval prompt — used by the chat-driven path in `agent-chat`. Allow / Deny / Allow-&-remember (the last is hidden for `high` and `warning`). |
+| `MenuComponent` (modified) | Toolbar 🛡 shield button — toggles `DataGuardSettingsService.isEnabled(wid)`. |
+| `ResultTableFrameComponent` (modified) | Subscribes to `DataGuardRowNavigatorService`. On a locate request, navigates the paginator to the target page and chains off an internal `pageRendered$` Subject so `applyFlash()` only fires after the new page actually renders. 2 s highlight via `HIGHLIGHT_DURATION_MS`; `ngOnDestroy` clears the timer to avoid NG0911. |
+
+The checklist component lives at `bottom: 100px; right: 80px` by default. When dragged elsewhere it stays where the user put it for the session; refreshing returns to the default anchor. The floating reopen icon occupies the same default position.
+
+---
+
+## 12. Auto-trigger dataset operator set
+
+```ts
+private static readonly DATASET_OPERATOR_TYPES = new Set([
+  "CSVFileScan",
+  "ParallelCSVFileScan",
+]);
+```
+
+CSV-only for now — `loadFromOperatorFile` pipes every blob through `Papa.parse`, so adding `JSONFileScan` / `TableFileScan` / Parquet would either crash or produce garbage rows. Per-format parsing is the obvious follow-up.
+
+---
+
+## 13. 
File map + +### 13.1 agent-service + +``` +src/types/dataguard.ts Shared types: RiskTier, Confidence, IssueType, + FixOperationKind, Verdict, DataQualityIssue, + FixProposal, PermissionDecision, DecisionLogEntry, + AutoAllowRule +src/types/dataguard.test.ts Fixture tests (literal-union shapes) +src/types/agent.ts ReActStep.pendingApproval + PendingApproval interface + +src/agent/tools/dataguard/ + dataset.ts DatasetView type + missing-detection.ts Shared isMissing / placeholderHit / toNumber + profile-dataset.ts Five-detector profiler, no LLM; + inferIdColumn helper for auto-detecting + ID columns by name pattern + suggest-fix.ts LLM-driven proposal generator, zod-validated; + prompt passes affectedRowIndices and instructs + rowIndices-based replace_value for outliers + apply-fix.ts Pure-function applier for the six op kinds. + replace_value supports rowIndices targeting; + ApplyOptions.missingTokens honored by impute + with-approval.ts Permission gate (handles low/medium/high/warning; + warning never auto-allows, even with remember rules) + dataguard-session.ts Per-agent state: dataset, issues, proposals, + decisionLog, autoAllowRules, ScanOptions + decision-log.ts 9-col CSV serializer + AI-SDK tool + (modified_action column removed by #11a) + bias-check.ts Per-group retention diff + AI-SDK tool + dataguard-tools.ts AI SDK tool({...}) definitions (5 tools) + __tests__/*.test.ts Test files — apply-fix, suggest-fix, profile-dataset, + with-approval, dataguard-session, decision-log, + bias-check, apply-batch-rescan, plus the + contract-lock files: permission-types, + apply-batch-modify-reject, decision-log-no-modify, + with-approval-no-modify + +src/agent/texera-agent.ts Implements ApprovalGateway; registers DataGuard + tools; exposes public callLlm(prompt) used by /scan +src/server.ts Elysia routes (app built with normalize:false so + additionalProperties:false on body schemas + actually rejects legacy fields): + POST /dataguard/dataset + POST /dataguard/scan + POST 
/dataguard/apply-batch (rejects "modify" + verdict, modifiedAction field, and + {verdict:"deny", remember:true}) + GET /dataguard/export-csv + GET /dataguard/session + WS decision message branch + onError: VALIDATION → 400 +``` + +### 13.2 frontend + +``` +src/app/workspace/ + service/agent/ + agent-types.ts ReActStep.pendingApproval field (mirror of backend); + riskTier includes "warning" + agent.service.ts sendDecision(agentId, stepId, verdict, {remember}) + — WS sender for the chat flow + data-guard-auto-trigger.service.ts Orchestration pipeline (scan / apply-batch / + rescanAny / rescanCurrent), debounced operator-add + + operator-property-change subscription. + Concurrency-gated via currentPipeline Promise: + user-initiated awaits in-flight, auto-trigger + drops silently. After-Apply auto-rescan. + Includes pure helper resolveRescanTarget(state, graph). + data-guard-auto-trigger.service.spec.ts Tests for resolveRescanTarget decision tree + + serialization test (no concurrent /scan) + data-guard-results.service.ts BehaviorSubject driving the UI + data-guard-results.service.spec.ts State-shape tests + data-guard-settings.service.ts Per-workflow shield ON/OFF (localStorage) + data-guard-row-navigator.service.ts ReplaySubject for 📍 locate flow; + pageIndexFor + nextCycleStep pure helpers + data-guard-row-navigator.service.spec.ts Cycle-walk + ReplaySubject TTL + negative-cursor + coercion + serialization edge cases + component/ + dataguard-checklist/ — floating draggable + panel (cdkDrag + cdkDragHandle + cdkDragBoundary), + row checklist, 📍 locate button (cyclic), + category roll-up, Scan-again, floating reopen icon + result-panel/result-table-frame/ Modified to subscribe to row-navigator and flash + cells via pageRendered$ Subject (waits for new + page render, not arbitrary timeout) + agent/agent-panel/ + permission-prompt/ — inline approval UI + used by the chat-driven path + agent-chat/ Renders inside the + step loop when pendingApproval is set + 
agent-panel.component.ts No longer owns the auto-trigger subscription — + that moved to the checklist component + menu/ Toolbar 🛡 shield button (per-workflow toggle) + workspace.component.{ts,html} Mounts +``` + +--- + +## 14. Setup + +DataGuard runs as part of the standard Texera microservices + agent-service stack, plus a local LiteLLM proxy that wraps an LLM provider with an OpenAI-compatible API. + +### 14.1 One-time setup + +```bash +# 1) API key for your LLM provider (e.g., Anthropic) — exported once +echo 'export ANTHROPIC_API_KEY=sk-ant-…your-key…' >> ~/.zshrc +source ~/.zshrc + +# 2) Python venv for LiteLLM proxy +python3.12 -m venv ~/UCI/TexeraProject/venv312 +~/UCI/TexeraProject/venv312/bin/pip install --upgrade pip +~/UCI/TexeraProject/venv312/bin/pip install 'litellm[proxy]' + +# 3) Bun for agent-service +brew install oven-sh/bun/bun + +# 4) Yarn 4 via Corepack for frontend +corepack enable + +# 5) GUI feature flag in Texera config +# Set: common/config/src/main/resources/gui.conf +# copilot-enabled = true +``` + +### 14.2 Daily startup + +```bash +# Terminal 1 — LiteLLM proxy on :4000 (OpenAI-style API over your provider) +source ~/UCI/TexeraProject/venv312/bin/activate +cd ~/UCI/TexeraProject/texera +litellm --config bin/litellm-config.yaml + +# Terminal 2 — Texera Scala microservices +# IntelliJ "texera micro services" run config, or: +# bin/single-node/docker compose up -d + +# Terminal 3 — agent-service +cd ~/UCI/TexeraProject/texera/agent-service +bun install # only if node_modules absent +bun run dev # :3001 with --watch reload + +# Terminal 4 — Frontend +cd ~/UCI/TexeraProject/texera/frontend +yarn install # only if node_modules absent +yarn start # :4200, proxies /api/* per proxy.config.json +``` + +Then open `http://localhost:4200`, sign in, open or create a workflow. + +### 14.3 End-to-end flow + +1. Confirm the 🛡 shield is ON (toolbar — twotone icon = ON, outline = OFF). +2. 
Drop a `CSVFileScan` operator and point it at any dataset in the system. +3. The checklist panel slides in. While `/scan` runs, a loading message replaces the row list (typically a few seconds — one LLM call per issue). +4. Review each row. `LOW` rows are pre-checked; `MEDIUM` / `HIGH` / `WARNING` need an explicit Allow. Click the **📍** on any row to jump to the affected row in the Result Panel — clicking again cycles to the next affected row, wrapping after the last. +5. Click **Fix N & run** — the cleaned data is written back as a new dataset version with a timestamp-suffixed name, the operator is repointed at the new version, and the workflow auto-runs. The panel then **auto-re-scans** the new version so you immediately see whether anything is still left. +6. Click **Scan again** at any time to iterate against the current version. +7. Close the panel. The floating DataGuard icon appears (if the shield is ON) — click it any time to trigger a fresh scan; the panel re-opens immediately even if another scan is in flight (queued, never concurrent). + +--- + +## 15. Testing + +```bash +cd agent-service +bun run typecheck # exit 0 +bun test # 199 pass / 0 fail (419 expect calls) + +cd frontend +npx tsc --noEmit # exit 0 +``` + +Test coverage spans: + +- Types fixtures (12) — verifies the literal unions accept and reject the right members. +- Profile (20+) — per-detector cases including the validRanges-based outlier, the explicit "clustered large readings are NOT auto-outliers" assertion, and the `inferIdColumn` heuristic across all id-name patterns (`id`, `*_id`, `*Id`, `id_*`). +- Suggest (10+) — LLM-response schema validation. +- Apply (16) — every op kind round-trips; original dataset never mutated; `replace_value` with `rowIndices` regression-locks the LakeFS "no changes detected" bug; `missingTokens` override threads through to impute. +- With-approval (7) — low/medium/high/warning gating, the `warning`-with-remembered-rule case, and the buffered-decision race. 
+- Session (8) — recordIssue/recordDecision/auto-allow lifecycle. +- Decision log (6) + decision-log-no-modify (2) — RFC-4180 CSV shape and the post-#11a 9-column schema lock. +- Apply-batch end-to-end (12+) — Modify-verdict rejection, `additionalProperties` rejection, `verdict==="deny" && remember===true` rejection, residual re-scan correctness. +- Permission-types (4) — `@ts-expect-error` locks that `"modify"` and `modifiedAction` cannot type-check anywhere. +- Frontend specs — `DataGuardRowNavigatorService` (cycle math, ReplaySubject TTL, negative-cursor coercion); `DataGuardAutoTriggerService` (resolveRescanTarget decision tree + pipeline serialization proof). + +--- + +## 16. Differentiation + +Closest UX overlap among Texera AI proposals is `mengw15`'s **UDF Copilot** (Claude-Code-style permission UX), but it operates on **code in the Monaco editor**, not data. Complementary, not competing. + +| Project | Object of AI | User | Surface | +|---|---|---|---| +| UDF Copilot | Code | Developer | Monaco editor | +| Macro Operators | Workflow structure | Workflow author | Canvas | +| Self-healing workflows | Workflow JSON | Workflow author | Canvas | +| **DataGuard** | **Data** | **Domain expert / scientist** | **Chat panel + checklist** | + +**Theme positioning:** the only proposal targeting the *Data / AI for Science* track. + +**Sibling feature: WorkflowGuard.** Applies the same permission UX to *workflow edits* (`addOperator` / `modifyOperator` / `deleteOperator`). Independent feature, shared `pendingApproval` mechanism. See `README_WorkflowGuard_Texera.md`. + +--- + +## 17. Why it matters + +Pure AI automation in data cleaning is risky: +- AI may silently remove scientifically meaningful outliers. +- AI may introduce bias by removing data from one group more than another. +- AI may misinterpret placeholder values as real values. +- AI may make irreversible transformations the user never sanctioned. 
+ +DataGuard treats the human as the **decision-maker**, not the **reviewer of a finished job**: +- AI provides suggestions, not final decisions. +- Every mutating action requires explicit authorization (or pre-authorization via "remember", but only at the user's request and only for tiers ≤ medium). +- Each decision is supported by evidence and confidence. +- Every step is logged for audit and reproducibility. +- The workflow is fully replayable from the decision log. + +**The interaction model is the contribution.** DataGuard demonstrates that Claude Code's permission-based UX — already proven for code — translates naturally to data work, and is *especially* valuable in scientific contexts where reversibility and trust matter most. + +--- + +## 18. HCI contribution + +DataGuard contributes a concrete instance of: + +``` +AI detects → AI explains with evidence → AI proposes specific action + ↓ +Human decides (Allow / Allow & remember / Deny) + ↓ +System applies (only if approved) → System records → Continue +``` + +Relevant concepts: +- Permission-based AI agency (Claude Code, MCP tool authorization). +- Trust calibration through evidence + confidence display. +- Risk-tiered auto-apply (low-risk transparency vs. high-risk gating, plus the `warning` tier for "concrete fix, but verify"). +- Decision provenance and audit trail. +- Reproducibility via replayable decision logs. +- Mixed-initiative interaction with the human always at the final boundary. + +--- + +## 19. Post-MVP follow-ups + +- **JSON / Parquet operator support.** Auto-trigger is intentionally narrowed to `CSVFileScan` + `ParallelCSVFileScan`. Adding `JSONFileScan` / `TableFileScan` / Parquet needs `loadFromOperatorFile` to branch by suffix instead of force-`Papa.parse`-ing every blob. +- **Modify verdict.** Currently cut. Returns only with a real natural-language → `operationParams` parser; the legacy "modify" recorded a free-text override but executed the original params, which silently lied. 
+- **Iceberg-backed decision log.** Via the existing Lakekeeper integration. Currently CSV. +- **`run_cleaning_workflow` tool.** Distributed cleaning by delegating to a Texera workflow for datasets that don't fit in memory. +- **`--replay decision_log.csv`.** Reproduce a cleaned dataset from a saved log without LLM calls. +- **System-prompt switch when DataGuard is active.** If we want the chat path back, the agent's system prompt should temporarily become DataGuard-focused (currently workflow-centric). +- **Disabled tools per agent.** Pass `disabledTools: ["addOperator", …]` when the auto-trigger creates an agent, so even an accidental chat doesn't risk workflow mutation. +- **Bias-check banner in the panel.** Currently `bias_check.ts` produces structured output but only the chat path consumes it. +- **Persist drag position across refresh.** Today it resets to bottom-right on reload. +- **`pageRendered$` integration cleanup.** The result-table-frame change exposes a private completion signal; a small refactor could publish it as a public `Observable` so other consumers (e.g. a "scroll to row N" feature) can subscribe. diff --git a/agent-service/demo/README.md b/agent-service/demo/README.md index 968987a0703..e148521626f 100644 --- a/agent-service/demo/README.md +++ b/agent-service/demo/README.md @@ -14,7 +14,7 @@ column (`diabetic_outcome`) is the label. 
| 4 | `missing_value` | S005, S007, S009, S014 | empty `glucose` in Group A (imbalanced) | | 5 | `missing_value` | S045, S046, S048 | `age = "N/A"`, `glucose = " "`, `glucose = "null"` | | 6 | `duplicate_id` | S001, S017 | sample_id repeats with conflicting `diabetic_outcome` | -| 7 | `out_of_range` | S041, S042, S044 | `bmi > 60` (clinical max ~70 — possibly real, possibly error) | +| 7 | `outlier` | S041, S042, S044 | `bmi > 60` outside user-supplied `validRanges` (possible real extreme — flagged as `warning` tier) | ## How to load into DataGuard @@ -28,13 +28,43 @@ curl -X POST http://localhost:8000/api/agents//dataguard/dataset \ }')" ``` -Or via the demo script (Step 13's frontend auto-trigger handles this in the -real flow — once a `CSVFileScan` operator is added that references this file, -DataGuard auto-launches). +The frontend auto-trigger handles this in the real flow — once a `CSVFileScan` +operator is added that references this file, DataGuard auto-launches. + +## Single-category demo files + +For testing one detector at a time, each of these CSVs concentrates pollution +in a single category so it's obvious which detector is firing: + +| File | Issue category | What's wrong | +|---|---|---| +| `missing_values_demo.csv` | `missing_value` | empty / `N/A` / `NA` / `null` cells across multiple columns | +| `placeholder_values_demo.csv` | `placeholder_value` | `999`, `-1`, `Unknown` / `unknown` sentinels | +| `duplicate_rows_demo.csv` | `duplicate_id` | repeated `sample_id`s, some with conflicting outcomes | +| `outliers_demo.csv` | `outlier` | negative ages, BMI > 200, blood pressure > 250 — fires only when `validRanges` is supplied at scan time | +| `inconsistent_labels_demo.csv` | `inconsistent_label` | `Male` / `male` / `MALE` and `Female` / `female` / `FEMALE` mixed | + +The `outliers_demo.csv` requires `validRanges` to be set when scanning (the +profiler does not auto-detect numerical outliers via z-score — that variant was +removed because it flagged 
legitimate clustered extremes as errors). The other +four fire on default scan options. + +Suggested `validRanges` for the outlier demo: + +```json +{ + "age": { "min": 0, "max": 120 }, + "bmi": { "min": 10, "max": 60 }, + "blood_pressure": { "min": 40, "max": 200 } +} +``` ## Bias-check expectation Group A: 22 rows. Group B: 23 rows. After cleaning, missingness imbalance (more empties in A) means naive imputation drops ~18% of A but only ~4% of B -— DataGuard surfaces this and recommends `flag` instead of `impute` for the -missing-glucose issue (the §5 storyboard "Modify" beat). +— DataGuard surfaces this and proposes a `replace_value` fix tagged with +`riskTier: "warning"` so the user explicitly confirms instead of letting +imputation run silently. The earlier `flag` operation kind was removed — +every fix is now a concrete change, and "please review manually" is conveyed +through the warning tier instead. diff --git a/agent-service/demo/duplicate_rows_demo.csv b/agent-service/demo/duplicate_rows_demo.csv new file mode 100644 index 00000000000..99bc4f4586a --- /dev/null +++ b/agent-service/demo/duplicate_rows_demo.csv @@ -0,0 +1,31 @@ +sample_id,age,glucose,bmi,blood_pressure,group,diabetic_outcome +S001,45,110,28.1,80,A,0 +S002,52,140,30.5,85,A,1 +S003,38,95,24.0,70,A,0 +S004,46,130,29.8,82,A,1 +S005,41,115,27.5,78,A,0 +S006,47,125,28.0,80,A,1 +S007,55,132,30.0,85,A,1 +S008,49,118,28.9,82,A,0 +S009,42,114,26.7,75,A,0 +S010,53,135,29.1,88,A,1 +S001,45,110,28.1,80,A,0 +S002,52,142,30.5,85,A,1 +S007,55,132,30.0,85,A,0 +S011,46,108,27.3,79,A,0 +S012,44,142,30.8,87,A,1 +S013,43,112,26.4,76,A,0 +S014,51,128,29.5,84,A,1 +S015,48,120,28.6,81,A,0 +S016,40,98,25.2,72,A,0 +S017,57,148,32.7,92,A,1 +S017,57,148,32.7,92,A,1 +S018,44,115,27.8,79,A,0 +S019,50,128,30.0,85,A,1 +S020,39,102,24.9,73,A,0 +S004,46,130,29.8,82,A,1 +S021,46,113,28.4,80,B,0 +S022,54,138,31.5,88,B,1 +S023,42,108,27.0,77,B,0 +S024,48,122,29.3,82,B,1 +S010,53,135,29.1,88,A,0 diff --git 
a/agent-service/demo/inconsistent_labels_demo.csv b/agent-service/demo/inconsistent_labels_demo.csv new file mode 100644 index 00000000000..bc8759a93c5 --- /dev/null +++ b/agent-service/demo/inconsistent_labels_demo.csv @@ -0,0 +1,31 @@ +sample_id,age,glucose,bmi,gender,group,diabetic_outcome +S001,45,110,28.1,Male,A,0 +S002,52,140,30.5,Female,A,1 +S003,38,95,24.0,male,A,0 +S004,46,130,29.8,Female,A,1 +S005,41,115,27.5,MALE,A,0 +S006,47,125,28.0,female,A,1 +S007,55,132,30.0,Male,A,1 +S008,49,118,28.9,Female,A,0 +S009,42,114,26.7,Male,A,0 +S010,53,135,29.1,FEMALE,A,1 +S011,46,108,27.3,Male,A,0 +S012,44,142,30.8,Female,A,1 +S013,43,112,26.4,male,A,0 +S014,51,128,29.5,Female,A,1 +S015,48,120,28.6,Male,A,0 +S016,40,98,25.2,Female,A,0 +S017,57,148,32.7,Male,A,1 +S018,44,115,27.8,Female,A,0 +S019,50,128,30.0,Male,A,1 +S020,39,102,24.9,Female,A,0 +S021,46,113,28.4,Male,B,0 +S022,54,138,31.5,Female,B,1 +S023,42,108,27.0,Male,B,0 +S024,48,122,29.3,female,B,1 +S025,45,116,28.7,Male,B,0 +S026,52,134,30.6,Female,B,1 +S027,41,105,25.8,Male,B,0 +S028,49,124,29.0,Female,B,1 +S029,47,118,28.2,male,B,0 +S030,43,110,26.9,Female,B,0 diff --git a/agent-service/demo/missing_values_demo.csv b/agent-service/demo/missing_values_demo.csv new file mode 100644 index 00000000000..07d1cfab2a0 --- /dev/null +++ b/agent-service/demo/missing_values_demo.csv @@ -0,0 +1,31 @@ +sample_id,age,glucose,bmi,blood_pressure,group,diabetic_outcome +S001,45,110,28.1,80,A,0 +S002,52,140,30.5,85,A,1 +S003,38,95,24.0,70,A,0 +S004,,130,29.8,82,A,1 +S005,41,,27.5,78,A,0 +S006,47,125,,80,A,1 +S007,55,,33.0,90,A,1 +S008,49,118,28.9,,A,0 +S009,42,,26.7,75,A,0 +S010,53,135,29.1,88,A,1 +S011,46,108,27.3,79,A,0 +S012,N/A,142,30.8,87,A,1 +S013,43,112,26.4,76,A,0 +S014,51,NA,29.5,84,A,1 +S015,48,120,28.6,81,A,0 +S016,40,98,25.2,72,A,0 +S017,57,148,32.7,92,A,1 +S018,44,115,27.8,79,A,0 +S019,50,128,30.0,85,A,1 +S020,39,102,24.9,73,A,0 +S021,46,113,28.4,80,B,0 +S022,54,138,31.5,88,B,1 +S023,42,108,27.0,77,B,0 
+S024,48,122,29.3,82,B,1 +S025,45,116,28.7,79,B,0 +S026,null,134,30.6,86,B,1 +S027,41,105,25.8,74,B,0 +S028,49,124,29.0,83,B,1 +S029,47,118,28.2,81,B,0 +S030,43,110,26.9,76,B,0 diff --git a/agent-service/demo/outliers_demo.csv b/agent-service/demo/outliers_demo.csv new file mode 100644 index 00000000000..2ddd4320679 --- /dev/null +++ b/agent-service/demo/outliers_demo.csv @@ -0,0 +1,31 @@ +sample_id,age,glucose,bmi,blood_pressure,group,diabetic_outcome +S001,45,110,28.1,80,A,0 +S002,52,140,30.5,85,A,1 +S003,38,95,24.0,70,A,0 +S004,200,130,29.8,82,A,1 +S005,41,115,27.5,78,A,0 +S006,-5,125,28.0,80,A,1 +S007,55,132,300,85,A,1 +S008,49,118,28.9,82,A,0 +S009,42,114,26.7,250,A,0 +S010,53,135,29.1,88,A,1 +S011,46,108,27.3,79,A,0 +S012,44,142,30.8,87,A,1 +S013,43,112,26.4,76,A,0 +S014,51,128,29.5,84,A,1 +S015,180,120,28.6,81,A,0 +S016,40,98,25.2,72,A,0 +S017,57,148,500,92,A,1 +S018,44,115,27.8,79,A,0 +S019,50,128,30.0,85,A,1 +S020,39,102,24.9,73,A,0 +S021,46,113,28.4,80,B,0 +S022,54,138,31.5,300,B,1 +S023,42,108,27.0,77,B,0 +S024,48,122,29.3,82,B,1 +S025,45,116,28.7,79,B,0 +S026,52,134,30.6,86,B,1 +S027,-2,105,25.8,74,B,0 +S028,49,124,29.0,83,B,1 +S029,47,118,28.2,81,B,0 +S030,43,110,26.9,76,B,0 diff --git a/agent-service/demo/placeholder_values_demo.csv b/agent-service/demo/placeholder_values_demo.csv new file mode 100644 index 00000000000..00d16b50df4 --- /dev/null +++ b/agent-service/demo/placeholder_values_demo.csv @@ -0,0 +1,31 @@ +sample_id,age,glucose,bmi,blood_pressure,group,diabetic_outcome +S001,45,110,28.1,80,A,0 +S002,52,140,30.5,85,A,1 +S003,38,95,24.0,70,A,0 +S004,999,130,29.8,82,A,1 +S005,41,999,27.5,78,A,0 +S006,47,125,28.0,80,A,1 +S007,Unknown,128,30.0,85,A,1 +S008,49,118,28.9,82,A,0 +S009,42,114,26.7,75,A,0 +S010,53,135,29.1,88,A,1 +S011,46,108,27.3,79,A,0 +S012,999,142,30.8,87,A,1 +S013,43,112,26.4,76,A,0 +S014,51,unknown,29.5,84,A,1 +S015,48,120,28.6,81,A,0 +S016,40,98,25.2,72,A,0 +S017,57,148,32.7,92,A,1 +S018,44,115,27.8,79,A,0 
+S019,50,128,30.0,85,A,1 +S020,39,102,24.9,73,A,0 +S021,46,113,-1,80,B,0 +S022,54,138,31.5,88,B,1 +S023,42,108,27.0,77,B,0 +S024,48,122,-1,82,B,1 +S025,45,116,28.7,-1,B,0 +S026,52,134,30.6,86,B,1 +S027,Unknown,105,25.8,74,B,0 +S028,49,124,29.0,83,B,1 +S029,47,118,28.2,81,B,0 +S030,43,110,26.9,76,B,0 diff --git a/agent-service/src/agent/texera-agent.ts b/agent-service/src/agent/texera-agent.ts index f70cbadc358..f2bc6a2736a 100644 --- a/agent-service/src/agent/texera-agent.ts +++ b/agent-service/src/agent/texera-agent.ts @@ -272,6 +272,21 @@ export class TexeraAgent implements ApprovalGateway { return this.state; } + /** + * Stateless LLM call used by DataGuard's server-driven /scan endpoint. + * Bypasses the ReAct loop (no tool calls, no step recording) — just sends + * a prompt to the configured model and returns its text response. Used to + * generate FixProposals server-side without going through the chat panel. + */ + public async callLlm(prompt: string): Promise { + const result = await generateText({ + model: this.model, + prompt, + temperature: 0.2, + }); + return result.text; + } + getWorkflowState(): WorkflowState { return this.workflowState; } diff --git a/agent-service/src/agent/tools/dataguard/__tests__/apply-batch-modify-reject.test.ts b/agent-service/src/agent/tools/dataguard/__tests__/apply-batch-modify-reject.test.ts new file mode 100644 index 00000000000..f5bfec93c4f --- /dev/null +++ b/agent-service/src/agent/tools/dataguard/__tests__/apply-batch-modify-reject.test.ts @@ -0,0 +1,156 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +// Contract tests for cutting the Modify verdict (task #11a / #15) and for +// the remember-flag scope rule (task #12). The HTTP body schema on +// POST /api/agents/:id/dataguard/apply-batch must: +// +// • reject verdict: "modify" +// • reject the modifiedAction field (no longer part of the contract) +// • reject { verdict: "deny", remember: true } — remember is only meaningful for "allow" +// • still accept { verdict: "allow", remember: true } and { verdict: "deny" } +// +// All assertions are at the schema layer (Elysia body validation runs before +// the handler), so we don't need a real loaded dataset or LLM-derived +// proposals to exercise them. 
+ +import { beforeEach, describe, expect, test } from "bun:test"; +import { buildApp, _resetAgentStoreForTests } from "../../../../server"; +import { env } from "../../../../config/env"; + +const API = env.API_PREFIX; +const app = buildApp(); + +function url(path: string): string { + return `http://localhost${path}`; +} + +async function postJson(path: string, body: unknown): Promise<Response> { + return app.handle( + new Request(url(path), { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify(body), + }) + ); +} + +async function readJson<T>(res: Response): Promise<T> { + return (await res.json()) as T; +} + +async function createAgent(): Promise<string> { + const res = await postJson(`${API}/agents`, { modelType: "test-model" }); + const body = await readJson<{ id: string }>(res); + return body.id; +} + +beforeEach(() => { + _resetAgentStoreForTests(); +}); + +describe(`POST ${API}/agents/:id/dataguard/apply-batch — Modify verdict cut (#11a)`, () => { + test("rejects verdict: \"modify\" with a 4xx body-schema error", async () => { + const id = await createAgent(); + const res = await postJson(`${API}/agents/${id}/dataguard/apply-batch`, { + decisions: [{ issueId: "iss-1", verdict: "modify" }], + }); + expect(res.status).toBeGreaterThanOrEqual(400); + expect(res.status).toBeLessThan(500); + }); + + test("rejects unknown field `modifiedAction` on a decision entry", async () => { + const id = await createAgent(); + const res = await postJson(`${API}/agents/${id}/dataguard/apply-batch`, { + decisions: [ + { issueId: "iss-1", verdict: "allow", modifiedAction: "Flag instead of replace" }, + ], + }); + expect(res.status).toBeGreaterThanOrEqual(400); + expect(res.status).toBeLessThan(500); + }); + + test("still accepts verdict: \"allow\" (baseline — parity check that the cut didn't over-reach)", async () => { + const id = await createAgent(); + const res = await postJson(`${API}/agents/${id}/dataguard/apply-batch`, { + decisions: [{ issueId: 
"iss-not-loaded", verdict: "allow" }], + }); + // No proposal recorded for this issueId, so the handler returns 200 with + // a per-result error string — the SCHEMA accepted the body, which is the + // point of this test. + expect(res.status).toBe(200); + }); + + test("still accepts verdict: \"deny\" (baseline)", async () => { + const id = await createAgent(); + const res = await postJson(`${API}/agents/${id}/dataguard/apply-batch`, { + decisions: [{ issueId: "iss-not-loaded", verdict: "deny" }], + }); + expect(res.status).toBe(200); + }); + + test("rejects a mixed batch where ANY decision uses verdict: \"modify\"", async () => { + const id = await createAgent(); + const res = await postJson(`${API}/agents/${id}/dataguard/apply-batch`, { + decisions: [ + { issueId: "iss-1", verdict: "allow" }, + { issueId: "iss-2", verdict: "modify" }, + { issueId: "iss-3", verdict: "deny" }, + ], + }); + expect(res.status).toBeGreaterThanOrEqual(400); + expect(res.status).toBeLessThan(500); + }); +}); + +describe(`POST ${API}/agents/:id/dataguard/apply-batch — remember flag scope (#12)`, () => { + test("rejects { verdict: \"deny\", remember: true } — remember only applies to allow", async () => { + const id = await createAgent(); + const res = await postJson(`${API}/agents/${id}/dataguard/apply-batch`, { + decisions: [{ issueId: "iss-1", verdict: "deny", remember: true }], + }); + expect(res.status).toBeGreaterThanOrEqual(400); + expect(res.status).toBeLessThan(500); + }); + + test("accepts { verdict: \"allow\", remember: true } (baseline)", async () => { + const id = await createAgent(); + const res = await postJson(`${API}/agents/${id}/dataguard/apply-batch`, { + decisions: [{ issueId: "iss-not-loaded", verdict: "allow", remember: true }], + }); + // Same as above — handler can't find the proposal but the schema accepted. 
+ expect(res.status).toBe(200); + }); + + test("accepts { verdict: \"deny\", remember: false } — only `remember: true` + deny is the forbidden combo", async () => { + const id = await createAgent(); + const res = await postJson(`${API}/agents/${id}/dataguard/apply-batch`, { + decisions: [{ issueId: "iss-not-loaded", verdict: "deny", remember: false }], + }); + expect(res.status).toBe(200); + }); + + test("accepts { verdict: \"deny\" } with `remember` omitted entirely", async () => { + const id = await createAgent(); + const res = await postJson(`${API}/agents/${id}/dataguard/apply-batch`, { + decisions: [{ issueId: "iss-not-loaded", verdict: "deny" }], + }); + expect(res.status).toBe(200); + }); +}); diff --git a/agent-service/src/agent/tools/dataguard/__tests__/apply-batch-rescan.test.ts b/agent-service/src/agent/tools/dataguard/__tests__/apply-batch-rescan.test.ts new file mode 100644 index 00000000000..22461c0d799 --- /dev/null +++ b/agent-service/src/agent/tools/dataguard/__tests__/apply-batch-rescan.test.ts @@ -0,0 +1,111 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +// Verification re-scan contract: POST /apply-batch must re-run profileDataset +// on the cleaned dataset and return residualIssues. The UI uses this to show +// users honest residue instead of silently claiming success. + +import { beforeEach, describe, expect, test } from "bun:test"; +import { buildApp, _resetAgentStoreForTests, _getAgentForTests } from "../../../../server"; +import { env } from "../../../../config/env"; +import type { DataQualityIssue } from "../../../../types/dataguard"; + +const API = env.API_PREFIX; +const app = buildApp(); + +function url(path: string): string { + return `http://localhost${path}`; +} + +async function postJson(path: string, body: unknown): Promise<Response> { + return app.handle( + new Request(url(path), { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify(body), + }) + ); +} + +async function readJson<T>(res: Response): Promise<T> { + return (await res.json()) as T; +} + +async function createAgent(): Promise<string> { + const res = await postJson(`${API}/agents`, { modelType: "test-model" }); + const body = await readJson<{ id: string }>(res); + return body.id; +} + +beforeEach(() => { + _resetAgentStoreForTests(); +}); + +describe(`POST ${API}/agents/:id/dataguard/apply-batch — verification re-scan`, () => { + test("response includes residualIssues + residualCount fields", async () => { + const id = await createAgent(); + const agent = _getAgentForTests(id)!; + agent.getDataGuardSession().setDataset({ + columns: ["x"], + rows: [{ x: 1 }], + }); + const res = await postJson(`${API}/agents/${id}/dataguard/apply-batch`, { + decisions: [], + }); + expect(res.status).toBe(200); + const body = await readJson<{ residualIssues: unknown; residualCount: unknown }>(res); + expect(Array.isArray(body.residualIssues)).toBe(true); + expect(typeof body.residualCount).toBe("number"); + }); + + test("residualIssues empty when cleaned dataset has nothing left to flag", async () => { + const id = await createAgent(); 
+ const agent = _getAgentForTests(id)!; + const session = agent.getDataGuardSession(); + // Pristine dataset → no proposals to apply, profiler finds nothing. + session.setDataset({ + columns: ["age"], + rows: [{ age: 30 }, { age: 40 }, { age: 50 }], + }); + const res = await postJson(`${API}/agents/${id}/dataguard/apply-batch`, { + decisions: [], + }); + const body = await readJson<{ residualCount: number; residualIssues: DataQualityIssue[] }>(res); + expect(body.residualCount).toBe(0); + expect(body.residualIssues).toEqual([]); + }); + + test("residualIssues surfaces leftovers when proposals leave data dirty", async () => { + const id = await createAgent(); + const agent = _getAgentForTests(id)!; + const session = agent.getDataGuardSession(); + // Dataset with a placeholder "999" the user denied — re-scan should still + // flag it because nothing was fixed. + session.setDataset({ + columns: ["age"], + rows: [{ age: 30 }, { age: 999 }, { age: 40 }], + }); + const res = await postJson(`${API}/agents/${id}/dataguard/apply-batch`, { + decisions: [], + }); + const body = await readJson<{ residualCount: number; residualIssues: DataQualityIssue[] }>(res); + expect(body.residualCount).toBeGreaterThan(0); + expect(body.residualIssues.some(i => i.issueType === "placeholder_value")).toBe(true); + }); +}); diff --git a/agent-service/src/agent/tools/dataguard/__tests__/apply-fix.test.ts b/agent-service/src/agent/tools/dataguard/__tests__/apply-fix.test.ts index 2f7e706b380..e222b382de0 100644 --- a/agent-service/src/agent/tools/dataguard/__tests__/apply-fix.test.ts +++ b/agent-service/src/agent/tools/dataguard/__tests__/apply-fix.test.ts @@ -57,6 +57,54 @@ describe("applyFix", () => { expect(result.dataset.rows[3].age).toBe(null); }); + test("replace_value with rowIndices: targets rows by index, ignores cell value", () => { + // Regression for the "No changes detected in dataset" LakeFS error: when + // the LLM proposed `match` that didn't equal the cell exactly (rounding, + 
// string-vs-number, etc.), cellEquals silently no-op'd and the exported + // CSV was byte-identical → version commit aborted. rowIndices is the + // deterministic escape hatch used by outlier proposals. + const ds: DatasetView = { + columns: ["glucose"], + rows: [{ glucose: 100 }, { glucose: 949.7 }, { glucose: 120 }, { glucose: 815.3 }], + }; + const result = applyFix( + ds, + makeProposal({ + operationKind: "replace_value", + operationParams: { column: "glucose", rowIndices: [1, 3], replacement: 110 }, + }) + ); + expect(result.rowsAffected).toBe(2); + expect(result.dataset.rows[0].glucose).toBe(100); + expect(result.dataset.rows[1].glucose).toBe(110); + expect(result.dataset.rows[2].glucose).toBe(120); + expect(result.dataset.rows[3].glucose).toBe(110); + }); + + test("replace_value: rowIndices wins when both rowIndices and match are present", () => { + // Deterministic precedence: if both are supplied (legacy LLM output), + // honor the index-based targeting since match is the fragile path. 
+ const ds: DatasetView = { + columns: ["x"], + rows: [{ x: 1 }, { x: 2 }, { x: 3 }], + }; + const result = applyFix( + ds, + makeProposal({ + operationKind: "replace_value", + operationParams: { + column: "x", + rowIndices: [0], + match: 999, // would match nothing — only rowIndices should win + replacement: 99, + }, + }) + ); + expect(result.rowsAffected).toBe(1); + expect(result.dataset.rows[0].x).toBe(99); + expect(result.dataset.rows[1].x).toBe(2); + }); + test("replace_value: original dataset is not mutated", () => { const ds: DatasetView = { columns: ["age"], @@ -140,41 +188,105 @@ describe("applyFix", () => { expect(result.dataset.rows[2].v).toBe(4); }); - test("impute mode: fills missing with most common string", () => { + test("impute: treats profiler missing-tokens (N/A, NULL, null, Unknown, whitespace) as missing", () => { + // Regression: apply-fix's isMissing used to only recognize null/undefined/NaN/"" + // while the profiler flagged "N/A", "NULL", "null", "Unknown" as missing. Result: + // after Fix-and-run, the cleaned CSV still contained those literal tokens because + // impute silently skipped them. Both sides must agree. const ds: DatasetView = { - columns: ["c"], + columns: ["glucose"], rows: [ - { c: "A" }, { c: "A" }, { c: "B" }, { c: null }, { c: "" }, + { glucose: 100 }, + { glucose: "NULL" }, + { glucose: 120 }, + { glucose: "null" }, + { glucose: "N/A" }, + { glucose: " " }, + { glucose: 140 }, ], }; const result = applyFix( ds, makeProposal({ operationKind: "impute", - operationParams: { column: "c", strategy: "mode" }, + operationParams: { column: "glucose", strategy: "median" }, + }) + ); + // 4 missing tokens replaced; non-missing observations [100, 120, 140] → median 120. 
+ expect(result.rowsAffected).toBe(4); + expect(result.dataset.rows[1].glucose).toBe(120); + expect(result.dataset.rows[3].glucose).toBe(120); + expect(result.dataset.rows[4].glucose).toBe(120); + expect(result.dataset.rows[5].glucose).toBe(120); + }); + + test("impute mode: missing-token strings are not counted as mode candidates", () => { + // "NULL" appearing twice must not be voted the mode just because it's the + // most frequent string — it's a missing-marker, not data. + const ds: DatasetView = { + columns: ["group"], + rows: [ + { group: "A" }, + { group: "NULL" }, + { group: "A" }, + { group: "NULL" }, + { group: "B" }, + { group: null }, + ], + }; + const result = applyFix( + ds, + makeProposal({ + operationKind: "impute", + operationParams: { column: "group", strategy: "mode" }, }) ); + expect(result.dataset.rows[1].group).toBe("A"); + expect(result.dataset.rows[3].group).toBe("A"); + expect(result.dataset.rows[5].group).toBe("A"); + }); + + test("impute respects session-supplied missingTokens override", () => { + // Regression for the missingTokens-not-threaded bug: a user who set + // {missingTokens: ["xyz"]} at scan time would have rows whose value is + // literally "xyz" flagged by the profiler but silently skipped by impute + // (which only knew about the default tokens). Threading ApplyOptions + // through fixes this. + const ds: DatasetView = { + columns: ["v"], + rows: [{ v: 1 }, { v: "xyz" }, { v: 3 }, { v: "xyz" }, { v: 5 }], + }; + const result = applyFix( + ds, + makeProposal({ + operationKind: "impute", + operationParams: { column: "v", strategy: "median" }, + }), + { missingTokens: ["xyz"] } + ); + // Non-missing values [1, 3, 5] → median 3, both "xyz" cells replaced. 
expect(result.rowsAffected).toBe(2); - expect(result.dataset.rows[3].c).toBe("A"); - expect(result.dataset.rows[4].c).toBe("A"); + expect(result.dataset.rows[1].v).toBe(3); + expect(result.dataset.rows[3].v).toBe(3); }); - test("flag: does not mutate rows, populates flaggedRows", () => { + test("impute mode: fills missing with most common string", () => { const ds: DatasetView = { - columns: ["x"], - rows: [{ x: 1 }, { x: 2 }, { x: 3 }], + columns: ["c"], + rows: [ + { c: "A" }, { c: "A" }, { c: "B" }, { c: null }, { c: "" }, + ], }; const result = applyFix( ds, makeProposal({ - operationKind: "flag", - operationParams: { rowIndices: [0, 2] }, + operationKind: "impute", + operationParams: { column: "c", strategy: "mode" }, }) ); expect(result.rowsAffected).toBe(2); - expect(result.flaggedRows).toEqual([0, 2]); - expect(result.dataset.rows[0].x).toBe(1); - expect(result.dataset.rows[2].x).toBe(3); + expect(result.dataset.rows[3].c).toBe("A"); + expect(result.dataset.rows[4].c).toBe("A"); }); test("trim_whitespace: trims string cells in target column", () => { diff --git a/agent-service/src/agent/tools/dataguard/__tests__/dataguard-session.test.ts b/agent-service/src/agent/tools/dataguard/__tests__/dataguard-session.test.ts index 27afdb8346f..389858105bf 100644 --- a/agent-service/src/agent/tools/dataguard/__tests__/dataguard-session.test.ts +++ b/agent-service/src/agent/tools/dataguard/__tests__/dataguard-session.test.ts @@ -58,7 +58,6 @@ describe("DataGuardSession", () => { expect(s.getDataset()).toBe(ds); expect(s.getIssues()).toEqual([]); expect(s.getDecisionLog()).toEqual([]); - expect(s.getFlaggedRows()).toEqual([]); }); test("recordIssue accumulates and dedupes by issueId", () => { @@ -110,10 +109,4 @@ describe("DataGuardSession", () => { expect(s.matchesAutoAllowRule("placeholder_value")).toBe(false); }); - test("addFlaggedRows merges + dedupes + sorts", () => { - const s = new DataGuardSession(); - s.addFlaggedRows([3, 1, 2]); - s.addFlaggedRows([2, 5]); - 
expect(s.getFlaggedRows()).toEqual([1, 2, 3, 5]); - }); }); diff --git a/agent-service/src/agent/tools/dataguard/__tests__/decision-log-no-modify.test.ts b/agent-service/src/agent/tools/dataguard/__tests__/decision-log-no-modify.test.ts new file mode 100644 index 00000000000..146f5b0961e --- /dev/null +++ b/agent-service/src/agent/tools/dataguard/__tests__/decision-log-no-modify.test.ts @@ -0,0 +1,128 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +// Decision-log contract tests for the Modify-verdict cut (task #11a / #15). +// The CSV-shaped audit trail in §4.4 of the design doc currently carries a +// `modified_action` column. After #11a that column is gone and so is the +// `modifiedAction` property on each in-memory DecisionLogEntry produced by +// DataGuardSession.recordDecision. 
+ +import { describe, expect, test } from "bun:test"; +import { serializeDecisionLogCsv } from "../decision-log"; +import { DataGuardSession } from "../dataguard-session"; +import type { DecisionLogEntry, FixProposal } from "../../../../types/dataguard"; + +function entry(overrides: Partial = {}): DecisionLogEntry { + return { + decisionId: "dec-1", + timestamp: "2026-05-15T12:00:00.000Z", + issueType: "placeholder_value", + targetRowCount: 5, + proposedAction: "Replace age=999 with NULL", + userDecision: "allow", + reason: "out of valid range", + confidence: "high", + appliedAt: "2026-05-15T12:00:01.000Z", + ...overrides, + }; +} + +function proposal(overrides: Partial = {}): FixProposal { + return { + issueId: "iss-1", + issueType: "placeholder_value", + action: "Replace age=999 with NULL", + operationKind: "replace_value", + operationParams: { column: "age", match: 999, replacement: null }, + riskTier: "medium", + reason: "out of valid range", + evidence: "5 of 5 placeholder rows", + confidence: "high", + targetRowCount: 5, + ...overrides, + }; +} + +describe("serializeDecisionLogCsv header — modified_action column removed (#11a)", () => { + test("empty-log header is the 9-column schema with no `modified_action`", () => { + const csv = serializeDecisionLogCsv([]); + const header = csv.split("\n")[0]; + expect(header).toBe( + "decision_id,timestamp,issue_type,target_rows,proposed_action,user_decision,reason,confidence,applied_at" + ); + expect(header).not.toContain("modified_action"); + }); + + test("data rows have exactly 9 fields (matching the 9-column header)", () => { + const csv = serializeDecisionLogCsv([entry()]); + const lines = csv.split("\n"); + const headerCols = lines[0].split(",").length; + // Use a CSV-aware split for the data row to handle quoted fields safely; + // here the fixture has no commas inside fields so a plain split is fine. 
+ const dataCols = lines[1].split(",").length; + expect(headerCols).toBe(9); + expect(dataCols).toBe(9); + }); +}); + +describe("DataGuardSession.recordDecision — modifiedAction is gone (#11a)", () => { + test("written entries never carry a `modifiedAction` field", () => { + const session = new DataGuardSession(); + session.recordProposal(proposal()); + session.recordDecision({ + proposal: proposal(), + verdict: "allow", + applied: true, + }); + const log = session.getDecisionLog(); + expect(log).toHaveLength(1); + expect(log[0]).not.toHaveProperty("modifiedAction"); + }); + + test("a denied decision likewise has no `modifiedAction`", () => { + const session = new DataGuardSession(); + session.recordProposal(proposal()); + session.recordDecision({ + proposal: proposal(), + verdict: "deny", + applied: false, + }); + const log = session.getDecisionLog(); + expect(log[0]).not.toHaveProperty("modifiedAction"); + }); + + test("auto_allow_low_risk and auto_allow_remembered entries also lack `modifiedAction`", () => { + const session = new DataGuardSession(); + session.recordProposal(proposal()); + session.recordDecision({ + proposal: proposal(), + verdict: "auto_allow_low_risk", + applied: true, + }); + session.recordDecision({ + proposal: proposal({ issueId: "iss-2" }), + verdict: "auto_allow_remembered", + applied: true, + }); + const log = session.getDecisionLog(); + expect(log).toHaveLength(2); + expect(log[0]).not.toHaveProperty("modifiedAction"); + expect(log[1]).not.toHaveProperty("modifiedAction"); + }); +}); diff --git a/agent-service/src/agent/tools/dataguard/__tests__/decision-log.test.ts b/agent-service/src/agent/tools/dataguard/__tests__/decision-log.test.ts index f31f1adbaa0..da8ddf4e496 100644 --- a/agent-service/src/agent/tools/dataguard/__tests__/decision-log.test.ts +++ b/agent-service/src/agent/tools/dataguard/__tests__/decision-log.test.ts @@ -38,9 +38,12 @@ function entry(overrides: Partial = {}): DecisionLogEntry { 
describe("serializeDecisionLogCsv", () => { test("empty log returns header only", () => { + // The `modified_action` column was removed by #11a (Modify verdict cut); + // the canonical 9-column header lives below — `decision-log-no-modify.test.ts` + // also locks this contract from the opposite direction. const csv = serializeDecisionLogCsv([]); expect(csv.split("\n")).toEqual([ - "decision_id,timestamp,issue_type,target_rows,proposed_action,user_decision,modified_action,reason,confidence,applied_at", + "decision_id,timestamp,issue_type,target_rows,proposed_action,user_decision,reason,confidence,applied_at", ]); }); @@ -65,13 +68,14 @@ describe("serializeDecisionLogCsv", () => { expect(dataRow).toContain('"line1\nline2"'); }); - test("missing appliedAt and modifiedAction render as empty fields", () => { + test("missing appliedAt renders as an empty trailing field", () => { + // Post-#11a the schema is 9 cols; appliedAt is the last and can be blank + // for denied decisions (nothing was applied). const csv = serializeDecisionLogCsv([ entry({ userDecision: "deny", appliedAt: undefined }), ]); const row = csv.split("\n")[1]; - expect(row.endsWith(",")).toBe(true); // appliedAt is the last column and is empty - expect(row).toContain(",,"); // modifiedAction is empty between reason+confidence's neighbors + expect(row.endsWith(",")).toBe(true); }); test("multiple rows preserve insertion order", () => { diff --git a/agent-service/src/agent/tools/dataguard/__tests__/permission-types.test.ts b/agent-service/src/agent/tools/dataguard/__tests__/permission-types.test.ts new file mode 100644 index 00000000000..06402063524 --- /dev/null +++ b/agent-service/src/agent/tools/dataguard/__tests__/permission-types.test.ts @@ -0,0 +1,121 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +// Type-level contract tests for the Modify-verdict cut (task #11a / #15). +// +// `tsc --noEmit` is part of the QA gate (bun run typecheck). Any one of the +// ts-expect-error directives below failing to fire WILL produce a compile +// error of the form "Unused 'ts-expect-error' directive". (Avoiding the @ +// prefix in this comment so tsc does not parse it as a directive itself.) +// Backend's job (in #15) is to narrow `Verdict` to "allow" | "deny" (plus the +// two internal auto_allow_* sentinels) and to drop `modifiedAction` from +// `PermissionDecision`, `DecisionLogEntry`, and `RecordDecisionInput`. + +import { describe, expect, test } from "bun:test"; +import type { + DecisionLogEntry, + PermissionDecision, + Verdict, +} from "../../../../types/dataguard"; +import type { RecordDecisionInput } from "../dataguard-session"; + +describe("Verdict type — Modify is gone (#11a)", () => { + test("a value of \"modify\" is NOT assignable to Verdict", () => { + // @ts-expect-error "modify" is removed from the Verdict union by #11a. + const v: Verdict = "modify"; + // The .toBe argument is also "modify" and that argument is statically + // checked against the Verdict union — silence the same error here too. 
+ // @ts-expect-error + expect(v).toBe("modify"); + }); + + test("\"allow\" and \"deny\" remain valid Verdict members", () => { + const allow: Verdict = "allow"; + const deny: Verdict = "deny"; + expect(allow).toBe("allow"); + expect(deny).toBe("deny"); + }); + + test("the two internal auto_allow_* sentinels remain valid", () => { + const low: Verdict = "auto_allow_low_risk"; + const remembered: Verdict = "auto_allow_remembered"; + expect(low).toBe("auto_allow_low_risk"); + expect(remembered).toBe("auto_allow_remembered"); + }); +}); + +describe("PermissionDecision — modifiedAction is gone (#11a)", () => { + test("constructing a PermissionDecision with `modifiedAction` is a type error", () => { + const d: PermissionDecision = { + stepId: "step-1", + verdict: "allow", + // @ts-expect-error `modifiedAction` is removed from PermissionDecision by #11a. + modifiedAction: "Flag instead of replace", + }; + expect(d.stepId).toBe("step-1"); + }); + + test("a minimal PermissionDecision with only stepId + verdict still type-checks", () => { + const d: PermissionDecision = { stepId: "step-1", verdict: "deny" }; + expect(d.verdict).toBe("deny"); + }); +}); + +describe("DecisionLogEntry — modifiedAction is gone (#11a)", () => { + test("constructing a DecisionLogEntry with `modifiedAction` is a type error", () => { + const e: DecisionLogEntry = { + decisionId: "dec-1", + timestamp: "2026-05-15T00:00:00.000Z", + issueType: "placeholder_value", + targetRowCount: 5, + proposedAction: "Replace age=999 with NULL", + userDecision: "allow", + // @ts-expect-error `modifiedAction` is removed from DecisionLogEntry by #11a. 
+ modifiedAction: "Flag instead of replace", + reason: "test", + confidence: "high", + }; + expect(e.decisionId).toBe("dec-1"); + }); +}); + +describe("RecordDecisionInput — modifiedAction is gone (#11a)", () => { + test("DataGuardSession.recordDecision callers can no longer pass `modifiedAction`", () => { + const proposal = { + issueId: "iss-1", + issueType: "placeholder_value" as const, + action: "Replace age=999 with NULL", + operationKind: "replace_value" as const, + operationParams: {}, + riskTier: "medium" as const, + reason: "test", + evidence: "test", + confidence: "high" as const, + targetRowCount: 5, + }; + const input: RecordDecisionInput = { + proposal, + verdict: "allow", + // @ts-expect-error `modifiedAction` is removed from RecordDecisionInput by #11a. + modifiedAction: "Flag instead of replace", + applied: true, + }; + expect(input.verdict).toBe("allow"); + }); +}); diff --git a/agent-service/src/agent/tools/dataguard/__tests__/profile-dataset.test.ts b/agent-service/src/agent/tools/dataguard/__tests__/profile-dataset.test.ts index 0586e338445..8a22e1d0c23 100644 --- a/agent-service/src/agent/tools/dataguard/__tests__/profile-dataset.test.ts +++ b/agent-service/src/agent/tools/dataguard/__tests__/profile-dataset.test.ts @@ -121,6 +121,59 @@ describe("profileDataset", () => { expect(dup!.affectedRowCount).toBe(2); }); + test("auto-infers idColumn from name patterns (sample_id) when caller omits it", () => { + // The auto-trigger pipeline POSTs /scan with an empty body — without this + // inference, dup-ID detection would never fire on user files. Match must + // be conservative (id-like column names only), not value-based. 
+ const ds: DatasetView = { + columns: ["sample_id", "age"], + rows: [ + { sample_id: "S1", age: 30 }, + { sample_id: "S2", age: 40 }, + { sample_id: "S1", age: 30 }, + ], + }; + const issues = profileDataset(ds); + const dup = issues.find(i => i.issueType === "duplicate_id"); + expect(dup).toBeDefined(); + expect(dup!.column).toBe("sample_id"); + expect(dup!.affectedRowCount).toBe(2); + }); + + test("auto-infer recognizes bare `id`, `*Id`, `id_*` patterns too", () => { + const cases: Array<{ col: string }> = [ + { col: "id" }, + { col: "userId" }, + { col: "id_card" }, + { col: "ID" }, + ]; + for (const { col } of cases) { + const ds: DatasetView = { + columns: [col, "value"], + rows: [ + { [col]: "a", value: 1 }, + { [col]: "a", value: 2 }, + { [col]: "b", value: 3 }, + ], + }; + const issues = profileDataset(ds); + const dup = issues.find(i => i.issueType === "duplicate_id"); + expect(dup).toBeDefined(); + expect(dup!.column).toBe(col); + } + }); + + test("auto-infer does NOT fire when no column name looks like an ID", () => { + // Conservative: just having repeated values isn't enough — the user's + // workflow may legitimately have duplicate categorical labels. 
+ const ds: DatasetView = { + columns: ["color", "qty"], + rows: [{ color: "red", qty: 1 }, { color: "red", qty: 2 }], + }; + const issues = profileDataset(ds); + expect(issues.find(i => i.issueType === "duplicate_id")).toBeUndefined(); + }); + test("no idColumn → no duplicate_id issue even with repeated values", () => { const ds: DatasetView = { columns: ["x"], @@ -130,7 +183,7 @@ describe("profileDataset", () => { expect(issues.find(i => i.issueType === "duplicate_id")).toBeUndefined(); }); - test("validRanges → detects out-of-range values", () => { + test("validRanges → detects outlier values", () => { const ds: DatasetView = { columns: ["bmi"], rows: [ @@ -138,18 +191,18 @@ describe("profileDataset", () => { ], }; const issues = profileDataset(ds, { validRanges: { bmi: { min: 10, max: 60 } } }); - const oor = issues.find(i => i.issueType === "out_of_range"); - expect(oor).toBeDefined(); - expect(oor!.affectedRowCount).toBe(2); + const outlier = issues.find(i => i.issueType === "outlier"); + expect(outlier).toBeDefined(); + expect(outlier!.affectedRowCount).toBe(2); }); - test("placeholder values are not double-counted as out_of_range", () => { + test("placeholder values are not double-counted as outliers", () => { const ds: DatasetView = { columns: ["age"], rows: [{ age: 25 }, { age: 999 }, { age: 30 }], }; const issues = profileDataset(ds, { validRanges: { age: { min: 0, max: 130 } } }); - expect(issues.find(i => i.issueType === "out_of_range")).toBeUndefined(); + expect(issues.find(i => i.issueType === "outlier")).toBeUndefined(); expect(issues.find(i => i.issueType === "placeholder_value")).toBeDefined(); }); @@ -204,6 +257,109 @@ describe("profileDataset", () => { expect(kinds.has("placeholder_value")).toBe(true); expect(kinds.has("missing_value")).toBe(true); expect(kinds.has("duplicate_id")).toBe(true); - expect(kinds.has("out_of_range")).toBe(true); + expect(kinds.has("outlier")).toBe(true); + }); + + describe("outlier detector (validRanges-based)", () 
=> { + test("does not fire without validRanges hint — earlier z-score variant was removed", () => { + // 19 values near 50, one extreme reading at 500. The old z-score + // detector would have flagged the 500 automatically; the validRanges + // detector requires the caller to opt in by supplying a hard range. + const rows: Array> = []; + for (let i = 0; i < 19; i++) rows.push({ v: 50 + (i % 3) }); + rows.push({ v: 500 }); + const issues = profileDataset({ columns: ["v"], rows }); + expect(issues.find(i => i.issueType === "outlier")).toBeUndefined(); + }); + + test("flags values outside the user-supplied range", () => { + const rows: Array> = []; + for (let i = 0; i < 19; i++) rows.push({ age: 30 + (i % 5) }); + rows.push({ age: 500 }); + const issues = profileDataset( + { columns: ["age"], rows }, + { validRanges: { age: { min: 0, max: 120 } } } + ); + const outlier = issues.find(i => i.issueType === "outlier" && i.column === "age"); + expect(outlier).toBeDefined(); + expect(outlier!.affectedRowCount).toBe(1); + expect(outlier!.affectedRowIndices).toEqual([19]); + }); + + test("does NOT flag clusters of consecutive large readings — the whole point of the redesign", () => { + // The earlier z-score detector would have flagged half of this column. + // We deliberately don't: the user owns the definition of "out of range", + // and unless they say so, clustered extremes are real data. 
+ const rows: Array> = []; + for (let i = 0; i < 10; i++) rows.push({ glucose: 100 }); + for (let i = 0; i < 10; i++) rows.push({ glucose: 400 }); // sustained high + const issues = profileDataset({ columns: ["glucose"], rows }); + expect(issues.find(i => i.issueType === "outlier")).toBeUndefined(); + }); + }); + + describe("inconsistent_label detector", () => { + test("flags rows using non-canonical spellings of the same label", () => { + const ds: DatasetView = { + columns: ["gender"], + rows: [ + { gender: "Male" }, + { gender: "Male" }, + { gender: "Male" }, + { gender: "male" }, + { gender: "MALE" }, + { gender: "Female" }, + { gender: "Female" }, + { gender: "female" }, + ], + }; + const issues = profileDataset(ds); + const ilab = issues.find(i => i.issueType === "inconsistent_label" && i.column === "gender"); + expect(ilab).toBeDefined(); + // "male" and "MALE" are non-canonical variants of "Male" (most common); + // "female" is non-canonical variant of "Female". Total: 3 rows. + expect(ilab!.affectedRowCount).toBe(3); + }); + + test("ignores columns that are pure data (every value unique)", () => { + const ds: DatasetView = { + columns: ["note"], + rows: Array.from({ length: 30 }, (_, i) => ({ note: `note-${i}` })), + }; + const issues = profileDataset(ds); + expect(issues.find(i => i.issueType === "inconsistent_label")).toBeUndefined(); + }); + + test("ignores columns above cardinality cap (likely free text)", () => { + const rows: Array> = []; + for (let i = 0; i < 25; i++) rows.push({ tag: `tag${i}` }); + // Add a clear inconsistency for the 26th distinct group — but we exceed the cap. 
+ rows.push({ tag: "tag1 " }); // would collide with "tag1" if checked + const issues = profileDataset( + { columns: ["tag"], rows }, + { inconsistentLabelMaxCardinality: 20 } + ); + expect(issues.find(i => i.issueType === "inconsistent_label")).toBeUndefined(); + }); + + test("disabled when inconsistentLabelMaxCardinality = 0", () => { + const ds: DatasetView = { + columns: ["g"], + rows: [{ g: "A" }, { g: "a" }, { g: "A" }], + }; + const issues = profileDataset(ds, { inconsistentLabelMaxCardinality: 0 }); + expect(issues.find(i => i.issueType === "inconsistent_label")).toBeUndefined(); + }); + + test("does not flag when all spellings agree (genuine low-cardinality categorical)", () => { + const ds: DatasetView = { + columns: ["group"], + rows: [ + { group: "A" }, { group: "A" }, { group: "A" }, { group: "B" }, { group: "B" }, + ], + }; + const issues = profileDataset(ds); + expect(issues.find(i => i.issueType === "inconsistent_label")).toBeUndefined(); + }); }); }); diff --git a/agent-service/src/agent/tools/dataguard/__tests__/with-approval-no-modify.test.ts b/agent-service/src/agent/tools/dataguard/__tests__/with-approval-no-modify.test.ts new file mode 100644 index 00000000000..8161e261ad2 --- /dev/null +++ b/agent-service/src/agent/tools/dataguard/__tests__/with-approval-no-modify.test.ts @@ -0,0 +1,117 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +// Post-cut behavior pin for the with-approval permission gate (#11a). +// The legacy chat-flow gate (`requestApproval` in with-approval.ts) is still +// wired for any future caller (e.g. WorkflowGuard). After Modify is cut, the +// gate must: +// +// • still resolve cleanly on "allow" and "deny" verdicts arriving over WS +// • never receive a "modify" verdict — and the type system enforces this +// (the @ts-expect-error below proves `verdict: "modify"` won't compile) +// +// The pre-existing test in `with-approval.test.ts` named +// "'modify' verdict carries through with the modifiedAction" is expected to +// be removed by backend during #15 (it constructs a PermissionDecision +// literal containing `verdict: "modify"`, which will be a type error after +// the cut). 
+ +import { describe, expect, test } from "bun:test"; +import { requestApproval, type ApprovalGateway } from "../with-approval"; +import type { + FixProposal, + IssueType, + PermissionDecision, +} from "../../../../types/dataguard"; + +function makeProposal(overrides: Partial = {}): FixProposal { + return { + issueId: "iss-1", + issueType: "placeholder_value", + action: "Replace age=999 with NULL", + operationKind: "replace_value", + operationParams: { column: "age", match: 999, replacement: null }, + riskTier: "medium", + reason: "test", + evidence: "test", + confidence: "high", + targetRowCount: 5, + ...overrides, + }; +} + +class MockGateway implements ApprovalGateway { + rules: Set = new Set(); + emitted: Array<{ stepId: string; proposal: FixProposal }> = []; + private decisions: Map = new Map(); + private waiters: Map void> = new Map(); + private counter = 0; + + matchesAutoAllowRule(issueType: IssueType): boolean { + return this.rules.has(issueType); + } + generateStepId(): string { + this.counter += 1; + return `mock-step-${this.counter}`; + } + emitPendingApproval(stepId: string, proposal: FixProposal): void { + this.emitted.push({ stepId, proposal }); + } + awaitDecision(stepId: string): Promise { + if (this.decisions.has(stepId)) return Promise.resolve(this.decisions.get(stepId)!); + return new Promise(resolve => this.waiters.set(stepId, resolve)); + } + resolveLater(stepId: string, decision: PermissionDecision): void { + const w = this.waiters.get(stepId); + if (w) { + this.waiters.delete(stepId); + w(decision); + } else { + this.decisions.set(stepId, decision); + } + } +} + +describe("requestApproval after Modify cut (#11a)", () => { + test("medium-risk allow flow round-trips with no modifiedAction in sight", async () => { + const gw = new MockGateway(); + const promise = requestApproval(gw, makeProposal({ riskTier: "medium" })); + expect(gw.emitted).toHaveLength(1); + gw.resolveLater("mock-step-1", { stepId: "mock-step-1", verdict: "allow" }); + const 
decision = await promise; + expect(decision.verdict).toBe("allow"); + expect(decision).not.toHaveProperty("modifiedAction"); + }); + + test("high-risk deny flow round-trips with no modifiedAction in sight", async () => { + const gw = new MockGateway(); + const promise = requestApproval(gw, makeProposal({ riskTier: "high" })); + expect(gw.emitted).toHaveLength(1); + gw.resolveLater("mock-step-1", { stepId: "mock-step-1", verdict: "deny" }); + const decision = await promise; + expect(decision.verdict).toBe("deny"); + expect(decision).not.toHaveProperty("modifiedAction"); + }); + + test("type system rejects a PermissionDecision literal with verdict: \"modify\"", () => { + // @ts-expect-error "modify" is no longer a Verdict member after #11a. + const bad: PermissionDecision = { stepId: "x", verdict: "modify" }; + expect(bad.stepId).toBe("x"); + }); +}); diff --git a/agent-service/src/agent/tools/dataguard/__tests__/with-approval.test.ts b/agent-service/src/agent/tools/dataguard/__tests__/with-approval.test.ts index 1cfbbb0b67b..c92ffe43d1c 100644 --- a/agent-service/src/agent/tools/dataguard/__tests__/with-approval.test.ts +++ b/agent-service/src/agent/tools/dataguard/__tests__/with-approval.test.ts @@ -112,17 +112,26 @@ describe("requestApproval", () => { expect(decision.verdict).toBe("deny"); }); - test("'modify' verdict carries through with the modifiedAction", async () => { + test("warning tier: prompts every time even with a remembered rule", async () => { + // The whole point of `warning` is "we have a concrete fix but need human + // judgement". A remembered rule from earlier in the session must NOT + // auto-approve it — otherwise the warning tier becomes pointless. 
const gw = new MockGateway(); - const promise = requestApproval(gw, makeProposal({ riskTier: "medium" })); - gw.resolveLater("mock-step-1", { - stepId: "mock-step-1", - verdict: "modify", - modifiedAction: "Flag instead of replace", - }); + gw.rules.add("outlier"); + const promise = requestApproval(gw, makeProposal({ issueType: "outlier", riskTier: "warning" })); + expect(gw.emitted).toHaveLength(1); + gw.resolveLater("mock-step-1", { stepId: "mock-step-1", verdict: "allow" }); + const decision = await promise; + expect(decision.verdict).toBe("allow"); + }); + + test("warning tier without a remembered rule: still prompts (no auto_allow_low_risk)", async () => { + const gw = new MockGateway(); + const promise = requestApproval(gw, makeProposal({ riskTier: "warning" })); + expect(gw.emitted).toHaveLength(1); + gw.resolveLater("mock-step-1", { stepId: "mock-step-1", verdict: "deny" }); const decision = await promise; - expect(decision.verdict).toBe("modify"); - expect(decision.modifiedAction).toBe("Flag instead of replace"); + expect(decision.verdict).toBe("deny"); }); test("a decision that arrives before the tool awaits is buffered and delivered", async () => { diff --git a/agent-service/src/agent/tools/dataguard/apply-fix.ts b/agent-service/src/agent/tools/dataguard/apply-fix.ts index 77d2c0f85f4..fef924e9684 100644 --- a/agent-service/src/agent/tools/dataguard/apply-fix.ts +++ b/agent-service/src/agent/tools/dataguard/apply-fix.ts @@ -24,14 +24,27 @@ import type { FixProposal } from "../../../types/dataguard"; import type { DatasetView } from "./dataset"; +import { isMissing as isCellMissing } from "./missing-detection"; export interface ApplyResult { dataset: DatasetView; rowsAffected: number; - flaggedRows: number[]; } -export function applyFix(dataset: DatasetView, proposal: FixProposal): ApplyResult { +export interface ApplyOptions { + // Extra tokens — beyond the DEFAULT_MISSING_TOKENS — that should also be + // treated as missing. 
Threaded through from the user's /scan call so that + // a session configured with custom `missingTokens` (e.g., ["xyz"]) sees + // those same tokens treated as missing during `impute`. Without this, + // apply-fix and the profiler would silently disagree on what's missing. + missingTokens?: string[]; +} + +export function applyFix( + dataset: DatasetView, + proposal: FixProposal, + options: ApplyOptions = {} +): ApplyResult { const rows = dataset.rows.map(r => ({ ...r })); let columns = [...dataset.columns]; const params = proposal.operationParams; @@ -39,16 +52,36 @@ export function applyFix(dataset: DatasetView, proposal: FixProposal): ApplyResu switch (proposal.operationKind) { case "replace_value": { const column = params.column as string; - const match = params.match; const replacement = params.replacement; + // Two targeting modes: + // - rowIndices: deterministic, used for outlier / placeholder + // where the profiler already knows exactly which rows are wrong. + // - match: value-based, used for cases like "replace every 'unknown' with null". + // rowIndices wins when both are present. Without rowIndices support, + // replace_value silently no-ops whenever the LLM-supplied `match` is + // slightly off (e.g. 950 vs 950.0 vs "950") — that turned every outlier + // proposal into a byte-identical re-export, which then made LakeFS abort + // the version commit with "No changes detected in dataset". 
+ const targetIndices = params.rowIndices as number[] | undefined; let affected = 0; - for (const r of rows) { - if (cellEquals(r[column], match)) { - r[column] = replacement; - affected++; + if (targetIndices && targetIndices.length > 0) { + const indexSet = new Set(targetIndices); + for (let i = 0; i < rows.length; i++) { + if (indexSet.has(i)) { + rows[i][column] = replacement; + affected++; + } + } + } else { + const match = params.match; + for (const r of rows) { + if (cellEquals(r[column], match)) { + r[column] = replacement; + affected++; + } } } - return { dataset: { columns, rows }, rowsAffected: affected, flaggedRows: [] }; + return { dataset: { columns, rows }, rowsAffected: affected }; } case "drop_rows": { @@ -57,31 +90,21 @@ export function applyFix(dataset: DatasetView, proposal: FixProposal): ApplyResu return { dataset: { columns, rows: kept }, rowsAffected: rows.length - kept.length, - flaggedRows: [], }; } case "impute": { const column = params.column as string; const strategy = params.strategy as "mean" | "median" | "mode"; - const fill = computeImputeValue(rows, column, strategy); + const fill = computeImputeValue(rows, column, strategy, options.missingTokens); let affected = 0; for (const r of rows) { - if (isMissing(r[column])) { + if (isCellMissing(r[column], options.missingTokens)) { r[column] = fill; affected++; } } - return { dataset: { columns, rows }, rowsAffected: affected, flaggedRows: [] }; - } - - case "flag": { - const indices = (params.rowIndices as number[]).slice(); - return { - dataset: { columns, rows }, - rowsAffected: indices.length, - flaggedRows: indices, - }; + return { dataset: { columns, rows }, rowsAffected: affected }; } case "trim_whitespace": { @@ -97,7 +120,7 @@ export function applyFix(dataset: DatasetView, proposal: FixProposal): ApplyResu } } } - return { dataset: { columns, rows }, rowsAffected: affected, flaggedRows: [] }; + return { dataset: { columns, rows }, rowsAffected: affected }; } case "standardize": { 
@@ -111,7 +134,7 @@ export function applyFix(dataset: DatasetView, proposal: FixProposal): ApplyResu affected++; } } - return { dataset: { columns, rows }, rowsAffected: affected, flaggedRows: [] }; + return { dataset: { columns, rows }, rowsAffected: affected }; } case "rename_column": { @@ -126,7 +149,7 @@ export function applyFix(dataset: DatasetView, proposal: FixProposal): ApplyResu affected++; } } - return { dataset: { columns, rows }, rowsAffected: affected, flaggedRows: [] }; + return { dataset: { columns, rows }, rowsAffected: affected }; } default: @@ -144,23 +167,20 @@ function cellEquals(a: unknown, b: unknown): boolean { return false; } -function isMissing(v: unknown): boolean { - if (v === null || v === undefined) return true; - if (typeof v === "number" && Number.isNaN(v)) return true; - if (typeof v === "string" && v === "") return true; - return false; -} - function computeImputeValue( rows: Record[], column: string, - strategy: "mean" | "median" | "mode" + strategy: "mean" | "median" | "mode", + missingTokens?: string[] ): unknown { const numericValues: number[] = []; const stringCounts = new Map(); for (const r of rows) { const v = r[column]; - if (isMissing(v)) continue; + // Honor the session's missingTokens override so impute treats the exact + // same set of cells as missing that the profiler flagged. Without this, + // the user sees "NULL"/"N/A" still in the cleaned CSV after Fix-and-run. + if (isCellMissing(v, missingTokens)) continue; if (typeof v === "number" && Number.isFinite(v)) { numericValues.push(v); } else if (typeof v === "string") { diff --git a/agent-service/src/agent/tools/dataguard/dataguard-session.ts b/agent-service/src/agent/tools/dataguard/dataguard-session.ts index 1fc33799f8b..6b7ced8fcff 100644 --- a/agent-service/src/agent/tools/dataguard/dataguard-session.ts +++ b/agent-service/src/agent/tools/dataguard/dataguard-session.ts @@ -19,8 +19,8 @@ // Per-agent DataGuard run state. 
One DataGuardSession lives on each // TexeraAgent (lazy-initialized when the first DataGuard tool fires) and -// holds the working dataset, accumulated issues, decision log, flagged rows, -// and auto-allow rules. Independent of the workflow state so resetting one +// holds the working dataset, accumulated issues, decision log, and +// auto-allow rules. Independent of the workflow state so resetting one // does not affect the other. import type { @@ -36,20 +36,36 @@ import type { DatasetView } from "./dataset"; export interface RecordDecisionInput { proposal: FixProposal; verdict: Verdict; - modifiedAction?: string; applied: boolean; } +// Profiler options the user supplied at /scan time. Stored so the post-apply +// re-scan can use the same configuration the issues were originally found with. +export interface ScanOptions { + idColumn?: string; + validRanges?: Record; + placeholderValues?: Array; + missingTokens?: string[]; +} + export class DataGuardSession { private dataset: DatasetView | undefined; private issues: Map = new Map(); private proposals: Map = new Map(); private decisionLog: DecisionLogEntry[] = []; - private flaggedRows: Set = new Set(); private autoAllowRules: Map = new Map(); + private scanOptions: ScanOptions = {}; private decisionCounter = 0; private ruleCounter = 0; + setScanOptions(opts: ScanOptions): void { + this.scanOptions = { ...opts }; + } + + getScanOptions(): ScanOptions { + return this.scanOptions; + } + setDataset(dataset: DatasetView): void { this.dataset = dataset; // A new dataset means a fresh DataGuard run — clear the per-run state. 
@@ -57,7 +73,6 @@ export class DataGuardSession { this.issues.clear(); this.proposals.clear(); this.decisionLog = []; - this.flaggedRows.clear(); } recordProposal(proposal: FixProposal): void { @@ -98,7 +113,6 @@ export class DataGuardSession { targetRowCount: input.proposal.targetRowCount, proposedAction: input.proposal.action, userDecision: input.verdict, - modifiedAction: input.modifiedAction, reason: input.proposal.reason, confidence: input.proposal.confidence, appliedAt: input.applied ? now : undefined, @@ -111,14 +125,6 @@ export class DataGuardSession { return [...this.decisionLog]; } - addFlaggedRows(indices: number[]): void { - for (const i of indices) this.flaggedRows.add(i); - } - - getFlaggedRows(): number[] { - return Array.from(this.flaggedRows).sort((a, b) => a - b); - } - addAutoAllowRule(issueType: IssueType): AutoAllowRule { // Idempotent: if a rule already exists for this issueType, return it. for (const r of this.autoAllowRules.values()) { diff --git a/agent-service/src/agent/tools/dataguard/dataguard-tools.ts b/agent-service/src/agent/tools/dataguard/dataguard-tools.ts index f58e29df8e4..50bb5ea3adf 100644 --- a/agent-service/src/agent/tools/dataguard/dataguard-tools.ts +++ b/agent-service/src/agent/tools/dataguard/dataguard-tools.ts @@ -59,11 +59,12 @@ function createProfileDatasetTool(ctx: DataGuardToolContext) { return tool({ description: `Scan the loaded dataset for quality issues. Read-only. -Detects four categories: +Detects five categories: - missing_value: null / empty / configured missing tokens - placeholder_value: numeric (999, -1) or string sentinels - duplicate_id: requires idColumn hint -- out_of_range: requires validRanges hint per column +- outlier: requires validRanges hint per column — flags numeric values outside [min, max]. The earlier z-score variant was removed because it flagged legitimate consecutive large readings. 
+- inconsistent_label: low-cardinality string columns where trim+lowercase keys collide on multiple raw spellings (e.g. "Male"/"male"/"MALE") Call this once at the start of a DataGuard run. Returns a JSON array of DataQualityIssue records.`, inputSchema: z.object({ @@ -74,7 +75,7 @@ Call this once at the start of a DataGuard run. Returns a JSON array of DataQual validRanges: z .record(z.string(), z.object({ min: z.number(), max: z.number() })) .optional() - .describe("Per-column valid numeric range. Values outside are flagged as out_of_range."), + .describe("Per-column valid numeric range. Values outside are flagged as outlier."), placeholderValues: z .array(z.union([z.string(), z.number()])) .optional() @@ -135,7 +136,13 @@ function createApplyFixTool(ctx: DataGuardToolContext) { return tool({ description: `Apply a previously-proposed fix to the dataset. MUTATING — gated by user approval. -Pass the issueId. The proposal stored from suggest_fix is looked up automatically. For risk tier "low" the fix is auto-applied with a summary line; for "medium" / "high" the user must approve through the chat panel. The result includes the user's verdict.`, +Pass the issueId. The proposal stored from suggest_fix is looked up automatically. Risk-tier behaviour: +- "low": auto-applied with a summary line (no prompt). +- "medium": user must approve through the chat panel; "Allow & remember" is offered so future similar fixes auto-apply. +- "high": user must approve every time; "remember" is NOT offered (always-ask, e.g. drop_rows). +- "warning": user must approve every time; "remember" is NOT offered. Used for fixes that are concrete but where domain judgement is required (e.g. outlier clamping that might destroy a real extreme value). + +The result includes the user's verdict.`, inputSchema: z.object({ issueId: z.string().describe("The issueId whose proposal should be applied."), }), @@ -160,25 +167,19 @@ Pass the issueId. 
The proposal stored from suggest_fix is looked up automaticall }); } - // For modify, MVP keeps the original operationKind/params but records the - // user's free-text override in the log. Future iteration can parse the - // modifiedAction back into a structured proposal override. - const modifiedAction = decision.verdict === "modify" ? decision.modifiedAction : undefined; - try { - const result = applyFix(dataset, proposal); + const result = applyFix(dataset, proposal, { + missingTokens: ctx.session.getScanOptions().missingTokens, + }); ctx.session.updateDataset(result.dataset); - if (result.flaggedRows.length > 0) ctx.session.addFlaggedRows(result.flaggedRows); ctx.session.recordDecision({ proposal, verdict: decision.verdict, - modifiedAction, applied: true, }); return JSON.stringify({ verdict: decision.verdict, rowsAffected: result.rowsAffected, - flaggedRows: result.flaggedRows, datasetRowCount: result.dataset.rows.length, message: `Applied ${proposal.operationKind}. Rows affected: ${result.rowsAffected}.`, }); diff --git a/agent-service/src/agent/tools/dataguard/decision-log.ts b/agent-service/src/agent/tools/dataguard/decision-log.ts index e9752fdf422..7d2c5bc9003 100644 --- a/agent-service/src/agent/tools/dataguard/decision-log.ts +++ b/agent-service/src/agent/tools/dataguard/decision-log.ts @@ -33,7 +33,6 @@ const HEADER_COLUMNS = [ "target_rows", "proposed_action", "user_decision", - "modified_action", "reason", "confidence", "applied_at", @@ -55,7 +54,6 @@ function rowToCsv(e: DecisionLogEntry): string { csvField(String(e.targetRowCount)), csvField(e.proposedAction), csvField(e.userDecision), - csvField(e.modifiedAction ?? ""), csvField(e.reason), csvField(e.confidence), csvField(e.appliedAt ?? 
""), diff --git a/agent-service/src/agent/tools/dataguard/missing-detection.ts b/agent-service/src/agent/tools/dataguard/missing-detection.ts new file mode 100644 index 00000000000..afa6e463ae9 --- /dev/null +++ b/agent-service/src/agent/tools/dataguard/missing-detection.ts @@ -0,0 +1,86 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +// Single source of truth for "is this cell missing / placeholder?" Used by both +// the profiler (which flags issues) and the applier (which fills/replaces). +// Why: when the two disagree, impute silently leaves cells that the profiler +// flagged as missing — the user sees "NULL"/"N/A" still in the cleaned CSV. + +export const DEFAULT_PLACEHOLDERS: ReadonlyArray = [ + 999, + -1, + "unknown", + "Unknown", +]; + +// Case-insensitive set of tokens that mean "no value was recorded." Compared +// against the *trimmed*, lowercased cell so whitespace and case can't smuggle +// a missing cell past the check. +const MISSING_TOKENS_LOWER: ReadonlySet = new Set([ + "na", + "n/a", + "null", + "none", + "nan", +]); + +// Kept for places that still want the raw token list (e.g., the profiler's +// ProfileOptions API surface). 
+export const DEFAULT_MISSING_TOKENS: ReadonlyArray = [ + "NA", + "N/A", + "n/a", + "null", + "NULL", + "None", +]; + +export function isMissing(value: unknown, extraTokens: ReadonlyArray = []): boolean { + if (value === null || value === undefined) return true; + if (typeof value === "number" && Number.isNaN(value)) return true; + if (typeof value !== "string") return false; + const trimmed = value.trim(); + if (trimmed === "") return true; + if (MISSING_TOKENS_LOWER.has(trimmed.toLowerCase())) return true; + if (extraTokens.includes(value) || extraTokens.includes(trimmed)) return true; + return false; +} + +export function toNumber(value: unknown): number | undefined { + if (typeof value === "number" && Number.isFinite(value)) return value; + if (typeof value === "string" && value.trim() !== "") { + const n = Number(value); + if (Number.isFinite(n)) return n; + } + return undefined; +} + +export function placeholderHit( + value: unknown, + placeholders: ReadonlyArray +): string | number | undefined { + for (const p of placeholders) { + if (typeof p === "string" && typeof value === "string" && p === value) return p; + if (typeof p === "number") { + const n = toNumber(value); + if (n !== undefined && n === p) return p; + } + } + return undefined; +} diff --git a/agent-service/src/agent/tools/dataguard/profile-dataset.ts b/agent-service/src/agent/tools/dataguard/profile-dataset.ts index 08cd8723bd2..a5c55d2167d 100644 --- a/agent-service/src/agent/tools/dataguard/profile-dataset.ts +++ b/agent-service/src/agent/tools/dataguard/profile-dataset.ts @@ -22,6 +22,13 @@ import type { DataQualityIssue } from "../../../types/dataguard"; import type { DatasetView } from "./dataset"; +import { + DEFAULT_MISSING_TOKENS, + DEFAULT_PLACEHOLDERS, + isMissing as isCellMissing, + placeholderHit, + toNumber, +} from "./missing-detection"; export interface ProfileOptions { // Column to treat as a unique identifier; duplicates flagged as duplicate_id. 
@@ -34,29 +41,12 @@ export interface ProfileOptions { missingTokens?: string[]; // Above this row count, affectedRowIndices is omitted from the issue. maxIndicesInIssue?: number; + // Max distinct values a string column may have before inconsistent-label + // detection skips it (free-text columns will hit this and be ignored). + // Default 20. Set to 0 to disable label detection entirely. + inconsistentLabelMaxCardinality?: number; } -// Placeholders are sentinel *values* that look like data but are actually -// "no value." Kept distinct from missing-tokens to avoid double-flagging -// the same cell under two issue types. -const DEFAULT_PLACEHOLDERS: Array = [ - 999, - -1, - "unknown", - "Unknown", -]; - -// Tokens that mean "no data was recorded." Empty string is always treated -// as missing without needing to be listed. -const DEFAULT_MISSING_TOKENS: string[] = [ - "NA", - "N/A", - "n/a", - "null", - "NULL", - "None", -]; - const DEFAULT_MAX_INDICES_IN_ISSUE = 50; let issueCounter = 0; @@ -69,37 +59,8 @@ function nowIso(): string { return new Date().toISOString(); } -function isMissing(value: unknown, missingTokens: string[]): boolean { - if (value === null || value === undefined) return true; - if (typeof value === "number" && Number.isNaN(value)) return true; - if (typeof value === "string") { - if (value === "") return true; - if (missingTokens.includes(value)) return true; - } - return false; -} - -function toNumber(value: unknown): number | undefined { - if (typeof value === "number" && Number.isFinite(value)) return value; - if (typeof value === "string" && value.trim() !== "") { - const n = Number(value); - if (Number.isFinite(n)) return n; - } - return undefined; -} - -function placeholderHit( - value: unknown, - placeholders: Array -): string | number | undefined { - for (const p of placeholders) { - if (typeof p === "string" && typeof value === "string" && p === value) return p; - if (typeof p === "number") { - const n = toNumber(value); - if (n !== 
undefined && n === p) return p; - } - } - return undefined; +function isMissing(value: unknown, missingTokens: ReadonlyArray): boolean { + return isCellMissing(value, missingTokens); } function maybeIndices( @@ -109,6 +70,26 @@ function maybeIndices( return indices.length <= cap ? indices : undefined; } +// Guess which column is the row identifier when the caller didn't specify one. +// Conservative: only matches columns whose names look unambiguously like IDs. +// Tries the cheapest, most-specific patterns first so e.g. `sample_id` wins +// over a generic `id_card` column elsewhere in the schema. Returns undefined +// when nothing matches — the caller skips dup-ID detection in that case. +function inferIdColumn(columns: ReadonlyArray): string | undefined { + const matchers: Array<(name: string) => boolean> = [ + name => /^id$/i.test(name), + name => /_id$/i.test(name), + name => /^id_/i.test(name), + name => /Id$/.test(name), + name => /^.+_uid$/i.test(name) || /^uid$/i.test(name), + ]; + for (const m of matchers) { + const hit = columns.find(c => m(c)); + if (hit) return hit; + } + return undefined; +} + export function profileDataset( dataset: DatasetView, options: ProfileOptions = {} @@ -119,9 +100,9 @@ export function profileDataset( const detectedAt = nowIso(); const issues: DataQualityIssue[] = []; - // Pre-compute placeholder hits per row/column so out_of_range can avoid - // double-counting and missing-value can avoid flagging a row that has a - // string placeholder like "N/A" twice. + // Pre-compute placeholder hits per row/column so outlier-detection can + // skip rows already flagged elsewhere and missing-value can avoid flagging + // a row that has a string placeholder like "N/A" twice. const placeholderHitByColRow = new Map>(); for (const col of dataset.columns) { const map = new Map(); @@ -170,9 +151,14 @@ export function profileDataset( }); } - // Duplicate-ID detector (only when idColumn is configured and exists). 
- if (options.idColumn && dataset.columns.includes(options.idColumn)) { - const idCol = options.idColumn; + // Duplicate-ID detector. Honors options.idColumn when set; otherwise tries + // to infer one from column names (e.g. "sample_id" → use it). Without this + // inference the auto-trigger's empty-body /scan would never find dup IDs in + // user datasets — users don't configure scan options through the checklist UI. + const idCol = options.idColumn && dataset.columns.includes(options.idColumn) + ? options.idColumn + : inferIdColumn(dataset.columns); + if (idCol) { const positions = new Map(); for (let i = 0; i < dataset.rows.length; i++) { const v = dataset.rows[i][idCol]; @@ -205,28 +191,103 @@ export function profileDataset( } } - // Out-of-range detector — skips rows already flagged as placeholders so we - // don't surface the same row under two issue types. + // Inconsistent-label detector. For each low-cardinality string column, + // group raw values by a normalized key (trim + lowercase). If two or more + // raw spellings collapse to the same key, every row using a non-canonical + // spelling is flagged. Example: "Male"/"male"/"M" all map to "m"/"male" + // depending on key choice. We use trim+lowercase so "Yes"/"yes"/" yes " + // collide as expected. + const labelMaxCardinality = options.inconsistentLabelMaxCardinality ?? 20; + if (labelMaxCardinality > 0) { + for (const col of dataset.columns) { + const placeholderHits = placeholderHitByColRow.get(col)!; + // Count distinct non-missing, non-placeholder string values. 
+ const raw = new Map(); + let nonStringSeen = false; + for (let i = 0; i < dataset.rows.length; i++) { + if (placeholderHits.has(i)) continue; + const v = dataset.rows[i][col]; + if (isMissing(v, missingTokens)) continue; + if (typeof v !== "string") { + nonStringSeen = true; + continue; + } + const list = raw.get(v); + if (list) list.push(i); + else raw.set(v, [i]); + } + if (nonStringSeen || raw.size === 0 || raw.size > labelMaxCardinality) continue; + + // Group by normalized key. A key with >1 raw spelling = inconsistent. + const groups = new Map; rows: number[] }>(); + for (const [rawValue, rows] of raw) { + const key = rawValue.trim().toLowerCase(); + const existing = groups.get(key); + if (existing) { + existing.spellings.add(rawValue); + existing.rows.push(...rows); + // Prefer the most common spelling as canonical (heuristic). + if (rows.length > (raw.get(existing.canonical)?.length ?? 0)) { + existing.canonical = rawValue; + } + } else { + groups.set(key, { canonical: rawValue, spellings: new Set([rawValue]), rows: [...rows] }); + } + } + const inconsistentRows: number[] = []; + const examples: string[] = []; + for (const g of groups.values()) { + if (g.spellings.size > 1) { + // Flag every row that uses a non-canonical spelling. 
+ for (const [rawValue, rows] of raw) { + if (rawValue !== g.canonical && g.spellings.has(rawValue)) { + inconsistentRows.push(...rows); + } + } + examples.push(`{${Array.from(g.spellings).join(" / ")}}`); + } + } + if (inconsistentRows.length === 0) continue; + inconsistentRows.sort((a, b) => a - b); + issues.push({ + issueId: nextIssueId(), + issueType: "inconsistent_label", + column: col, + description: `${inconsistentRows.length} row(s) in ${col} use non-canonical label spellings`, + evidence: `Mixed spellings (showing up to 3): ${examples.slice(0, 3).join(", ")}`, + affectedRowCount: inconsistentRows.length, + affectedRowIndices: maybeIndices(inconsistentRows, indexCap), + detectedAt, + }); + } + } + + // Outlier detector — values that fall outside a user-supplied hard range. + // We deliberately do NOT auto-detect outliers via z-score: legitimate + // consecutive large readings (e.g. clusters of high glucose in a clinical + // dataset) would be flagged en masse. Requires the caller to opt in by + // providing validRanges per column. Skips rows already flagged as + // placeholders so the same row doesn't surface under two issue types. 
if (options.validRanges) { for (const [col, range] of Object.entries(options.validRanges)) { if (!dataset.columns.includes(col)) continue; const placeholderHits = placeholderHitByColRow.get(col)!; - const oorIndices: number[] = []; + const outlierIndices: number[] = []; for (let i = 0; i < dataset.rows.length; i++) { if (placeholderHits.has(i)) continue; const v = toNumber(dataset.rows[i][col]); if (v === undefined) continue; - if (v < range.min || v > range.max) oorIndices.push(i); + if (v < range.min || v > range.max) outlierIndices.push(i); } - if (oorIndices.length === 0) continue; + if (outlierIndices.length === 0) continue; issues.push({ issueId: nextIssueId(), - issueType: "out_of_range", + issueType: "outlier", column: col, - description: `${oorIndices.length} row(s) in ${col} fall outside [${range.min}, ${range.max}]`, - evidence: `Valid range: [${range.min}, ${range.max}]; violations: ${oorIndices.length}.`, - affectedRowCount: oorIndices.length, - affectedRowIndices: maybeIndices(oorIndices, indexCap), + description: `${outlierIndices.length} row(s) in ${col} fall outside the valid range [${range.min}, ${range.max}]`, + evidence: `Valid range: [${range.min}, ${range.max}]; violations: ${outlierIndices.length}.`, + affectedRowCount: outlierIndices.length, + affectedRowIndices: maybeIndices(outlierIndices, indexCap), detectedAt, }); } diff --git a/agent-service/src/agent/tools/dataguard/suggest-fix.ts b/agent-service/src/agent/tools/dataguard/suggest-fix.ts index b4e6feda995..01554959c9d 100644 --- a/agent-service/src/agent/tools/dataguard/suggest-fix.ts +++ b/agent-service/src/agent/tools/dataguard/suggest-fix.ts @@ -36,25 +36,29 @@ const fixProposalSchema = z.object({ "replace_value", "drop_rows", "impute", - "flag", "standardize", "trim_whitespace", "rename_column", ]), operationParams: z.record(z.string(), z.unknown()), - riskTier: z.enum(["low", "medium", "high"]), + riskTier: z.enum(["low", "medium", "high", "warning"]), reason: z.string().min(1), 
evidence: z.string().min(1), confidence: z.enum(["low", "medium", "high"]), targetRowCount: z.number().int().nonnegative(), }); +// `warning` here = "we have a concrete fix but recommend a human eyeball it +// first" — the checklist UI defaults these to unchecked. Used for cases where +// auto-applying could destroy meaningful signal (outliers that might be real +// extremes, biologically plausible out-of-range values). const DEFAULT_RISK_TIER_BY_ISSUE: Record = { placeholder_value: "medium", missing_value: "medium", duplicate_id: "high", - out_of_range: "medium", - outlier: "high", + // `outlier` is the validRanges-based detector — see profile-dataset.ts. The + // earlier z-score outlier detector was removed. + outlier: "warning", inconsistent_label: "medium", }; @@ -94,6 +98,10 @@ export async function suggestFix( export function buildPrompt(issue: DataQualityIssue): string { const defaultTier = DEFAULT_RISK_TIER_BY_ISSUE[issue.issueType] ?? "medium"; + const indicesLine = + issue.affectedRowIndices && issue.affectedRowIndices.length > 0 + ? `\n- affectedRowIndices: [${issue.affectedRowIndices.join(", ")}]` + : ""; return `You are a data-cleaning assistant. Propose a single concrete fix for the following data-quality issue. Reply with one JSON object only — no prose, no markdown, no fences. Issue: @@ -101,14 +109,14 @@ Issue: - column: ${issue.column} - description: ${issue.description} - evidence: ${issue.evidence} -- affectedRowCount: ${issue.affectedRowCount} +- affectedRowCount: ${issue.affectedRowCount}${indicesLine} Required JSON shape: { "action": "", - "operationKind": "replace_value | drop_rows | impute | flag | standardize | trim_whitespace | rename_column", + "operationKind": "replace_value | drop_rows | impute | standardize | trim_whitespace | rename_column", "operationParams": { ...operation-specific params... 
}, - "riskTier": "low | medium | high", + "riskTier": "low | medium | high | warning", "reason": "", "evidence": "", "confidence": "low | medium | high", @@ -116,15 +124,25 @@ Required JSON shape: } operationParams by kind: -- replace_value: { "column": string, "match": any, "replacement": any } +- replace_value: { "column": string, "replacement": any, "rowIndices"?: number[], "match"?: any } + Use "rowIndices" when the issue gives you affectedRowIndices — index targeting + is deterministic. Use "match" only for value-based swaps (e.g. replace every + literal "unknown" with null) when you don't have indices. Never invent a + "match" value from aggregate stats — silent no-ops happen when match doesn't + equal the cell exactly (e.g. 950 vs 950.0 vs "950"). - drop_rows: { "rowIndices": number[] } - impute: { "column": string, "strategy": "mean" | "median" | "mode" } -- flag: { "rowIndices": number[] } - standardize: { "column": string, "mapping": { [from: string]: string } } - trim_whitespace: { "column": string } - rename_column: { "from": string, "to": string } -Default risk tier for ${issue.issueType}: ${defaultTier}. Override only with a strong reason. Prefer "flag" or "impute" over destructive "drop_rows".`; +Rules: +- Every proposal must be a concrete, applicable fix — no "flag-and-do-nothing" options. +- For outlier issues (values outside the valid range stated in description / evidence): use replace_value with "rowIndices" set to the affectedRowIndices and "replacement" set to the nearest valid bound (min or max from the range), or to null if no bound is sensible. Never use "match" for outliers — always use rowIndices. +- Set riskTier="warning" when you have a concrete fix but the user really should eyeball it before applying — typical for outlier values that are unusual but possibly real (extreme biological readings, etc.). The UI defaults these to unchecked so the user makes an explicit call. +- Prefer impute over drop_rows for missing values. 
+ +Default risk tier for ${issue.issueType}: ${defaultTier}. Override only with a strong reason.`; } function stripCodeFences(s: string): string { diff --git a/agent-service/src/agent/tools/dataguard/with-approval.ts b/agent-service/src/agent/tools/dataguard/with-approval.ts index 454ea6de901..025a7d101f9 100644 --- a/agent-service/src/agent/tools/dataguard/with-approval.ts +++ b/agent-service/src/agent/tools/dataguard/with-approval.ts @@ -43,9 +43,16 @@ export async function requestApproval( gateway: ApprovalGateway, proposal: FixProposal ): Promise { - // High-risk fixes ALWAYS prompt — the "remember" rule does not apply. + // `high` and `warning` ALWAYS prompt — the "remember" rule does not apply. // This is the same shape Claude Code uses for destructive Bash operations. - if (proposal.riskTier !== "high" && gateway.matchesAutoAllowRule(proposal.issueType)) { + // + // `warning` exists specifically because the agent is *not* confident enough + // to act without a human eyeball (e.g., outliers that might be real extreme + // values). Letting an "Allow & remember placeholder_value" rule from earlier + // in the session auto-approve a warning-tier fix would defeat the whole + // point of the tier. 
+ const alwaysPrompt = proposal.riskTier === "high" || proposal.riskTier === "warning"; + if (!alwaysPrompt && gateway.matchesAutoAllowRule(proposal.issueType)) { return { stepId: "", verdict: "auto_allow_remembered" }; } if (proposal.riskTier === "low") { diff --git a/agent-service/src/server.ts b/agent-service/src/server.ts index 3bfa0df9f7c..bfbaaa3ee7a 100644 --- a/agent-service/src/server.ts +++ b/agent-service/src/server.ts @@ -30,6 +30,7 @@ import { createLogger } from "./logger"; const log = createLogger("Server"); const wsLog = createLogger("WS"); +import type { DataQualityIssue } from "./types/dataguard"; import type { AgentInfo, AgentDelegateConfig, @@ -137,12 +138,22 @@ function getAgent(agentId: string): TexeraAgent { return agent; } -const agentsRouter = new Elysia({ prefix: "/agents" }) +// `normalize: false` keeps unknown fields in the parsed body so additionalProperties:false +// schemas can reject them (Elysia 1.4 strips by default otherwise — see #11a tests). +const agentsRouter = new Elysia({ prefix: "/agents", normalize: false }) // Error handler must live on the same Elysia instance whose routes throw, or // its scope will not see the errors. Elysia 1.x defaults to local scoping for // .onError, so attach here rather than on the outer app. - .onError(({ error, set }) => { + .onError(({ error, code, set }) => { log.error({ err: error }, "request error"); + // Elysia body-schema rejection — surface as 400 so callers can distinguish + // bad input from server bugs. Without this, every typebox validation error + // ends up as a 500 and the modify-cut tests can't tell whether the route + // rejected the bad verdict or crashed. + if (code === "VALIDATION") { + set.status = 400; + return { error: error instanceof Error ? error.message : String(error) }; + } const errorMessage = error instanceof Error ? 
error.message : String(error); if (errorMessage === "Agent not found") { set.status = 404; @@ -345,6 +356,228 @@ const agentsRouter = new Elysia({ prefix: "/agents" }) } ) + // Server-driven DataGuard scan. Runs profile_dataset + suggest_fix entirely + // server-side (no chat / no LLM tool loop), returns a flat list of issues + // each paired with a FixProposal. The checklist UI consumes this directly. + // + // Body (optional): { idColumn?, validRanges?, placeholderValues?, missingTokens? } + // — same profile_dataset options, all optional. + .post( + "/:id/dataguard/scan", + async ({ params: { id }, body }) => { + const agent = getAgent(id); + const session = agent.getDataGuardSession(); + const dataset = session.getDataset(); + if (!dataset) { + return { error: "No dataset loaded. Call /dataguard/dataset first." }; + } + const { profileDataset } = await import("./agent/tools/dataguard/profile-dataset"); + const { suggestFix } = await import("./agent/tools/dataguard/suggest-fix"); + const scanOptions = { + idColumn: body?.idColumn, + validRanges: body?.validRanges, + placeholderValues: body?.placeholderValues, + missingTokens: body?.missingTokens, + }; + session.setScanOptions(scanOptions); + const issues = profileDataset(dataset, scanOptions); + for (const issue of issues) session.recordIssue(issue); + + // Generate a proposal per issue in parallel. Each calls the LLM once. + const llmCall = (prompt: string) => agent.callLlm(prompt); + const proposals = await Promise.all( + issues.map(async issue => { + try { + const p = await suggestFix(issue, { llmCall }); + session.recordProposal(p); + return { issueId: issue.issueId, proposal: p, error: null }; + } catch (e: unknown) { + const msg = e instanceof Error ? 
e.message : String(e); + return { issueId: issue.issueId, proposal: null, error: msg }; + } + }) + ); + return { issueCount: issues.length, issues, proposals }; + }, + { + body: t.Optional( + t.Object({ + idColumn: t.Optional(t.String()), + validRanges: t.Optional(t.Record(t.String(), t.Object({ min: t.Number(), max: t.Number() }))), + placeholderValues: t.Optional(t.Array(t.Union([t.String(), t.Number()]))), + missingTokens: t.Optional(t.Array(t.String())), + }) + ), + } + ) + + // Apply a user-selected batch of FixProposals (the checklist UI sends this + // when the user clicks "Apply Selected"). Each entry in `decisions` is one + // checkbox row: { issueId, verdict, remember? }. + .post( + "/:id/dataguard/apply-batch", + async ({ params: { id }, body, set }) => { + const agent = getAgent(id); + const session = agent.getDataGuardSession(); + const { applyFix } = await import("./agent/tools/dataguard/apply-fix"); + + // `remember` only applies when the user approves a fix — it adds the + // issueType to autoAllowRules so future similar fixes are pre-approved. + // Pairing it with deny is nonsense and would silently teach the agent + // an unintended rule, so we reject the whole batch (#12). + const badRemember = body.decisions.find(d => d.verdict === "deny" && d.remember === true); + if (badRemember) { + set.status = 400; + return { + error: `decision for issueId="${badRemember.issueId}" combines verdict="deny" with remember=true; remember only applies to allow.`, + }; + } + + // Belt-and-suspenders rejection of legacy fields (e.g., #11a's + // `modifiedAction`). The typebox schema sets additionalProperties:false, + // but Elysia's body parser sometimes strips unknown keys before + // validation; this explicit check guarantees an honest 400. 
+ const KNOWN_KEYS = new Set(["issueId", "verdict", "remember"]); + const rawBody = body as unknown as { decisions: Array> }; + for (let i = 0; i < rawBody.decisions.length; i++) { + const entry = rawBody.decisions[i]; + const extras = Object.keys(entry).filter(k => !KNOWN_KEYS.has(k)); + if (extras.length > 0) { + set.status = 400; + return { + error: `decision at index ${i} has unknown field(s): ${extras.join(", ")}. Allowed: issueId, verdict, remember.`, + }; + } + } + + const results: Array<{ + issueId: string; + verdict: string; + applied: boolean; + rowsAffected: number; + error?: string; + }> = []; + + let dataset = session.getDataset(); + if (!dataset) return { error: "No dataset loaded." }; + + for (const decision of body.decisions) { + const proposal = session.getProposal(decision.issueId); + if (!proposal) { + results.push({ + issueId: decision.issueId, + verdict: decision.verdict, + applied: false, + rowsAffected: 0, + error: "no proposal for this issueId — call /scan first", + }); + continue; + } + if (decision.verdict === "deny") { + session.recordDecision({ proposal, verdict: "deny", applied: false }); + results.push({ issueId: decision.issueId, verdict: "deny", applied: false, rowsAffected: 0 }); + continue; + } + try { + // Thread the session's scan-time missingTokens into applyFix so that + // user-configured tokens (e.g. ["xyz"]) are treated as missing by + // impute, matching what the profiler flagged. + const out = applyFix(dataset, proposal, { + missingTokens: session.getScanOptions().missingTokens, + }); + dataset = out.dataset; + session.updateDataset(dataset); + session.recordDecision({ + proposal, + verdict: decision.verdict, + applied: true, + }); + if (decision.remember) { + session.addAutoAllowRule(proposal.issueType); + } + results.push({ + issueId: decision.issueId, + verdict: decision.verdict, + applied: true, + rowsAffected: out.rowsAffected, + }); + } catch (e: unknown) { + const msg = e instanceof Error ? 
e.message : String(e); + results.push({ + issueId: decision.issueId, + verdict: decision.verdict, + applied: false, + rowsAffected: 0, + error: msg, + }); + } + } + // Verification pass: re-run the profiler on the cleaned dataset so the + // UI can surface anything the proposals didn't actually fix. These are + // genuine leftovers (denied fixes, anything impute couldn't compute, + // etc.). The earlier split into "acknowledged" (flag-for-review) is + // gone because the flag op kind is gone — every proposal now produces a + // real concrete change. + let residualIssues: DataQualityIssue[] = []; + if (dataset) { + const { profileDataset } = await import("./agent/tools/dataguard/profile-dataset"); + residualIssues = profileDataset(dataset, session.getScanOptions()); + } + return { + applied: results.filter(r => r.applied).length, + denied: results.filter(r => r.verdict === "deny").length, + failed: results.filter(r => !r.applied && r.verdict !== "deny").length, + datasetRowCount: dataset?.rows.length ?? 0, + results, + residualIssues, + residualCount: residualIssues.length, + }; + }, + { + body: t.Object({ + decisions: t.Array( + t.Object( + { + issueId: t.String(), + // "modify" was cut by #11a — body schema must reject it (the + // legacy handler executed the original proposalParams anyway and + // only logged the user's free-text, which silently lied to users). + verdict: t.Union([t.Literal("allow"), t.Literal("deny")]), + remember: t.Optional(t.Boolean()), + }, + // Reject legacy fields like `modifiedAction` outright instead of + // silently dropping them — callers that still send them are buggy. + { additionalProperties: false } + ) + ), + }), + } + ) + + // Return the in-memory cleaned dataset as a CSV blob. The frontend uses this + // after "Apply selected" to upload the cleaned data back as a new dataset + // version, then auto-runs the workflow. 
+ .get("/:id/dataguard/export-csv", ({ params: { id }, set }) => { + const agent = getAgent(id); + const dataset = agent.getDataGuardSession().getDataset(); + if (!dataset) { + set.status = 404; + return "No dataset loaded."; + } + const escape = (v: unknown): string => { + if (v === null || v === undefined) return ""; + const s = String(v); + return /[",\n\r]/.test(s) ? `"${s.replace(/"/g, '""')}"` : s; + }; + const lines: string[] = []; + lines.push(dataset.columns.map(escape).join(",")); + for (const row of dataset.rows) { + lines.push(dataset.columns.map(c => escape(row[c])).join(",")); + } + set.headers["content-type"] = "text/csv; charset=utf-8"; + return lines.join("\n"); + }) + .get("/:id/dataguard/session", ({ params: { id } }) => { const agent = getAgent(id); const session = agent.getDataGuardSession(); @@ -354,7 +587,6 @@ const agentsRouter = new Elysia({ prefix: "/agents" }) datasetColumnCount: dataset?.columns.length ?? 0, issues: session.getIssues(), decisionLog: session.getDecisionLog(), - flaggedRows: session.getFlaggedRows(), autoAllowRules: session.getAutoAllowRules(), }; }) @@ -442,9 +674,10 @@ interface WsMessage { messageSource?: "chat" | "feedback"; // Fields below carry the user's verdict on a pending-approval step. // Used when type === "decision". See agent/tools/dataguard/with-approval.ts. + // "modify" verdict was cut by #11a (it silently lied — the handler ran the + // original proposalParams and just logged the user's free-text). stepId?: string; - verdict?: "allow" | "deny" | "modify"; - modifiedAction?: string; + verdict?: "allow" | "deny"; remember?: boolean; } @@ -515,7 +748,10 @@ function broadcastToAgent(agentId: string, message: WsOutgoingMessage): void { } export function buildApp() { - return new Elysia() + // `normalize: false` so body schemas with additionalProperties:false can + // reject unknown fields (Elysia 1.4 silently strips them by default — see + // the #11a modify-reject tests). 
+ return new Elysia({ normalize: false }) .use(cors()) .group(env.API_PREFIX, app => app @@ -582,10 +818,27 @@ export function buildApp() { ); return; } + if (msg.verdict !== "allow" && msg.verdict !== "deny") { + ws.send( + JSON.stringify({ + type: "error", + error: `verdict must be "allow" or "deny" (got "${msg.verdict}")`, + }) + ); + return; + } + if (msg.verdict === "deny" && msg.remember === true) { + ws.send( + JSON.stringify({ + type: "error", + error: "remember=true only applies to allow decisions", + }) + ); + return; + } const resolved = agent.resolveDecision(msg.stepId, { stepId: msg.stepId, verdict: msg.verdict, - modifiedAction: msg.modifiedAction, remember: msg.remember, }); wsLog.info( @@ -663,6 +916,12 @@ export function _resetAgentStoreForTests(): void { agentCounter = 0; } +// Reach an agent by id from a test so a test can seed its DataGuardSession +// directly (avoids running the LLM-backed /scan to set up state). +export function _getAgentForTests(id: string): TexeraAgent | undefined { + return agentStore.get(id); +} + function printStartupMessage(app: ReturnType) { const LINE = "=".repeat(60); console.log(LINE); diff --git a/agent-service/src/types/dataguard.test.ts b/agent-service/src/types/dataguard.test.ts index 53047e08879..534e657ad06 100644 --- a/agent-service/src/types/dataguard.test.ts +++ b/agent-service/src/types/dataguard.test.ts @@ -113,6 +113,8 @@ describe("DataGuard type shapes", () => { }; expect(entry.userDecision).toBe("allow"); expect(entry.appliedAt).toBeDefined(); + // @ts-expect-error modifiedAction was cut from DecisionLogEntry by #11a; + // this assertion locks the absence of the property at the type level. 
expect(entry.modifiedAction).toBeUndefined(); }); @@ -131,23 +133,6 @@ describe("DataGuard type shapes", () => { expect(entry.appliedAt).toBeUndefined(); }); - test("DecisionLogEntry: modified — carries modifiedAction", () => { - const entry: DecisionLogEntry = { - decisionId: "dec-3", - timestamp: "2026-05-14T12:02:00.000Z", - issueType: "missing_value", - targetRowCount: 17, - proposedAction: "Impute missing glucose with group median", - userDecision: "modify", - modifiedAction: "Flag for manual review", - reason: "Imbalance across groups makes imputation risky.", - confidence: "medium", - appliedAt: "2026-05-14T12:02:05.000Z", - }; - expect(entry.userDecision).toBe("modify"); - expect(entry.modifiedAction).toBe("Flag for manual review"); - }); - test("AutoAllowRule: per-issue-type policy", () => { const rule: AutoAllowRule = { ruleId: "rule-1", @@ -164,17 +149,6 @@ describe("DataGuard type shapes", () => { remember: true, }; expect(decision.remember).toBe(true); - expect(decision.modifiedAction).toBeUndefined(); - }); - - test("PermissionDecision: modify with modifiedAction", () => { - const decision: PermissionDecision = { - stepId: "step-42", - verdict: "modify", - modifiedAction: "Flag for manual review instead of impute", - }; - expect(decision.verdict).toBe("modify"); - expect(decision.modifiedAction).toBeDefined(); }); test("PermissionDecision: deny", () => { @@ -186,13 +160,12 @@ describe("DataGuard type shapes", () => { }); test("Literal unions accept all documented members", () => { - const risks: RiskTier[] = ["low", "medium", "high"]; + const risks: RiskTier[] = ["low", "medium", "high", "warning"]; const confidences: Confidence[] = ["low", "medium", "high"]; const issueTypes: IssueType[] = [ "placeholder_value", "missing_value", "duplicate_id", - "out_of_range", "outlier", "inconsistent_label", ]; @@ -200,7 +173,6 @@ describe("DataGuard type shapes", () => { "replace_value", "drop_rows", "impute", - "flag", "standardize", "trim_whitespace", 
"rename_column", @@ -208,14 +180,13 @@ describe("DataGuard type shapes", () => { const verdicts: Verdict[] = [ "allow", "deny", - "modify", "auto_allow_low_risk", "auto_allow_remembered", ]; - expect(risks).toHaveLength(3); + expect(risks).toHaveLength(4); expect(confidences).toHaveLength(3); - expect(issueTypes).toHaveLength(6); - expect(opKinds).toHaveLength(7); - expect(verdicts).toHaveLength(5); + expect(issueTypes).toHaveLength(5); + expect(opKinds).toHaveLength(6); + expect(verdicts).toHaveLength(4); }); }); diff --git a/agent-service/src/types/dataguard.ts b/agent-service/src/types/dataguard.ts index d5b17ac1654..6eb113dd080 100644 --- a/agent-service/src/types/dataguard.ts +++ b/agent-service/src/types/dataguard.ts @@ -21,15 +21,21 @@ // DataGuard tools (profile_dataset / suggest_fix / apply_fix / write_decision_log), // the agent's permission-gating layer, and the chat-panel approval UI. -export type RiskTier = "low" | "medium" | "high"; +// `warning` marks a fix the agent thinks is risky enough to recommend manual +// review — UI defaults the checkbox to unchecked and renders an orange badge. +// The fix itself is still concrete and applicable (no more no-op "flag" ops). +export type RiskTier = "low" | "medium" | "high" | "warning"; export type Confidence = "low" | "medium" | "high"; +// `outlier` is the validRanges-based detector. Earlier there was a separate +// z-score "outlier" detector that flagged anything beyond ±3σ — too aggressive +// (it removed legitimately large but consecutive readings), so it was dropped. +// The remaining detector requires the user to supply a hard min/max per column. 
export type IssueType = | "placeholder_value" | "missing_value" | "duplicate_id" - | "out_of_range" | "outlier" | "inconsistent_label"; @@ -37,15 +43,17 @@ export type FixOperationKind = | "replace_value" | "drop_rows" | "impute" - | "flag" | "standardize" | "trim_whitespace" | "rename_column"; +// "modify" was removed for MVP (#11a) to avoid silent fallback — the legacy +// handler recorded a user-supplied action override in the log but always +// executed the original proposal.operationParams. Revisit post-hackathon +// with a real natural-language → operationParams parser. export type Verdict = | "allow" | "deny" - | "modify" | "auto_allow_low_risk" | "auto_allow_remembered"; @@ -78,7 +86,6 @@ export interface FixProposal { export interface PermissionDecision { stepId: string; verdict: Verdict; - modifiedAction?: string; remember?: boolean; } @@ -89,7 +96,6 @@ export interface DecisionLogEntry { targetRowCount: number; proposedAction: string; userDecision: Verdict; - modifiedAction?: string; reason: string; confidence: Confidence; appliedAt?: string; diff --git a/common/config/src/main/resources/gui.conf b/common/config/src/main/resources/gui.conf index d58d94ac7b9..9bef054a404 100644 --- a/common/config/src/main/resources/gui.conf +++ b/common/config/src/main/resources/gui.conf @@ -109,7 +109,7 @@ gui { active-time-in-minutes = ${?GUI_WORKFLOW_WORKSPACE_ACTIVE_TIME_IN_MINUTES} # whether AI copilot feature is enabled - copilot-enabled = false + copilot-enabled = true copilot-enabled = ${?GUI_WORKFLOW_WORKSPACE_COPILOT_ENABLED} # the limit of columns to be displayed in the result table diff --git a/frontend/src/app/workspace/component/agent/agent-panel/agent-panel.component.ts b/frontend/src/app/workspace/component/agent/agent-panel/agent-panel.component.ts index 34f88ced675..914f7306031 100644 --- a/frontend/src/app/workspace/component/agent/agent-panel/agent-panel.component.ts +++ b/frontend/src/app/workspace/component/agent/agent-panel/agent-panel.component.ts 
@@ -21,7 +21,6 @@ import { Component, HostListener, Input, OnDestroy, OnInit, OnChanges, SimpleCha import { UntilDestroy, untilDestroyed } from "@ngneat/until-destroy"; import { NzResizeEvent, NzResizableDirective, NzResizeHandlesComponent } from "ng-zorro-antd/resizable"; import { AgentService, AgentInfo } from "../../../service/agent/agent.service"; -import { DataGuardAutoTriggerService } from "../../../service/agent/data-guard-auto-trigger.service"; import { WorkflowActionService } from "../../../service/workflow-graph/model/workflow-action.service"; import { NotificationService } from "../../../../common/service/notification/notification.service"; import { calculateTotalTranslate3d } from "../../../../common/util/panel-dock"; @@ -99,8 +98,7 @@ export class AgentPanelComponent implements OnInit, OnDestroy, OnChanges { constructor( private agentService: AgentService, private workflowActionService: WorkflowActionService, - private notificationService: NotificationService, - private dataGuardAutoTrigger: DataGuardAutoTriggerService + private notificationService: NotificationService ) {} ngOnInit(): void { @@ -128,20 +126,6 @@ export class AgentPanelComponent implements OnInit, OnDestroy, OnChanges { this.tryActivateAgentFromInput(); }); - // DataGuard auto-trigger: when a dataset-reading operator is added to the - // workflow, notify the user that DataGuard is ready to scan. Full - // integration (create agent + POST /agents/:id/dataguard/dataset + send - // initial "scan this dataset" message) is a follow-up that needs the - // referenced CSV bytes — for now we surface the trigger so the - // §5 storyboard "no manual invocation" beat is in place. - this.dataGuardAutoTrigger - .getDatasetAddedStream() - .pipe(untilDestroyed(this)) - .subscribe(op => { - this.notificationService.info( - `DataGuard ready to scan ${op.operatorType}. 
Open the chat panel and ask "scan this dataset for quality issues".` - ); - }); } ngOnChanges(changes: SimpleChanges): void { diff --git a/frontend/src/app/workspace/component/agent/agent-panel/permission-prompt/permission-prompt.component.html b/frontend/src/app/workspace/component/agent/agent-panel/permission-prompt/permission-prompt.component.html index e2544f8434d..58963b0893a 100644 --- a/frontend/src/app/workspace/component/agent/agent-panel/permission-prompt/permission-prompt.component.html +++ b/frontend/src/app/workspace/component/agent/agent-panel/permission-prompt/permission-prompt.component.html @@ -24,28 +24,15 @@ -
+
- -
- -
- -
- - -
diff --git a/frontend/src/app/workspace/component/agent/agent-panel/permission-prompt/permission-prompt.component.scss b/frontend/src/app/workspace/component/agent/agent-panel/permission-prompt/permission-prompt.component.scss index f0f32539e43..07dc1c545a3 100644 --- a/frontend/src/app/workspace/component/agent/agent-panel/permission-prompt/permission-prompt.component.scss +++ b/frontend/src/app/workspace/component/agent/agent-panel/permission-prompt/permission-prompt.component.scss @@ -45,6 +45,11 @@ color: #cf1322; } +.dg-permission__tier--warning { + background: #ffe7ba; + color: #ad4e00; +} + .dg-permission__body { margin: 8px 0; } @@ -67,12 +72,3 @@ margin-top: 8px; flex-wrap: wrap; } - -.dg-permission__modify { - margin-top: 8px; - - textarea { - width: 100%; - margin-bottom: 8px; - } -} diff --git a/frontend/src/app/workspace/component/agent/agent-panel/permission-prompt/permission-prompt.component.ts b/frontend/src/app/workspace/component/agent/agent-panel/permission-prompt/permission-prompt.component.ts index 7423d9aefb8..b5b0315dd4b 100644 --- a/frontend/src/app/workspace/component/agent/agent-panel/permission-prompt/permission-prompt.component.ts +++ b/frontend/src/app/workspace/component/agent/agent-panel/permission-prompt/permission-prompt.component.ts @@ -19,9 +19,7 @@ import { Component, Input } from "@angular/core"; import { NgIf } from "@angular/common"; -import { FormsModule } from "@angular/forms"; import { NzButtonComponent } from "ng-zorro-antd/button"; -import { NzInputDirective } from "ng-zorro-antd/input"; import { ReActStep } from "../../../../service/agent/agent-types"; import { AgentService } from "../../../../service/agent/agent.service"; @@ -34,7 +32,7 @@ import { AgentService } from "../../../../service/agent/agent.service"; @Component({ selector: "texera-permission-prompt", standalone: true, - imports: [NgIf, FormsModule, NzButtonComponent, NzInputDirective], + imports: [NgIf, NzButtonComponent], templateUrl: 
"./permission-prompt.component.html", styleUrls: ["./permission-prompt.component.scss"], }) @@ -42,8 +40,6 @@ export class PermissionPromptComponent { @Input() step!: ReActStep; @Input() agentId!: string; - public isModifying = false; - public modifiedAction = ""; public submitted = false; constructor(private readonly agentService: AgentService) {} @@ -59,22 +55,4 @@ export class PermissionPromptComponent { this.submitted = true; this.agentService.sendDecision(this.agentId, this.step.id, "deny"); } - - public openModify(): void { - if (this.submitted) return; - this.isModifying = true; - this.modifiedAction = this.step.pendingApproval?.proposal.action ?? ""; - } - - public submitModify(): void { - if (this.submitted) return; - this.submitted = true; - this.agentService.sendDecision(this.agentId, this.step.id, "modify", { - modifiedAction: this.modifiedAction, - }); - } - - public cancelModify(): void { - this.isModifying = false; - } } diff --git a/frontend/src/app/workspace/component/dataguard-checklist/dataguard-checklist.component.html b/frontend/src/app/workspace/component/dataguard-checklist/dataguard-checklist.component.html new file mode 100644 index 00000000000..f50626061a0 --- /dev/null +++ b/frontend/src/app/workspace/component/dataguard-checklist/dataguard-checklist.component.html @@ -0,0 +1,160 @@ + +
+ +
+ + + DataGuard + + {{ statusBadge() }} + +
+ + +
+ + We checked {{ scan.datasetRows }} rows in + {{ scan.datasetSource }}. + +
{{ scan.message }}
+
+ + +
+  Looking for problems in your data… +
+ + +
+ + Your data looks good — nothing to fix. +
+ + +
+ + {{ c.count }} {{ c.label }}{{ c.count === 1 ? "" : "s" }}· + +
+ + +
+ {{ selectedCount }} to fix · {{ deniedCount }} skipped + + + + +
+ + +
    +
  • +
    + + + {{ riskTierLabel(entry) }} + +
    + +
    + +
    {{ entry.proposal.reason }}
    +
    + ⚠ We couldn't suggest a fix for this one. You can skip it. +
    +
    + + +
    + + +
    +
  • +
+ + +
+ +
+ +
+ + +
+
+ + + diff --git a/frontend/src/app/workspace/component/dataguard-checklist/dataguard-checklist.component.scss b/frontend/src/app/workspace/component/dataguard-checklist/dataguard-checklist.component.scss new file mode 100644 index 00000000000..efc9146f712 --- /dev/null +++ b/frontend/src/app/workspace/component/dataguard-checklist/dataguard-checklist.component.scss @@ -0,0 +1,319 @@ +.dg-panel { + position: fixed; + bottom: 100px; + right: 80px; + width: 480px; + max-height: 70vh; + background: #fff; + border: 1px solid #d9d9d9; + border-radius: 8px; + box-shadow: 0 6px 20px rgba(0, 0, 0, 0.15); + z-index: 10; + display: flex; + flex-direction: column; + font-size: 0.85rem; +} + +.dg-panel__header { + display: flex; + align-items: center; + gap: 8px; + padding: 10px 12px; + border-bottom: 1px solid #f0f0f0; + background: #fafafa; + border-radius: 8px 8px 0 0; + // Header is the cdkDragHandle; show a move cursor so it's discoverable. + cursor: move; + + // The close button inside the header should still feel clickable, not draggable. + .dg-panel__close { + cursor: pointer; + } +} + +// While dragging, the CDK adds .cdk-drag-preview to the moving copy. 
+.dg-panel.cdk-drag-preview { + box-shadow: 0 12px 28px rgba(0, 0, 0, 0.25); +} + +.dg-panel__title { + font-weight: 600; + font-size: 0.95rem; + display: flex; + align-items: center; + gap: 6px; + flex: 1; +} + +.dg-panel__status { + font-size: 0.75rem; + color: #8c8c8c; + background: #f0f5ff; + padding: 2px 8px; + border-radius: 12px; +} + +.dg-panel__close { + margin-left: auto; +} + +.dg-panel__sub { + padding: 8px 12px; + border-bottom: 1px solid #f5f5f5; + font-size: 0.78rem; + color: #595959; + + code { + background: #f5f5f5; + padding: 1px 6px; + border-radius: 3px; + font-size: 0.75rem; + } +} + +.dg-panel__message { + margin-top: 4px; + font-style: italic; + color: #1890ff; +} + +.dg-row__category { + display: inline-block; + font-size: 0.65rem; + font-weight: 600; + text-transform: uppercase; + letter-spacing: 0.5px; + padding: 2px 8px; + border-radius: 4px; + background: #e6f4ff; + color: #003eb3; + margin-right: 6px; +} + +.dg-panel__empty { + padding: 24px; + text-align: center; + color: #8c8c8c; + + &--clean { + color: #389e0d; + } +} + +.dg-panel__categories { + padding: 6px 12px; + border-bottom: 1px solid #f5f5f5; + display: flex; + flex-wrap: wrap; + gap: 4px; + font-size: 0.74rem; + color: #003eb3; + background: #f0f8ff; +} + +.dg-panel__category-chip { + display: inline-flex; + align-items: center; + gap: 4px; + font-weight: 500; +} + +.dg-panel__category-sep { + margin: 0 2px; + color: #adc6ff; + font-weight: 400; +} + +.dg-panel__bulk { + padding: 6px 12px; + border-bottom: 1px solid #f5f5f5; + display: flex; + justify-content: space-between; + align-items: center; + font-size: 0.75rem; + color: #595959; +} + +.dg-panel__bulk-actions { + display: flex; + gap: 6px; +} + +.dg-panel__list { + list-style: none; + margin: 0; + padding: 0; + overflow-y: auto; + flex: 1; +} + +.dg-row { + padding: 10px 12px; + border-bottom: 1px solid #f5f5f5; + transition: background 0.15s; + + &:last-child { + border-bottom: none; + } + + &--allow { + background: 
#f6ffed; + } + + &--deny { + background: #fff1f0; + opacity: 0.75; + } + + &--error { + background: #fff7e6; + } +} + +.dg-row__head { + display: flex; + align-items: flex-start; + justify-content: space-between; + gap: 8px; +} + +.dg-row__action { + font-weight: 500; + margin-left: 4px; +} + +.dg-row__tier { + font-size: 0.7rem; + font-weight: 600; + text-transform: uppercase; + padding: 2px 8px; + border-radius: 4px; + letter-spacing: 0.5px; + flex-shrink: 0; + + &--low { + background: #d9f7be; + color: #389e0d; + } + + &--medium { + background: #fff1b8; + color: #d48806; + } + + &--high { + background: #ffccc7; + color: #cf1322; + } + + &--warning { + background: #ffe7ba; + color: #ad4e00; + } +} + +.dg-row__details { + margin: 6px 0 0 26px; + font-size: 0.75rem; + color: #595959; + line-height: 1.4; +} + +.dg-row__field { + margin: 2px 0; +} + +.dg-row__field--error { + color: #cf1322; +} + +// "Locate this issue in the result panel" affordance — visually a flat link +// that sits on the same line as the column/row-count detail, so users don't +// see a second control to think about. 
+.dg-row__locate { + display: inline-flex; + align-items: center; + gap: 4px; + padding: 0; + margin: 2px 0; + background: transparent; + border: none; + font: inherit; + color: #1677ff; + cursor: pointer; + text-align: left; + + i { + font-size: 0.85rem; + } + + &:hover { + text-decoration: underline; + } + + &:focus-visible { + outline: 1px dashed #1677ff; + outline-offset: 2px; + } +} + +.dg-row__foot { + margin-top: 6px; + margin-left: 22px; + display: flex; + gap: 4px; + align-items: center; +} + +.dg-row__remember { + margin-left: 8px; + font-size: 0.72rem; + color: #8c8c8c; +} + +.dg-panel__foot { + padding: 10px 12px; + border-top: 1px solid #f0f0f0; + background: #fafafa; + border-radius: 0 0 8px 8px; + text-align: right; +} + +.dg-panel__foot--split { + display: flex; + justify-content: space-between; + align-items: center; + gap: 8px; +} + +// Floating "open DataGuard" button, sits where the panel used to be. +// Lives on the canvas overlay layer so it doesn't interfere with operators. +.dg-floater { + position: fixed; + bottom: 100px; + right: 80px; + width: 44px; + height: 44px; + border-radius: 50%; + border: 1px solid #91caff; + background: #e6f4ff; + cursor: pointer; + z-index: 10; + display: flex; + align-items: center; + justify-content: center; + box-shadow: 0 4px 12px rgba(0, 0, 0, 0.12); + transition: background 0.15s, transform 0.1s; + + i { + font-size: 1.4rem; + } + + &:hover { + background: #bae0ff; + } + + &:active { + transform: scale(0.95); + } +} diff --git a/frontend/src/app/workspace/component/dataguard-checklist/dataguard-checklist.component.ts b/frontend/src/app/workspace/component/dataguard-checklist/dataguard-checklist.component.ts new file mode 100644 index 00000000000..d55d1eb16d7 --- /dev/null +++ b/frontend/src/app/workspace/component/dataguard-checklist/dataguard-checklist.component.ts @@ -0,0 +1,369 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import { Component, OnDestroy, OnInit } from "@angular/core"; +import { NgFor, NgIf } from "@angular/common"; +import { Subscription } from "rxjs"; +import { CdkDrag, CdkDragHandle } from "@angular/cdk/drag-drop"; +import { NzIconDirective } from "ng-zorro-antd/icon"; +import { NzButtonComponent } from "ng-zorro-antd/button"; +import { NzCheckboxComponent } from "ng-zorro-antd/checkbox"; +import { + DataGuardResultsService, + DataGuardScanResult, + ChecklistEntry, +} from "../../service/agent/data-guard-results.service"; +import { DataGuardAutoTriggerService } from "../../service/agent/data-guard-auto-trigger.service"; +import { DataGuardSettingsService } from "../../service/agent/data-guard-settings.service"; +import { DataGuardRowNavigatorService } from "../../service/agent/data-guard-row-navigator.service"; +import { WorkflowActionService } from "../../service/workflow-graph/model/workflow-action.service"; +import { NotificationService } from "../../../common/service/notification/notification.service"; + +/** + * Standalone DataGuard checklist panel. + * + * UX (per user request): + * - When DataGuard auto-trigger detects a dataset and runs /scan, the + * issues + proposals land in DataGuardResultsService. 
+ * - This component renders them as a checklist with per-row Allow / Deny + * pickers (low-risk pre-checked Allow). + * - Click "Apply Selected" to send the batch decision to + * /api/agents/:id/dataguard/apply-batch. The agent's chat is NOT + * involved — so the LLM can't accidentally call deleteOperator or modify + * the workflow canvas. + * + * Visibility: + * - Floating panel docked bottom-right, slides up when state ≠ "idle". + * - Collapsed via close button → state goes back to "idle". + */ +@Component({ + selector: "texera-dataguard-checklist", + standalone: true, + imports: [NgIf, NgFor, NzIconDirective, NzButtonComponent, NzCheckboxComponent, CdkDrag, CdkDragHandle], + templateUrl: "./dataguard-checklist.component.html", + styleUrls: ["./dataguard-checklist.component.scss"], +}) +export class DataGuardChecklistComponent implements OnInit, OnDestroy { + public scan: DataGuardScanResult = { + agentId: "", + state: "idle", + entries: [], + datasetSource: "", + datasetRows: 0, + datasetColumns: 0, + }; + + // Cached row-verdict tallies. Recomputed once per state push (see ngOnInit) + // rather than three times per Angular change-detection tick. Default-CD + // means every event walks the template, so a get() that filters the entries + // array fires for *each* row in *each* tick — quadratic with a fat list. + public selectedCount = 0; + public deniedCount = 0; + public pendingCount = 0; + + private sub?: Subscription; + // Owns the workflow-graph subscription that powers auto-trigger orchestration. + // Lives here (not in agent-panel) so the gate is "is the checklist mounted?" + // — the only consumer of the auto-trigger output. + private orchestrationSub?: Subscription; + + // Per-row cursor for the "📍" locate-cycle affordance. Keyed by `issueId` + // so each detector row gets an independent cursor. 
Cleared on every fresh + // scan push — different issueIds means stale keys would just become + // garbage, but a hard reset keeps memory bounded and the behaviour + // predictable. The cursor value is the index of the *next* click — + // i.e., on entry it is 0, and after navigating to indices[0] it becomes 1. + private locateCursors = new Map<string, number>(); + + constructor( + private readonly results: DataGuardResultsService, + private readonly autoTrigger: DataGuardAutoTriggerService, + private readonly settings: DataGuardSettingsService, + private readonly workflowActionService: WorkflowActionService, + private readonly rowNavigator: DataGuardRowNavigatorService, + private readonly notificationService: NotificationService + ) {} + + // ---------------- floating reopen button ---------------- + + /** Shield toggle in the toolbar gates the floater. When the user explicitly + * turns DataGuard off for this workflow, the floater must disappear too — + * otherwise the "OFF" toggle would be a lie. */ + public get shieldOn(): boolean { + const wid = this.workflowActionService.getWorkflowMetadata()?.wid; + if (wid === undefined) return true; + return this.settings.isEnabled(wid); + } + + /** Visible iff: panel is closed (idle) AND the shield toggle is on. */ + public get showFloater(): boolean { + return this.scan.state === "idle" && this.shieldOn; + } + + /** User clicked the floating DataGuard icon. Always triggers a fresh scan + * of whatever dataset operator is on the canvas — that's what the user + * picked when we asked "click behavior?". */ + public onFloaterClick(): void { + void this.autoTrigger.rescanAny(); + } + + ngOnInit(): void { + // Track the issueId set of the previous push. `updateEntry` rebuilds the + // entries array on every verdict toggle (`.map(...)`), so identity-compare + // would spuriously reset cursors mid-review. Instead, reset only when the + // *set of issueIds* changes — that's the actual "fresh scan" signal.
+ let lastIssueIdsKey: string | undefined; + this.sub = this.results.getState$().subscribe(s => { + const key = s.entries.map(e => e.issueId).join("|"); + if (key !== lastIssueIdsKey) { + this.locateCursors.clear(); + lastIssueIdsKey = key; + } + this.scan = s; + // Tally once per state push instead of three full-walks per CD tick. + let allow = 0, + deny = 0, + pending = 0; + for (const entry of s.entries) { + if (entry.verdict === "allow") allow++; + else if (entry.verdict === "deny") deny++; + else pending++; + } + this.selectedCount = allow; + this.deniedCount = deny; + this.pendingCount = pending; + }); + // Subscribe to operator-add / property-change so dropping a dataset + // operator on the canvas triggers /scan via the auto-trigger pipeline. + // Without this, the checklist never opens on its own. + this.orchestrationSub = this.autoTrigger.startOrchestration(); + } + + ngOnDestroy(): void { + this.sub?.unsubscribe(); + this.orchestrationSub?.unsubscribe(); + } + + // ---------------- visibility helpers ---------------- + + public get isOpen(): boolean { + return this.scan.state !== "idle"; + } + + // ---------------- row actions ---------------- + + public onToggleAllow(entry: ChecklistEntry, checked: boolean): void { + this.results.updateEntry(entry.issueId, { verdict: checked ? "allow" : "pending" }); + } + + public onDeny(entry: ChecklistEntry): void { + this.results.updateEntry(entry.issueId, { verdict: "deny" }); + } + + public onToggleRemember(entry: ChecklistEntry, checked: boolean): void { + this.results.updateEntry(entry.issueId, { remember: checked }); + } + + // ---------------- panel actions ---------------- + + /** Mark every pending row as Allow. Skips already-denied. */ + public onSelectAll(): void { + for (const e of this.scan.entries) { + if (e.verdict === "pending") this.results.updateEntry(e.issueId, { verdict: "allow" }); + } + } + + /** Mark every pending row as Deny. 
*/ + public onDenyAll(): void { + for (const e of this.scan.entries) { + if (e.verdict === "pending") this.results.updateEntry(e.issueId, { verdict: "deny" }); + } + } + + /** Apply the user's selection. Backend bypasses the LLM / chat entirely. */ + public async onApplySelected(): Promise<void> { + const decisions = this.scan.entries + .filter((e): e is ChecklistEntry & { verdict: "allow" | "deny" } => e.verdict !== "pending") + .map(e => ({ + issueId: e.issueId, + verdict: e.verdict, + remember: e.remember, + })); + await this.autoTrigger.applyBatch(decisions); + } + + public onClose(): void { + this.results.reset(); + } + + public isRescanning = false; + + /** "Scan again": re-runs DataGuard on the current dataset version. After + * a previous Apply created v2, a re-scan + Apply produces v3 — letting the + * user iterate when the AI missed an issue on the first pass. */ + public async onRescan(): Promise<void> { + if (this.isRescanning) return; + this.isRescanning = true; + try { + await this.autoTrigger.rescanCurrent(); + } finally { + this.isRescanning = false; + } + } + + // ---------------- show-in-result-panel ---------------- + + /** + * Tooltip text for the "📍" button. Shows "Show next affected row (i of N)" + * where i is the 1-based index that the *next* click will navigate to. Falls + * back to a static label when row indices are unknown or empty. + */ + public locateTooltip(entry: ChecklistEntry): string { + const rowIndices = entry.issue.affectedRowIndices; + if (!rowIndices || rowIndices.length === 0) return "Show this row in the result panel"; + if (rowIndices.length === 1) return "Show the affected row in the result panel"; + const cursor = this.locateCursors.get(entry.issueId) ??
0; + const next = (cursor % rowIndices.length) + 1; + return `Show next affected row (${next} of ${rowIndices.length})`; + } + + /** + * "📍" affordance: focus the source operator's result panel and jump to the + * next affected row, cycling through `affectedRowIndices` with a per-row + * cursor. Repeated clicks walk every affected row and wrap to the first. + * Length-1 rows re-emit the same navigator event so the result-panel pulse + * re-fires — the user has to *feel* that the click did something. + * + * Operator highlight + result-panel open are dispatched synchronously so the + * ResultTableFrameComponent has time to mount before we publish the row-nav + * event in a microtask — otherwise our subscriber on the table side may not + * exist yet on the first click after a panel close. + */ + public onShowInResultPanel(entry: ChecklistEntry): void { + const opId = this.scan.sourceOperatorId; + if (!opId) { + this.notificationService.warning("DataGuard: no source operator recorded for this scan."); + return; + } + const jointGraph = this.workflowActionService.getJointGraphWrapper(); + const operatorExists = this.workflowActionService + .getTexeraGraph() + .getAllOperators() + .some(op => op.operatorID === opId); + if (!operatorExists) { + this.notificationService.warning("DataGuard: the source operator was removed from the canvas."); + return; + } + + const currentlyHighlighted = jointGraph.getCurrentHighlightedOperatorIDs(); + if (!(currentlyHighlighted.length === 1 && currentlyHighlighted[0] === opId)) { + jointGraph.unhighlightOperators(...currentlyHighlighted); + jointGraph.highlightOperators(opId); + } + this.workflowActionService.openResultPanel(); + + const rowIndices = entry.issue.affectedRowIndices; + if (rowIndices === undefined) { + this.notificationService.info( + "DataGuard: opened the result panel — row indices weren't recorded for this issue." 
+ ); + return; + } + if (rowIndices.length === 0) { + this.notificationService.info( + "DataGuard: opened the result panel — no rows are affected by this issue." + ); + return; + } + // Cycle through affectedRowIndices: each click advances this row's cursor + // by one and wraps modulo length. Length-1 rows still emit so the result + // panel re-pulses on every click. Pure helper for testability. + const cursor = this.locateCursors.get(entry.issueId) ?? 0; + const step = DataGuardRowNavigatorService.nextCycleStep(rowIndices, cursor); + this.locateCursors.set(entry.issueId, step.nextCursor); + // Defer one microtask so the table frame mounts before we ask it to page. + queueMicrotask(() => + this.rowNavigator.navigate({ + operatorId: opId, + rowIndex: step.value, + column: entry.issue.column, + }) + ); + } + + // ---------------- display helpers ---------------- + + public riskTierLabel(entry: ChecklistEntry): string { + return entry.proposal?.riskTier ?? "—"; + } + + /** Human-readable category name for the row title. Maps the raw issueType + * enum to plain English so non-technical users see "Missing value" instead + * of "missing_value". */ + public categoryLabel(entry: ChecklistEntry): string { + return this.categoryLabelForType(entry.issue.issueType); + } + + /** Aggregate the current entries by category — drives the at-a-glance + * "2 Missing values · 3 Placeholder values" summary above the row list. */ + public categorySummary(): Array<{ label: string; count: number }> { + const counts = new Map<string, number>(); + for (const entry of this.scan.entries) { + const label = this.categoryLabel(entry); + counts.set(label, (counts.get(label) ??
0) + 1); + } + return Array.from(counts.entries()).map(([label, count]) => ({ label, count })); + } + + private categoryLabelForType(issueType: string): string { + switch (issueType) { + case "missing_value": + return "Missing value"; + case "placeholder_value": + return "Placeholder value"; + case "duplicate_id": + return "Duplicate row"; + case "outlier": + // The validRanges-based outlier — the earlier z-score variant was + // removed and "out_of_range" was renamed into this one. + return "Outlier"; + case "inconsistent_label": + return "Inconsistent label"; + default: + return issueType; + } + } + + public statusBadge(): string { + switch (this.scan.state) { + case "scanning": + return "Checking…"; + case "applying": + return "Fixing…"; + case "ready": + return `${this.scan.entries.length} to review`; + case "done": + return "Done"; + case "error": + return "Problem"; + default: + return ""; + } + } +} diff --git a/frontend/src/app/workspace/component/menu/menu.component.html b/frontend/src/app/workspace/component/menu/menu.component.html index a21e4d56429..8d0865b8e02 100644 --- a/frontend/src/app/workspace/component/menu/menu.component.html +++ b/frontend/src/app/workspace/component/menu/menu.component.html @@ -360,6 +360,28 @@ nzType="ellipsis"> + + + + - + -
+
Decision sent — waiting for the agent to continue.
diff --git a/frontend/src/app/workspace/component/dataguard-checklist/dataguard-checklist.component.html b/frontend/src/app/workspace/component/dataguard-checklist/dataguard-checklist.component.html index f50626061a0..4aec6974497 100644 --- a/frontend/src/app/workspace/component/dataguard-checklist/dataguard-checklist.component.html +++ b/frontend/src/app/workspace/component/dataguard-checklist/dataguard-checklist.component.html @@ -1,10 +1,38 @@ + -
+
-
+
- + DataGuard {{ statusBadge() }} @@ -15,27 +43,42 @@ class="dg-panel__close" (click)="onClose()" title="Dismiss checklist"> - +
- We checked {{ scan.datasetRows }} rows in - {{ scan.datasetSource }}. + We checked {{ scan.datasetRows }} rows in {{ scan.datasetSource }}. -
{{ scan.message }}
+
+ {{ scan.message }} +
-
-  Looking for problems in your data… +
+  Looking for problems in your data…
-
- +
+ Your data looks good — nothing to fix.
@@ -47,7 +90,11 @@ - {{ c.count }} {{ c.label }}{{ c.count === 1 ? "" : "s" }}· + {{ c.count }} {{ c.label }}{{ c.count === 1 ? "" : "s" }}·
@@ -57,13 +104,25 @@ class="dg-panel__bulk"> {{ selectedCount }} to fix · {{ deniedCount }} skipped - - + +
-
    +
    • {{ categoryLabel(entry) }} {{ entry.proposal?.action || entry.issue.description }} - - {{ riskTierLabel(entry) }} - + {{ riskTierLabel(entry) }}
@@ -90,11 +147,20 @@ class="dg-row__locate" (click)="onShowInResultPanel(entry)" [title]="locateTooltip(entry)"> - + In column {{ entry.issue.column }} · affects {{ entry.issue.affectedRowCount }} row(s) -
{{ entry.proposal.reason }}
-
+
+ {{ entry.proposal.reason }} +
+
⚠ We couldn't suggest a fix for this one. You can skip it.
@@ -107,7 +173,10 @@ nzSize="small" [disabled]="entry.verdict === 'deny' || !!entry.error" (click)="onDeny(entry)"> -  Skip +  Skip