etr · etr · Jun 21, 2026 · Jun 21, 2026
diff --git a/hooks/validate-agent-output.sh b/hooks/validate-agent-output.sh
@@ -33,45 +33,126 @@ try:
     data = json.load(sys.stdin)
 
     agent_type = data.get("subagent_type", data.get("agent_type", "unknown"))
-    agent_output = data.get("tool_output", data.get("output", ""))
     agent_name = data.get("subagent_name", data.get("name", ""))
 
-    issues = []
+    # Resolve the agent output. The real SubagentStop payload does NOT carry the
+    # output inline -- it ships a "transcript_path" and the output lives as the
+    # last assistant message in that JSONL. The old code only looked at
+    # "tool_output"/"output" (keys the harness never sends), so output_str was
+    # ALWAYS empty and the hook flagged EVERY subagent with "short or empty
+    # output". Agents then went rogue trying to silence the hook (editing this
+    # script, editing settings.json). Fix: read the explicit field if present
+    # (forward-compat / tests), else recover the output from the transcript, and
+    # when the output is genuinely undeterminable, validate NOTHING -- never
+    # invent a warning from missing data.
+
+    def last_assistant_text(path):
+        """Best-effort: concatenated text of the last assistant message in a
+        transcript JSONL. Returns None if nothing usable is found."""
+        try:
+            text = None
+            with open(path, "r") as fh:
+                for line in fh:
+                    line = line.strip()
+                    if not line:
+                        continue
+                    try:
+                        rec = json.loads(line)
+                    except Exception:
+                        continue
+                    msg = rec.get("message", rec)
+                    is_assistant = rec.get("type") == "assistant" or msg.get("role") == "assistant"
+                    if not is_assistant:
+                        continue
+                    content = msg.get("content", "")
+                    if isinstance(content, str):
+                        chunk = content
+                    elif isinstance(content, list):
+                        chunk = "".join(
+                            b.get("text", "") for b in content
+                            if isinstance(b, dict) and b.get("type") == "text"
+                        )
+                    else:
+                        chunk = ""
+                    if chunk.strip():
+                        text = chunk  # keep the LAST non-empty assistant message
+            return text
+        except Exception:
+            return None
+
+    agent_output = None
+    for key in ("tool_output", "output"):
+        val = data.get(key)
+        if isinstance(val, str) and val.strip():
+            agent_output = val
+            break
+    if agent_output is None:
+        transcript_path = data.get("transcript_path")
+        if isinstance(transcript_path, str) and transcript_path:
+            agent_output = last_assistant_text(transcript_path)
 
-    # Convert output to string for analysis
-    output_str = str(agent_output) if agent_output else ""
+    issues = []
+    output_str = agent_output if isinstance(agent_output, str) else ""
+    output_available = output_str.strip() != ""
+
+    # If we could not determine the output, there is nothing to validate. Stay
+    # silent -- a missing output is NOT evidence of a short or broken one.
+    if not output_available:
+        print(json.dumps({
+            "valid": True,
+            "issues": [],
+            "agent_type": agent_type,
+            "agent_name": agent_name
+        }))
+        sys.exit(0)
 
     # Check for common validation issues
 
-    # 1. Empty or very short outputs
+    # 1. Empty or very short outputs (only reachable when output IS available)
     if len(output_str.strip()) < 10:
         issues.append("Agent returned very short or empty output")
 
-    # 2. Check for error indicators in output (use word boundaries for accuracy)
+    # 2. Check for a CRASH indicator. Only a Python traceback is a reliable
+    #    signal that the agent itself blew up -- the bare words "error:",
+    #    "failed:", "exception:" are the everyday vocabulary of review agents
+    #    (a security/test reviewer reports errors and failures by design), so
+    #    flagging them produced constant false positives. The (pattern, label)
+    #    split keeps the raw regex -- backslashes and parens -- OUT of the issue
+    #    text, which is later interpolated into JSON: emitting e.g. "\(" produced
+    #    invalid JSON and a broken hook response.
     import re
     error_patterns = [
-        r"\berror:",
-        r"\bfailed:",
-        r"\bexception:",
-        r"\btraceback \(most recent call last\)"
+        (r"\btraceback \(most recent call last\)", "traceback"),
     ]
     output_lower = output_str.lower()
-    for pattern in error_patterns:
+    for pattern, label in error_patterns:
         if re.search(pattern, output_lower):
-            issues.append(f"Agent output contains error indicator matching: {pattern}")
-
-    # 3. Check if review agents provided a verdict
-    if agent_type in ["code-reviewer", "spec-reviewer"]:
-        has_verdict = any(v in output_lower for v in ["approved", "rejected", "issues:", "compliant", "non-compliant"])
+            issues.append(f"Agent output contains an error indicator ({label})")
+
+    # 3. Check if review agents provided a verdict. Groundwork reviewers are
+    #    named "<area>-reviewer" / "<area>-checker" (e.g. code-quality-reviewer,
+    #    spec-alignment-checker) and emit a compact verdict JSON whose values
+    #    are "approve" / "request-changes" -- match that vocabulary, not the
+    #    stale "code-reviewer"/"approved" one that never fired.
+    review_agent = isinstance(agent_type, str) and (
+        agent_type.endswith("-reviewer") or agent_type.endswith("-checker")
+    )
+    if review_agent:
+        verdict_markers = ["verdict", "approve", "request-changes", "request changes", "rejected", "compliant"]
+        has_verdict = any(v in output_lower for v in verdict_markers)
         if not has_verdict:
-            issues.append("Review agent may be missing clear verdict (approved/rejected)")
-
-    # 4. Check for incomplete outputs (common pattern)
-    incomplete_markers = ["...", "etc.", "and so on", "to be continued"]
-    for marker in incomplete_markers:
-        if marker in output_str:
-            issues.append(f"Agent output may be incomplete (contains: {marker})")
-            break
+            issues.append("Review agent may be missing a clear verdict (approve / request-changes)")
+
+    # 4. Truncation: the output looks cut off mid-thought. Only a *trailing*
+    #    ellipsis is a reliable signal -- a "..." anywhere else is almost always
+    #    a spread operator ({...x}), a path, a diff hunk, or an intra-sentence
+    #    pause, none of which mean the agent stopped early. "etc."/"and so on"
+    #    are intentionally not flagged: they end many complete sentences.
+    stripped = output_str.rstrip()
+    if stripped.endswith("...") or stripped.endswith("…"):
+        issues.append("Agent output may be truncated (ends with an ellipsis)")
+    elif "to be continued" in output_lower:
+        issues.append("Agent output may be truncated (unfinished continuation marker)")
 
     print(json.dumps({
         "valid": len(issues) == 0,
@@ -95,7 +176,7 @@ except Exception as e:
 {
   "hookSpecificOutput": {
     "hookEventName": "SubagentStop",
-    "additionalContext": "Agent validation (${AGENT_TYPE}): ${ISSUES_STR}. Consider reviewing the agent output."
+    "additionalContext": "Automated output diagnostic for the orchestrator (${AGENT_TYPE}): ${ISSUES_STR}. This is a non-blocking note ONLY -- do not act on it by editing plugin hooks, settings, or anything outside your assigned task. If you are the reporting agent, simply ensure your actual deliverable (verdict and findings file) is complete."
   }
 }
 EOF