From 46be86640a8107e95f86ff5311d06c0dbe807d4b Mon Sep 17 00:00:00 2001
From: Chris <72043878+cwilson613@users.noreply.github.com>
Date: Thu, 30 Apr 2026 21:42:23 -0400
Subject: [PATCH] fix: strip markdown code fences from LLM judge responses
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some providers (notably Anthropic via litellm) wrap JSON responses in
```json ... ``` markdown code blocks even when
response_format=json_object is requested. json.loads() then fails with
a parse error, and every evaluation is scored 0/100.

Fixed in both places that parse JSON returned by the LLM:
- runner.py (LLMJudgeProvider.score_run) — benchmark scoring
- asset_generator.py (generate_improvement_evals) — eval generation

The fix extracts the contents of the first code fence before parsing.
Responses that contain no ``` pass through unchanged.

Tested with Anthropic claude-haiku-4-5 as eval judge — confirmed
scoring works correctly after this fix (was 0/100 on all benchmarks
before, now scores match expected behavior).
---
 evalmonkey/evals/asset_generator.py |  7 +++++++
 evalmonkey/evals/runner.py          | 16 ++++++++++++++++
 2 files changed, 23 insertions(+)

diff --git a/evalmonkey/evals/asset_generator.py b/evalmonkey/evals/asset_generator.py
index 0e695ce..acd87c8 100644
--- a/evalmonkey/evals/asset_generator.py
+++ b/evalmonkey/evals/asset_generator.py
@@ -23,6 +23,7 @@
 
 import json
 import os
+import re
 import textwrap
 from dataclasses import dataclass, field
 from datetime import datetime
@@ -139,6 +140,12 @@ def generate_improvement_evals(self, n: int = 5) -> List[dict]:
                 response_format={"type": "json_object"},
             )
             content = response.choices[0].message.content
+            # Strip markdown code fences — some providers (Anthropic)
+            # wrap JSON in ```json ... ``` even with response_format
+            if content and "```" in content:
+                match = re.search(r"```(?:json)?\s*\n?(.*?)\n?\s*```", content, re.DOTALL)
+                if match:
+                    content = match.group(1).strip()
             # LLM sometimes wraps the array in {"evals": [...]}
             parsed = json.loads(content)
             if isinstance(parsed, list):
diff --git a/evalmonkey/evals/runner.py b/evalmonkey/evals/runner.py
index d7abb85..7e21b37 100644
--- a/evalmonkey/evals/runner.py
+++ b/evalmonkey/evals/runner.py
@@ -1,7 +1,22 @@
 import os
 import json
+import re
 from evalmonkey.utils.llm import call_llm
 
+def _strip_code_fences(text: str) -> str:
+    """Return *text* with a wrapping markdown code fence removed.
+
+    Some providers (notably Anthropic via litellm) wrap JSON responses in
+    ```json ... ``` code blocks even when response_format=json_object is
+    requested.  Text that already starts with "{" or "[" is returned as-is
+    so a fence quoted inside a JSON string value cannot corrupt the payload.
+    """
+    if not text or text.lstrip().startswith(("{", "[")):
+        return text
+    match = re.search(r"```(?:json)?\s*\n?(.*?)\n?\s*```", text, re.DOTALL)
+    return match.group(1).strip() if match else text
+
+
 class LLMJudgeProvider:
     """
     LLMJudgeProvider uses litellm to abstract all common backend API LLM providers.
@@ -32,6 +47,7 @@ def score_run(self, rubric: str, agent_output: str) -> dict:
                 response_format={"type": "json_object"}
             )
             content = response.choices[0].message.content
+            content = _strip_code_fences(content)
             return json.loads(content)
         except Exception as e:
             # Fallback if there's a JSON parse error or API issue