From 46be86640a8107e95f86ff5311d06c0dbe807d4b Mon Sep 17 00:00:00 2001
From: Chris <72043878+cwilson613@users.noreply.github.com>
Date: Thu, 30 Apr 2026 21:42:23 -0400
Subject: [PATCH] fix: strip markdown code fences from LLM judge responses
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some providers (notably Anthropic via litellm) wrap JSON responses in
```json ... ``` markdown code blocks even when
response_format=json_object is requested. This causes json.loads() to
fail with a parse error, scoring every evaluation as 0/100.

Fixed in both scoring paths:
- runner.py (LLMJudgeProvider.score_run) — benchmark scoring
- asset_generator.py (generate_improvement_evals) — eval generation

The fix extracts JSON from code fences before parsing. Responses that
are already bare JSON (starting with "{" or "[") pass through
unchanged, even when one of their string values contains backticks.
The payload is taken from the first opening fence to the last closing
fence, so fences quoted inside the JSON do not truncate it.

Tested with Anthropic claude-haiku-4-5 as eval judge — confirmed
scoring works correctly after this fix (was 0/100 on all benchmarks
before, now scores match expected behavior).
---
Reviewer note: line breaks and indentation were reconstructed from a
whitespace-collapsed copy of this patch, and the fence handling was
revised in review. Hunk sizes and the diffstat match the revised
change, but the blob ids on the "index" lines still describe the
previous revision. Apply with `git apply --ignore-whitespace` or
regenerate with `git format-patch` after confirming the indentation.

 evalmonkey/evals/asset_generator.py | 12 ++++++++++++
 evalmonkey/evals/runner.py          | 25 +++++++++++++++++++++++++
 2 files changed, 37 insertions(+)

diff --git a/evalmonkey/evals/asset_generator.py b/evalmonkey/evals/asset_generator.py
index 0e695ce..acd87c8 100644
--- a/evalmonkey/evals/asset_generator.py
+++ b/evalmonkey/evals/asset_generator.py
@@ -23,6 +23,7 @@
 
 import json
 import os
+import re
 import textwrap
 from dataclasses import dataclass, field
 from datetime import datetime
@@ -139,6 +140,17 @@ def generate_improvement_evals(self, n: int = 5) -> List[dict]:
                 response_format={"type": "json_object"},
             )
             content = response.choices[0].message.content
+            # Strip markdown code fences — some providers (Anthropic)
+            # wrap JSON in ```json ... ``` even with response_format.
+            # Bare JSON (starting with "{" or "[") is left untouched so that
+            # backticks inside string values are not mistaken for a fence.
+            if content and not content.lstrip().startswith(("{", "[")):
+                # Greedy capture: first opening fence to LAST closing fence.
+                match = re.search(
+                    r"```(?:json)?\s*(.*)```", content, re.DOTALL | re.IGNORECASE
+                )
+                if match:
+                    content = match.group(1).strip()
             # LLM sometimes wraps the array in {"evals": [...]}
             parsed = json.loads(content)
             if isinstance(parsed, list):
diff --git a/evalmonkey/evals/runner.py b/evalmonkey/evals/runner.py
index d7abb85..7e21b37 100644
--- a/evalmonkey/evals/runner.py
+++ b/evalmonkey/evals/runner.py
@@ -1,7 +1,31 @@
 import os
 import json
+import re
 from evalmonkey.utils.llm import call_llm
 
+# Matches from the first opening fence (optionally tagged "json") through
+# the LAST closing fence, so fences inside JSON strings are kept intact.
+_CODE_FENCE_RE = re.compile(r"```(?:json)?\s*(.*)```", re.DOTALL | re.IGNORECASE)
+
+
+def _strip_code_fences(text: str) -> str:
+    """Return *text* with a wrapping markdown code fence removed.
+
+    Some providers (notably Anthropic via litellm) wrap JSON responses in
+    ```json ... ``` code blocks even when response_format=json_object is
+    requested. This causes json.loads() to fail with a parse error.
+
+    Text that already starts with "{" or "[" is returned unchanged: any
+    backticks in it sit inside JSON string values (e.g. a judge quoting
+    code from the agent output) and must not be mistaken for a fence.
+    Text without a complete fence is also returned unchanged.
+    """
+    if not text or text.lstrip().startswith(("{", "[")):
+        return text
+    match = _CODE_FENCE_RE.search(text)
+    return match.group(1).strip() if match else text
+
+
 class LLMJudgeProvider:
     """
     LLMJudgeProvider uses litellm to abstract all common backend API LLM providers.
@@ -32,6 +56,7 @@ def score_run(self, rubric: str, agent_output: str) -> dict:
                 response_format={"type": "json_object"}
             )
             content = response.choices[0].message.content
+            content = _strip_code_fences(content)
             return json.loads(content)
         except Exception as e:
             # Fallback if there's a JSON parse error or API issue