diff --git a/evalmonkey/evals/asset_generator.py b/evalmonkey/evals/asset_generator.py index 0e695ce..acd87c8 100644 --- a/evalmonkey/evals/asset_generator.py +++ b/evalmonkey/evals/asset_generator.py @@ -23,6 +23,7 @@ import json import os +import re import textwrap from dataclasses import dataclass, field from datetime import datetime @@ -139,6 +140,12 @@ def generate_improvement_evals(self, n: int = 5) -> List[dict]: response_format={"type": "json_object"}, ) content = response.choices[0].message.content + # Strip markdown code fences — some providers (Anthropic) + # wrap JSON in ```json ... ``` even with response_format + if content and "```" in content: + match = re.search(r"```(?:json)?\s*\n?(.*?)\n?\s*```", content, re.DOTALL) + if match: + content = match.group(1).strip() # LLM sometimes wraps the array in {"evals": [...]} parsed = json.loads(content) if isinstance(parsed, list): diff --git a/evalmonkey/evals/runner.py b/evalmonkey/evals/runner.py index d7abb85..7e21b37 100644 --- a/evalmonkey/evals/runner.py +++ b/evalmonkey/evals/runner.py @@ -1,7 +1,22 @@ import os import json +import re from evalmonkey.utils.llm import call_llm +def _strip_code_fences(text: str) -> str: + """Strip markdown code fences from LLM output. + + Some providers (notably Anthropic via litellm) wrap JSON responses in + ```json ... ``` code blocks even when response_format=json_object is + requested. This causes json.loads() to fail with a parse error. + """ + if text and "```" in text: + match = re.search(r"```(?:json)?\s*\n?(.*?)\n?\s*```", text, re.DOTALL) + if match: + return match.group(1).strip() + return text + + class LLMJudgeProvider: """ LLMJudgeProvider uses litellm to abstract all common backend API LLM providers. @@ -32,6 +47,7 @@ def score_run(self, rubric: str, agent_output: str) -> dict: response_format={"type": "json_object"} ) content = response.choices[0].message.content + content = _strip_code_fences(content) return json.loads(content) except Exception as e: # Fallback if there's a JSON parse error or API issue