7 changes: 5 additions & 2 deletions README.md
@@ -27,7 +27,7 @@ EvalMonkey natively supports evaluating ANY LLM: **AWS Bedrock**, **Azure**, **G
> **Note on API Keys:** If your setup issues long-lived, static API keys for Bedrock, Azure, or GCP, simply supply them in the `.env`! EvalMonkey supports both the standard IAM / Service Account credential flows *and* long-lived static keys.
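
For example, a `.env` might look like the sketch below. `EVAL_MODEL` is the variable the bundled adapters read; the provider variables follow each SDK's standard names, and all values are placeholders:

```dotenv
# Model used by the sample adapters
EVAL_MODEL=gpt-4o

# Standard provider credentials (long-lived static keys work too)
OPENAI_API_KEY=sk-...
AWS_ACCESS_KEY_ID=AKIA...
AWS_SECRET_ACCESS_KEY=...
AZURE_OPENAI_API_KEY=...
AZURE_OPENAI_ENDPOINT=https://<your-resource>.openai.azure.com
GOOGLE_APPLICATION_CREDENTIALS=/path/to/service-account.json
```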

## 🚀 At a Glance
- **8 Agent Frameworks natively supported**: CrewAI, LangChain, OpenAI Agents, Microsoft AutoGen, AWS Bedrock, Ollama, Strands, and custom HTTP endpoints.
- **11 Agent Frameworks natively supported**: CrewAI, LangChain, LlamaIndex, LangGraph, Pydantic AI, OpenAI Agents, Microsoft AutoGen, AWS Bedrock, Ollama, Strands, and custom HTTP endpoints.
- **19 Standard Benchmarks out-of-the-box**: GSM8K, BIG-Bench Hard, HotpotQA, ToxiGen, MT-Bench, MBPP, and more — all categorised by the agent type they target.
- **23 Chaos Injections ready to run**: 12 client-side payload mutations + 11 server-side middleware injections — all text-based, no GPU or vision dependencies. A sketch of one such mutation follows this list.
- **Automatic Eval Asset Generation**: Poor benchmark scores automatically produce `traces.json`, `evals.json`, and `improvement_prompt.md` — one `cat` command away from Claude Code or Cursor.
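
To make "text-based payload mutation" concrete, here is a hypothetical client-side mutation in the same spirit (not EvalMonkey's actual implementation, just an illustration of the idea):

```python
import random

def mutate_payload(text: str, rate: float = 0.05, seed=None) -> str:
    """Randomly swap adjacent characters to simulate noisy input."""
    rng = random.Random(seed)
    chars = list(text)
    for i in range(len(chars) - 1):
        if rng.random() < rate:
            chars[i], chars[i + 1] = chars[i + 1], chars[i]
    return "".join(chars)

print(mutate_payload("What is the capital of France?", rate=0.1, seed=42))
```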
@@ -143,12 +143,15 @@ evalmonkey run-benchmark --scenario arc \
| Framework | Notes |
|---|---|
| 🦜 **LangChain** | Any Chain, LCEL pipe, or AgentExecutor behind FastAPI |
| 🦙 **LlamaIndex** | Any QueryEngine, ChatEngine, or ReActAgent |
| 🕸️ **LangGraph** | Any compiled StateGraph or MessageGraph |
| 🛡️ **Pydantic AI** | Any validated Agent returning structured or text data |
| 🤖 **CrewAI** | Any Crew behind a `/chat` or custom endpoint |
| ✨ **OpenAI Agents SDK** | Native OpenAI Chat Completions format supported via `--response-path` |
| ☁️ **AWS Bedrock / Agent Core** | Any Bedrock endpoint, IAM or long-lived key |
| 🧩 **Microsoft AutoGen** | Any ConversableAgent behind HTTP |
| 🦙 **Ollama** | Running locally at `http://localhost:11434` |
| 🧵 **Strands SDK** | Built-in sample apps included |
| 🧬 **Strands** | Enterprise support agents and chatbots |
| 🌐 **Any HTTP Agent** | Flask, Express.js, Go — if it accepts POST it works |
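
The last row is the escape hatch: anything that accepts a POST works. A minimal sketch of a hypothetical custom agent in Flask, mirroring the `question`/`prompt` payload and `status`/`data` response shape used by the bundled adapters:

```python
# Minimal sketch of a custom HTTP agent (hypothetical; any POST endpoint works).
from flask import Flask, jsonify, request

app = Flask(__name__)

@app.post("/solve")
def solve():
    payload = request.get_json(force=True)
    question = payload.get("question", payload.get("prompt", ""))
    # Call your own model / pipeline here; we just echo for illustration.
    answer = f"You asked: {question}"
    return jsonify({"status": "success", "data": answer})

if __name__ == "__main__":
    app.run(port=8080)
```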

<details>
59 changes: 59 additions & 0 deletions apps/framework_adapters/langgraph_adapter.py
@@ -0,0 +1,59 @@
"""
EvalMonkey Adapter: LangGraph
====================================
Wraps any LangGraph state graph in a FastAPI endpoint so EvalMonkey
can fire benchmark payloads and chaos injections against it.

Install deps:
pip install langgraph langchain-openai fastapi uvicorn

Usage:
python langgraph_adapter.py
evalmonkey run-benchmark --scenario mmlu --target-url http://localhost:8012/solve
"""
import os
import uvicorn
from fastapi import FastAPI, Request
from typing import Annotated
from typing_extensions import TypedDict
from langgraph.graph import StateGraph, START, END
from langgraph.graph.message import add_messages
from langchain_openai import ChatOpenAI

app = FastAPI(title="EvalMonkey LangGraph Adapter")

# ---------------------------------------------------
# Build your LangGraph here
# ---------------------------------------------------
class State(TypedDict):
    messages: Annotated[list, add_messages]

llm = ChatOpenAI(model=os.getenv("EVAL_MODEL", "gpt-4o"), temperature=0)

def chatbot(state: State):
    return {"messages": [llm.invoke(state["messages"])]}

graph_builder = StateGraph(State)
graph_builder.add_node("chatbot", chatbot)
graph_builder.add_edge(START, "chatbot")
graph_builder.add_edge("chatbot", END)
graph = graph_builder.compile()

@app.post("/solve")
async def solve(request: Request):
    payload = await request.json()
    question = payload.get("question", payload.get("prompt", ""))

    try:
        # Pass the question to your compiled LangGraph
        initial_state = {"messages": [("user", question)]}
        result = graph.invoke(initial_state)

        # Extract the final AI message
        final_answer = result["messages"][-1].content
        return {"status": "success", "data": final_answer}
    except Exception as e:
        return {"status": "error", "error_message": str(e)}

if __name__ == "__main__":
    uvicorn.run(app, host="127.0.0.1", port=8012)
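
With the adapter running, a quick smoke test (request and response shapes follow the handler above; the exact answer depends on the model):

```bash
curl -s -X POST http://localhost:8012/solve \
  -H "Content-Type: application/json" \
  -d '{"question": "What is 2 + 2?"}'
# => {"status":"success","data":"4"}  (wording varies by model)
```

The LlamaIndex and Pydantic AI adapters below expose the same contract on ports 8011 and 8013.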
46 changes: 46 additions & 0 deletions apps/framework_adapters/llamaindex_adapter.py
@@ -0,0 +1,46 @@
"""
EvalMonkey Adapter: LlamaIndex Agent
====================================
Wraps any LlamaIndex query engine or agent in a FastAPI endpoint so EvalMonkey
can fire benchmark payloads and chaos injections against it.

Install deps:
pip install llama-index fastapi uvicorn

Usage:
python llamaindex_adapter.py
evalmonkey run-benchmark --scenario mmlu --target-url http://localhost:8011/solve
"""
import os
import uvicorn
from fastapi import FastAPI, Request
from llama_index.core import VectorStoreIndex, Document
from llama_index.core.agent import ReActAgent  # optional: swap the query engine for a tool-using agent
from llama_index.llms.openai import OpenAI

app = FastAPI(title="EvalMonkey LlamaIndex Adapter")

# ---------------------------------------------------
# Build your LlamaIndex agent or query engine here
# ---------------------------------------------------
llm = OpenAI(model=os.getenv("EVAL_MODEL", "gpt-4o"), temperature=0)

# Dummy agent setup for demonstration
documents = [Document(text="EvalMonkey is a great benchmarking framework for testing AI agents.")]
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine(llm=llm)

@app.post("/solve")
async def solve(request: Request):
    payload = await request.json()
    question = payload.get("question", payload.get("prompt", ""))

    try:
        # Pass the question to your query engine or agent
        response = query_engine.query(question)
        return {"status": "success", "data": str(response)}
    except Exception as e:
        return {"status": "error", "error_message": str(e)}

if __name__ == "__main__":
    uvicorn.run(app, host="127.0.0.1", port=8011)
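
The imported `ReActAgent` hints at the alternative: swap the query engine for a tool-using agent. A sketch against the classic LlamaIndex agent API (verify the exact calls against your installed llama-index version):

```python
# Hypothetical: wrap the query engine as a tool and hand it to a ReAct agent.
from llama_index.core.tools import QueryEngineTool

tool = QueryEngineTool.from_defaults(
    query_engine=query_engine,
    name="docs",
    description="Answers questions about EvalMonkey.",
)
agent = ReActAgent.from_tools([tool], llm=llm)
# Then in the handler: response = agent.chat(question)
```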
46 changes: 46 additions & 0 deletions apps/framework_adapters/pydantic_ai_adapter.py
@@ -0,0 +1,46 @@
"""
EvalMonkey Adapter: Pydantic AI
====================================
Wraps a Pydantic AI Agent in a FastAPI endpoint so EvalMonkey
can fire benchmark payloads and chaos injections against it.

Install deps:
pip install pydantic-ai fastapi uvicorn

Usage:
python pydantic_ai_adapter.py
evalmonkey run-benchmark --scenario mmlu --target-url http://localhost:8013/solve
"""
import os
import uvicorn
from fastapi import FastAPI, Request
from pydantic_ai import Agent

app = FastAPI(title="EvalMonkey Pydantic AI Adapter")

# ---------------------------------------------------
# Build your Pydantic AI Agent here
# ---------------------------------------------------
# You can use 'openai:gpt-4o' or other providers supported by Pydantic AI
model_name = os.getenv("EVAL_MODEL", "openai:gpt-4o")
agent = Agent(
    model_name,
    system_prompt="You are a helpful AI assistant. Answer the user's questions clearly and concisely."
)

@app.post("/solve")
async def solve(request: Request):
    payload = await request.json()
    question = payload.get("question", payload.get("prompt", ""))

    try:
        # Pass the question to your Pydantic AI agent
        result = agent.run_sync(question)

        # result.data contains the validated response text
        # (newer pydantic-ai releases expose it as result.output)
        return {"status": "success", "data": result.data}
    except Exception as e:
        return {"status": "error", "error_message": str(e)}

if __name__ == "__main__":
    uvicorn.run(app, host="127.0.0.1", port=8013)