diff --git a/README.md b/README.md index 965ef8d..382c9c4 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ EvalMonkey natively supports evaluating ANY LLM: **AWS Bedrock**, **Azure**, **G > **Note on API Keys:** If you have special setups that generate long-lived, static API keys for Bedrock, Azure, or GCP, simply supply them in the `.env`! EvalMonkey seamlessly supports both standard IAM / Service Account credential flows *and* long-term stateless authentication strings. ## 🚀 At a Glance -- **8 Agent Frameworks natively supported**: CrewAI, LangChain, OpenAI Agents, Microsoft AutoGen, AWS Bedrock, Ollama, Strands, and custom HTTP endpoints. +- **11 Agent Frameworks natively supported**: CrewAI, LangChain, LlamaIndex, LangGraph, Pydantic AI, OpenAI Agents, Microsoft AutoGen, AWS Bedrock, Ollama, Strands, and custom HTTP endpoints. - **19 Standard Benchmarks out-of-the-box**: GSM8K, BIG-Bench Hard, HotpotQA, ToxiGen, MT-Bench, MBPP, and more — all categorised by the agent type they target. - **23 Chaos Injections ready to run**: 12 client-side payload mutations + 11 server-side middleware injections — all text-based, no GPU or vision dependencies. - **Automatic Eval Asset Generation**: Poor benchmark scores automatically produce `traces.json`, `evals.json`, and `improvement_prompt.md` — one `cat` command away from Claude Code or Cursor. 
@@ -143,12 +143,15 @@ evalmonkey run-benchmark --scenario arc \ | Framework | Notes | |---|---| | 🦜 **LangChain** | Any Chain, LCEL pipe, or AgentExecutor behind FastAPI | +| 🦙 **LlamaIndex** | Any QueryEngine, ChatEngine, or ReActAgent | +| 🕸️ **LangGraph** | Any compiled StateGraph or MessageGraph | +| 🛡️ **Pydantic AI** | Any validated Agent returning structured or text data | | 🤖 **CrewAI** | Any Crew behind a `/chat` or custom endpoint | | ✨ **OpenAI Agents SDK** | Native OpenAI Chat Completions format supported via `--response-path` | | ☁️ **AWS Bedrock / Agent Core** | Any Bedrock endpoint, IAM or long-lived key | | 🧩 **Microsoft AutoGen** | Any ConversableAgent behind HTTP | | 🦙 **Ollama** | Running locally at `http://localhost:11434` | -| 🧵 **Strands SDK** | Built-in sample apps included | +| 🧬 **Strands** | Enterprise support agents and chatbots | | 🌐 **Any HTTP Agent** | Flask, Express.js, Go — if it accepts POST it works |
diff --git a/apps/framework_adapters/langgraph_adapter.py b/apps/framework_adapters/langgraph_adapter.py
new file mode 100644
index 0000000..40ed137
--- /dev/null
+++ b/apps/framework_adapters/langgraph_adapter.py
@@ -0,0 +1,69 @@
+"""
+EvalMonkey Adapter: LangGraph
+====================================
+Wraps any LangGraph state graph in a FastAPI endpoint so EvalMonkey
+can fire benchmark payloads and chaos injections against it.
+
+Install deps:
+    pip install langgraph langchain-openai fastapi uvicorn
+
+Usage:
+    python langgraph_adapter.py
+    evalmonkey run-benchmark --scenario mmlu --target-url http://localhost:8012/solve
+"""
+import os
+import uvicorn
+from fastapi import FastAPI, Request
+from typing import Annotated
+from typing_extensions import TypedDict
+from langgraph.graph import StateGraph, START, END
+from langgraph.graph.message import add_messages
+from langchain_openai import ChatOpenAI
+
+app = FastAPI(title="EvalMonkey LangGraph Adapter")
+
+# ---------------------------------------------------
+# Build your LangGraph here
+# ---------------------------------------------------
+class State(TypedDict):
+    """Graph state: a message list merged via ``add_messages``."""
+    messages: Annotated[list, add_messages]
+
+llm = ChatOpenAI(model=os.getenv("EVAL_MODEL", "gpt-4o"), temperature=0)
+
+def chatbot(state: State):
+    """Graph node: call the LLM on the current messages and return its reply."""
+    return {"messages": [llm.invoke(state["messages"])]}
+
+graph_builder = StateGraph(State)
+graph_builder.add_node("chatbot", chatbot)
+graph_builder.add_edge(START, "chatbot")
+graph_builder.add_edge("chatbot", END)
+graph = graph_builder.compile()
+
+@app.post("/solve")
+async def solve(request: Request):
+    """Run the posted question through the compiled graph.
+
+    Reads ``question`` (falling back to ``prompt``) from the JSON body and
+    returns ``{"status": "success", "data": ...}`` with the content of the
+    last message, or ``{"status": "error", "error_message": ...}`` if the
+    graph raises.
+    """
+    payload = await request.json()
+    question = payload.get("question", payload.get("prompt", ""))
+
+    try:
+        # Pass the question to your compiled LangGraph. Use the async entry point:
+        # the synchronous graph.invoke() would block the event loop while the LLM runs.
+        initial_state = {"messages": [("user", question)]}
+        result = await graph.ainvoke(initial_state)
+
+        # Extract the final AI message
+        final_answer = result["messages"][-1].content
+        return {"status": "success", "data": final_answer}
+    except Exception as e:
+        return {"status": "error", "error_message": str(e)}
+
+if __name__ == "__main__":
+    uvicorn.run(app, host="127.0.0.1", port=8012)
diff --git a/apps/framework_adapters/llamaindex_adapter.py b/apps/framework_adapters/llamaindex_adapter.py
new file mode 100644
index 0000000..8be139a
--- /dev/null
+++ b/apps/framework_adapters/llamaindex_adapter.py
@@ -0,0 +1,54 @@
+"""
+EvalMonkey Adapter: LlamaIndex Agent
+====================================
+Wraps any LlamaIndex query engine or agent in a FastAPI endpoint so EvalMonkey
+can fire benchmark payloads and chaos injections against it.
+
+Install deps:
+    pip install llama-index fastapi uvicorn
+
+Usage:
+    python llamaindex_adapter.py
+    evalmonkey run-benchmark --scenario mmlu --target-url http://localhost:8011/solve
+"""
+import os
+import uvicorn
+from fastapi import FastAPI, Request
+from llama_index.core import VectorStoreIndex, Document
+from llama_index.core.agent import ReActAgent
+from llama_index.llms.openai import OpenAI
+
+app = FastAPI(title="EvalMonkey LlamaIndex Adapter")
+
+# ---------------------------------------------------
+# Build your LlamaIndex agent or query engine here
+# ---------------------------------------------------
+llm = OpenAI(model=os.getenv("EVAL_MODEL", "gpt-4o"), temperature=0)
+
+# Dummy agent setup for demonstration
+documents = [Document(text="EvalMonkey is a great benchmarking framework for testing AI agents.")]
+index = VectorStoreIndex.from_documents(documents)
+query_engine = index.as_query_engine(llm=llm)
+
+@app.post("/solve")
+async def solve(request: Request):
+    """Answer the posted question with the query engine.
+
+    Reads ``question`` (falling back to ``prompt``) from the JSON body and
+    returns ``{"status": "success", "data": ...}`` with the stringified
+    response, or ``{"status": "error", "error_message": ...}`` if the
+    query raises.
+    """
+    payload = await request.json()
+    question = payload.get("question", payload.get("prompt", ""))
+
+    try:
+        # Pass the question to your query engine or agent. Use the async variant:
+        # the synchronous query() would block the event loop while the LLM runs.
+        response = await query_engine.aquery(question)
+        return {"status": "success", "data": str(response)}
+    except Exception as e:
+        return {"status": "error", "error_message": str(e)}
+
+if __name__ == "__main__":
+    uvicorn.run(app, host="127.0.0.1", port=8011)
diff --git a/apps/framework_adapters/pydantic_ai_adapter.py b/apps/framework_adapters/pydantic_ai_adapter.py
new file mode 100644
index 0000000..b2d44ce
--- /dev/null
+++ b/apps/framework_adapters/pydantic_ai_adapter.py
@@ -0,0 +1,56 @@
+"""
+EvalMonkey Adapter: Pydantic AI
+====================================
+Wraps a Pydantic AI Agent in a FastAPI endpoint so EvalMonkey
+can fire benchmark payloads and chaos injections against it.
+
+Install deps:
+    pip install pydantic-ai fastapi uvicorn
+
+Usage:
+    python pydantic_ai_adapter.py
+    evalmonkey run-benchmark --scenario mmlu --target-url http://localhost:8013/solve
+"""
+import os
+import uvicorn
+from fastapi import FastAPI, Request
+from pydantic_ai import Agent
+
+app = FastAPI(title="EvalMonkey Pydantic AI Adapter")
+
+# ---------------------------------------------------
+# Build your Pydantic AI Agent here
+# ---------------------------------------------------
+# You can use 'openai:gpt-4o' or other providers supported by Pydantic AI
+model_name = os.getenv("EVAL_MODEL", "openai:gpt-4o")
+agent = Agent(
+    model_name,
+    system_prompt="You are a helpful AI assistant. Answer the user's questions clearly and concisely."
+)
+
+@app.post("/solve")
+async def solve(request: Request):
+    """Run the posted question through the Pydantic AI agent.
+
+    Reads ``question`` (falling back to ``prompt``) from the JSON body and
+    returns ``{"status": "success", "data": ...}`` with the agent's result,
+    or ``{"status": "error", "error_message": ...}`` if the agent run
+    raises.
+    """
+    payload = await request.json()
+    question = payload.get("question", payload.get("prompt", ""))
+
+    try:
+        # Await the async entry point: Agent.run_sync() drives its own event
+        # loop and cannot be called from inside this already-running one.
+        result = await agent.run(question)
+
+        # Newer Pydantic AI releases expose the result as `output`; older
+        # ones used `data`, so fall back to it when `output` is absent.
+        answer = result.output if hasattr(result, "output") else result.data
+        return {"status": "success", "data": answer}
+    except Exception as e:
+        return {"status": "error", "error_message": str(e)}
+
+if __name__ == "__main__":
+    uvicorn.run(app, host="127.0.0.1", port=8013)