From 1ef5d880341563d2f348284dfbe33637097c5aae Mon Sep 17 00:00:00 2001 From: himmi-01 Date: Tue, 5 May 2026 22:45:57 -0700 Subject: [PATCH] feat: evalmonkey web ui and benchmark stability fixes - Added Next.js & FastAPI Web UI for live benchmarking - Fixed macOS torch shared memory permission crash - Improved HuggingFace datasets loading logic (trust_remote_code=True) - Fixed Hellaswag and MMLU strict LLM judge options mapping - Updated UI to auto-detect LLM judge from environment --- .gitignore | 14 +- README.md | 81 +++-- evalmonkey/scenarios/standard_benchmarks.py | 32 +- ui/README.md | 57 ++++ ui/backend/__init__.py | 1 + ui/backend/db.py | 176 ++++++++++ ui/backend/main.py | 245 ++++++++++++++ ui/backend/run_engine.py | 188 +++++++++++ ui/backend/schemas.py | 75 +++++ ui/frontend/app/globals.css | 200 +++++++++++ ui/frontend/app/history/page.tsx | 194 +++++++++++ ui/frontend/app/layout.tsx | 21 ++ ui/frontend/app/page.tsx | 142 ++++++++ ui/frontend/app/run/[id]/page.tsx | 258 ++++++++++++++ ui/frontend/app/run/new/page.tsx | 354 ++++++++++++++++++++ ui/frontend/components/CategoryBadge.tsx | 25 ++ ui/frontend/components/RunCard.tsx | 78 +++++ ui/frontend/components/ScoreRing.tsx | 59 ++++ ui/frontend/components/Sidebar.tsx | 57 ++++ ui/frontend/next.config.js | 13 + ui/frontend/package.json | 27 ++ ui/frontend/postcss.config.js | 6 + ui/frontend/tailwind.config.js | 44 +++ ui/frontend/tsconfig.json | 21 ++ ui/requirements-ui.txt | 2 + 25 files changed, 2341 insertions(+), 29 deletions(-) create mode 100644 ui/README.md create mode 100644 ui/backend/__init__.py create mode 100644 ui/backend/db.py create mode 100644 ui/backend/main.py create mode 100644 ui/backend/run_engine.py create mode 100644 ui/backend/schemas.py create mode 100644 ui/frontend/app/globals.css create mode 100644 ui/frontend/app/history/page.tsx create mode 100644 ui/frontend/app/layout.tsx create mode 100644 ui/frontend/app/page.tsx create mode 100644 ui/frontend/app/run/[id]/page.tsx create mode 100644 ui/frontend/app/run/new/page.tsx create mode 100644 ui/frontend/components/CategoryBadge.tsx create mode 100644 ui/frontend/components/RunCard.tsx create mode 100644 ui/frontend/components/ScoreRing.tsx create mode 100644 ui/frontend/components/Sidebar.tsx create mode 100644 ui/frontend/next.config.js create mode 100644 ui/frontend/package.json create mode 100644 ui/frontend/postcss.config.js create mode 100644 ui/frontend/tailwind.config.js create mode 100644 ui/frontend/tsconfig.json create mode 100644 ui/requirements-ui.txt diff --git a/.gitignore b/.gitignore index ffb25e4..039ad6c 100644 --- a/.gitignore +++ b/.gitignore @@ -41,4 +41,16 @@ env/ history.json *.log demo_run.sh -output/ \ No newline at end of file +output/ + +# UI โ€” Node / Next.js +ui/frontend/node_modules/ +ui/frontend/.next/ +ui/frontend/.swc/ +ui/frontend/next-env.d.ts +ui/frontend/tsconfig.tsbuildinfo +ui/frontend/package-lock.json +ui/frontend/.env.local + +# UI โ€” SQLite database (local data only) +*.db \ No newline at end of file diff --git a/README.md b/README.md index 09d0bd3..965ef8d 100644 --- a/README.md +++ b/README.md @@ -28,51 +28,84 @@ EvalMonkey natively supports evaluating ANY LLM: **AWS Bedrock**, **Azure**, **G ## ๐Ÿš€ At a Glance - **8 Agent Frameworks natively supported**: CrewAI, LangChain, OpenAI Agents, Microsoft AutoGen, AWS Bedrock, Ollama, Strands, and custom HTTP endpoints. 
-- **20 Standard Benchmarks out-of-the-box**: GSM8K, BIG-Bench Hard, HotpotQA, ToxiGen, MT-Bench, MBPP, and more โ€” all categorised by the agent type they target. +- **19 Standard Benchmarks out-of-the-box**: GSM8K, BIG-Bench Hard, HotpotQA, ToxiGen, MT-Bench, MBPP, and more โ€” all categorised by the agent type they target. - **23 Chaos Injections ready to run**: 12 client-side payload mutations + 11 server-side middleware injections โ€” all text-based, no GPU or vision dependencies. - **Automatic Eval Asset Generation**: Poor benchmark scores automatically produce `traces.json`, `evals.json`, and `improvement_prompt.md` โ€” one `cat` command away from Claude Code or Cursor. +--- + ## โšก๏ธ Quick Start +### Option A โ€” Let Claude Code or Cursor set it up for you (30 seconds) + +Open Claude Code, Cursor, or any AI coding assistant and paste this prompt: + +``` +Set up EvalMonkey in my project so I can benchmark my AI agent. + +1. Clone https://github.com/Corbell-AI/evalmonkey into a sibling folder +2. Run: pip install -e . inside that folder +3. Copy .env.example to .env and ask me which LLM provider I want to use as the benchmark judge (OpenAI, Anthropic, Bedrock, or Ollama) โ€” then fill in the correct key +4. Run: evalmonkey init --framework --name "My Agent" --port + Use the framework my agent is built with (crewai / langchain / openai / bedrock / autogen / ollama / strands / custom) +5. Show me the generated evalmonkey.yaml and ask me to confirm the agent URL and response path are correct +6. Run a quick smoke test: evalmonkey run-benchmark --scenario gsm8k --sample-agent rag_app --limit 2 + to confirm everything is wired up correctly +7. Then run the real benchmark against my agent: evalmonkey run-benchmark --scenario mmlu --limit 5 +8. Show me the score and explain what it means +``` + +> The agent will handle cloning, installing, configuring your `.env`, and running the first benchmark โ€” all without you typing a single command. + +--- + +### Option B โ€” Manual Setup (5 minutes) + +**1. Install** ```bash git clone https://github.com/Corbell-AI/evalmonkey cd evalmonkey pip install -e . ``` -**Step 1 โ€” Run this once inside your agent's project folder:** +**2. Configure your LLM key** (used only as the evaluation judge โ€” never for your agent) ```bash -cd /your/crewai-project # wherever your agent lives -evalmonkey init --framework crewai --name "My Research Crew" --port 8000 +cp .env.example .env ``` -This auto-generates a pre-filled `evalmonkey.yaml` with the correct request/response format for your framework. Supported: `crewai`, `langchain`, `openai`, `bedrock`, `autogen`, `ollama`, `strands`, `custom`. +Open `.env` and set **one** of these depending on your LLM provider: +```bash +EVAL_MODEL=gpt-4o +OPENAI_API_KEY=sk-... # OpenAI + +# โ€” OR โ€” +EVAL_MODEL=anthropic/claude-haiku-4-5 +ANTHROPIC_API_KEY=sk-ant-... # Anthropic -**Step 2 โ€” Edit the two settings that matter:** -```yaml -# evalmonkey.yaml โ€” generated for CrewAI -agent: - name: "My Research Crew" - framework: crewai - url: http://localhost:8000/chat # โ† where your agent listens - request_key: message - response_path: reply +# โ€” OR โ€” +EVAL_MODEL=bedrock/anthropic.claude-3-haiku-20240307-v1:0 +AWS_ACCESS_KEY_ID=... # AWS Bedrock - # โ† EvalMonkey will start this for you automatically! - # It spawns the process, waits for it to turn on, benchmarks, then stops it. 
- agent_command: "python src/agent.py" # or: uvicorn src.agent:app --port 8000 - agent_startup_wait: 3 # seconds to wait after launch +# โ€” OR โ€” (no key needed) +EVAL_MODEL=ollama/llama3 # Local Ollama +``` -eval_model: "gpt-4o" # โ† the LLM used as benchmark judge +**3. Smoke test with the built-in sample agent** (no agent of your own needed yet) +```bash +evalmonkey run-benchmark --scenario gsm8k --sample-agent rag_app --limit 3 ``` +You should see 3 samples run and a score printed. โœ… -**Step 3 โ€” Run everything. EvalMonkey starts your agent, benchmarks it, then stops it:** +**4. Point it at your own agent** ```bash -evalmonkey run-benchmark --scenario mmlu -evalmonkey run-chaos --scenario mmlu --chaos-profile client_prompt_injection -evalmonkey history --scenario mmlu +cd /path/to/your/agent/project +evalmonkey init --framework crewai --name "My Agent" --port 8000 +# Edit the generated evalmonkey.yaml to set your agent's URL and response format +evalmonkey run-benchmark --scenario mmlu --limit 5 ``` -> EvalMonkey discovers `evalmonkey.yaml` from the **current working directory** โ€” the same convention used by `pytest`, `promptfoo`, and `docker-compose`. Run all commands from your agent's project folder. +> `evalmonkey.yaml` is discovered from the **current working directory** โ€” same convention as `pytest` and `docker-compose`. + +--- ## ๐Ÿค Works With Any Agent โ€” No Code Changes Required diff --git a/evalmonkey/scenarios/standard_benchmarks.py b/evalmonkey/scenarios/standard_benchmarks.py index c40e4be..ce1296c 100644 --- a/evalmonkey/scenarios/standard_benchmarks.py +++ b/evalmonkey/scenarios/standard_benchmarks.py @@ -101,7 +101,17 @@ def load_standard_benchmark(benchmark_name: str, limit: int = 5) -> List[EvalSce Automatically downloads datasets and converts them to standard HTTP scenarios! """ try: - from datasets import load_dataset + import os + # Prevent PyTorch shared-memory multiprocessing on Mac. + # Even with streaming=True, HuggingFace datasets can invoke torch_shm_manager + # for internal caching โ€” which fails on Mac with "Permission denied". + os.environ.setdefault("TOKENIZERS_PARALLELISM", "false") + os.environ.setdefault("OMP_NUM_THREADS", "1") + os.environ.setdefault("HF_DATASETS_OFFLINE", "0") + + from datasets import load_dataset, disable_progress_bar, disable_caching + disable_progress_bar() + disable_caching() # prevents torch_shm from being invoked for cache writes except ImportError: raise ImportError("The 'datasets' library is required to run standard benchmarks. 
Please run 'pip install datasets'.") @@ -132,7 +142,7 @@ def load_standard_benchmark(benchmark_name: str, limit: int = 5) -> List[EvalSce elif benchmark_name.lower() == "xlam": # A standard function calling benchmark try: - dataset = load_dataset("Salesforce/xlam-function-calling-60k", split="train", streaming=True) + dataset = load_dataset("Salesforce/xlam-function-calling-60k", split="train", streaming=True, trust_remote_code=True) for idx, item in enumerate(dataset): if idx >= limit: break @@ -172,20 +182,34 @@ def load_standard_benchmark(benchmark_name: str, limit: int = 5) -> List[EvalSce path, name, split, q_col, a_col = hf_map[benchmark_name.lower()] desc = SUPPORTED_BENCHMARKS[benchmark_name.lower()]["description"] print(f"Loading {benchmark_name} from HuggingFace Datasets ({path})...") - dataset = load_dataset(path, name, split=split, streaming=True) if name else load_dataset(path, split=split, streaming=True) + dataset = load_dataset(path, name, split=split, streaming=True, trust_remote_code=True) if name else load_dataset(path, split=split, streaming=True, trust_remote_code=True) for idx, item in enumerate(dataset): if idx >= limit: break question_text = str(item.get(q_col, "No question")) + expected_answer = str(item.get(a_col, 'Unknown')) + if benchmark_name.lower() == "mmlu" and "choices" in item: question_text += f"\nChoices: {item['choices']}" + try: + ans_idx = int(expected_answer) + expected_answer = f"Option {ans_idx}: {item['choices'][ans_idx]}" + except (ValueError, IndexError): + pass + elif benchmark_name.lower() == "hella-swag" and "endings" in item: + question_text += f"\nOptions:\n0: {item['endings'][0]}\n1: {item['endings'][1]}\n2: {item['endings'][2]}\n3: {item['endings'][3]}" + try: + ans_idx = int(expected_answer) + expected_answer = f"Option {ans_idx}: {item['endings'][ans_idx]}" + except (ValueError, IndexError): + pass scenarios.append(EvalScenario( id=f"{benchmark_name}_{idx}", description=desc, input_payload={"question": question_text}, - expected_behavior_rubric=f"Agent MUST deduce or output this answer: {item.get(a_col, 'Unknown')}" + expected_behavior_rubric=f"Agent MUST deduce or output this answer: {expected_answer}" )) else: print(f"Dataset mappings for {benchmark_name} are currently stubbed.") diff --git a/ui/README.md b/ui/README.md new file mode 100644 index 0000000..3fec255 --- /dev/null +++ b/ui/README.md @@ -0,0 +1,57 @@ + +# EvalMonkey UI + +A professional web interface for running benchmarks, chaos tests, and tracking agent reliability over time. + +## Quick Start + +**Terminal 1 โ€” Backend (FastAPI)** +```bash +cd +cp .env.example .env # add EVAL_MODEL + your LLM API key +uvicorn ui.backend.main:app --reload --port 8080 +``` + +**Terminal 2 โ€” Frontend (Next.js)** +```bash +cd /ui/frontend +npm run dev +``` + +Open **http://localhost:3000** in your browser. + +--- + +## Features + +| Page | Description | +|---|---| +| **Dashboard** | Production Reliability hero, live runs, recent results grid | +| **New Run** | 3-step wizard: agent URL โ†’ benchmark โ†’ configure & launch | +| **Live Run** | SSE-streamed real-time sample results with score rings | +| **History** | Recharts trend lines, reliability per scenario, all-runs table | + +## Architecture + +``` +FastAPI backend โ†’ SQLite (~/.evalmonkey/ui.db) + โ†• REST + SSE +Next.js frontend โ†’ http://localhost:3000 +``` + +The `StorageBackend` ABC in `ui/backend/db.py` makes the storage layer swappable โ€” replace `SQLiteBackend` with `PostgresBackend` in a single line. 
+ +## Extending Storage +```python +# In ui/backend/db.py โ€” implement this ABC: +class MyBackend(StorageBackend): + def save_run(self, run: RunRecord) -> None: ... + # ... 5 other methods + +# Then in your app startup: +from ui.backend.db import set_backend +set_backend(MyBackend()) +``` + +## CLI โ€” No Impact +The existing `evalmonkey` CLI continues to work exactly as before. The UI is a completely additive layer โ€” it imports from the same `evalmonkey.*` packages but adds no changes to them. diff --git a/ui/backend/__init__.py b/ui/backend/__init__.py new file mode 100644 index 0000000..f907ee5 --- /dev/null +++ b/ui/backend/__init__.py @@ -0,0 +1 @@ +# EvalMonkey UI Backend diff --git a/ui/backend/db.py b/ui/backend/db.py new file mode 100644 index 0000000..68dc816 --- /dev/null +++ b/ui/backend/db.py @@ -0,0 +1,176 @@ +"""Abstract storage backend + SQLite implementation for EvalMonkey UI. + +Swap to a different backend (Postgres, Redis, etc.) by implementing +StorageBackend and passing your instance to set_backend(). +""" +from __future__ import annotations + +import json +import sqlite3 +from abc import ABC, abstractmethod +from contextlib import contextmanager +from datetime import datetime +from pathlib import Path +from typing import List, Optional + +from .schemas import RunRecord, SampleResult + +DB_PATH = Path.home() / ".evalmonkey" / "ui.db" + + +class StorageBackend(ABC): + """Abstract interface โ€” implement this to swap storage layers.""" + + @abstractmethod + def save_run(self, run: RunRecord) -> None: ... + + @abstractmethod + def update_run(self, run_id: str, **kwargs) -> None: ... + + @abstractmethod + def get_run(self, run_id: str) -> Optional[RunRecord]: ... + + @abstractmethod + def get_all_runs(self, limit: int = 100) -> List[RunRecord]: ... + + @abstractmethod + def save_sample(self, sample: SampleResult) -> None: ... + + @abstractmethod + def get_samples(self, run_id: str) -> List[SampleResult]: ... + + +class SQLiteBackend(StorageBackend): + """SQLite-backed storage. 
Data lives at ~/.evalmonkey/ui.db โ€” zero setup.""" + + def __init__(self, db_path: Path = DB_PATH) -> None: + self.db_path = db_path + self.db_path.parent.mkdir(parents=True, exist_ok=True) + self._init_schema() + + @contextmanager + def _conn(self): + conn = sqlite3.connect(str(self.db_path), check_same_thread=False) + conn.row_factory = sqlite3.Row + try: + yield conn + conn.commit() + finally: + conn.close() + + def _init_schema(self) -> None: + with self._conn() as conn: + conn.executescript(""" + CREATE TABLE IF NOT EXISTS runs ( + id TEXT PRIMARY KEY, + scenario TEXT NOT NULL, + run_type TEXT NOT NULL, + status TEXT NOT NULL DEFAULT 'running', + target_url TEXT NOT NULL, + eval_model TEXT NOT NULL, + request_key TEXT NOT NULL DEFAULT 'question', + response_path TEXT NOT NULL DEFAULT 'data', + chaos_profile TEXT, + score INTEGER, + sample_count INTEGER DEFAULT 0, + completed_samples INTEGER DEFAULT 0, + created_at TEXT NOT NULL, + completed_at TEXT, + details TEXT DEFAULT '{}' + ); + CREATE TABLE IF NOT EXISTS sample_results ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + run_id TEXT NOT NULL, + sample_index INTEGER NOT NULL, + eval_id TEXT NOT NULL, + question TEXT NOT NULL, + agent_output TEXT, + expected_rubric TEXT, + score INTEGER, + reasoning TEXT, + chaos_profile TEXT, + created_at TEXT NOT NULL, + FOREIGN KEY (run_id) REFERENCES runs(id) + ); + """) + + def save_run(self, run: RunRecord) -> None: + with self._conn() as conn: + conn.execute( + """INSERT INTO runs + (id, scenario, run_type, status, target_url, eval_model, + request_key, response_path, chaos_profile, score, sample_count, + completed_samples, created_at, completed_at, details) + VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)""", + ( + run.id, run.scenario, run.run_type, run.status, run.target_url, + run.eval_model, run.request_key, run.response_path, run.chaos_profile, + run.score, run.sample_count, run.completed_samples, + run.created_at, run.completed_at, json.dumps(run.details), + ), + ) + + def update_run(self, run_id: str, **kwargs) -> None: + if not kwargs: + return + if "details" in kwargs: + kwargs["details"] = json.dumps(kwargs["details"]) + sets = ", ".join(f"{k} = ?" for k in kwargs) + values = list(kwargs.values()) + [run_id] + with self._conn() as conn: + conn.execute(f"UPDATE runs SET {sets} WHERE id = ?", values) + + def get_run(self, run_id: str) -> Optional[RunRecord]: + with self._conn() as conn: + row = conn.execute("SELECT * FROM runs WHERE id = ?", (run_id,)).fetchone() + return self._row_to_run(dict(row)) if row else None + + def get_all_runs(self, limit: int = 100) -> List[RunRecord]: + with self._conn() as conn: + rows = conn.execute( + "SELECT * FROM runs ORDER BY created_at DESC LIMIT ?", (limit,) + ).fetchall() + return [self._row_to_run(dict(r)) for r in rows] + + def save_sample(self, sample: SampleResult) -> None: + with self._conn() as conn: + conn.execute( + """INSERT INTO sample_results + (run_id, sample_index, eval_id, question, agent_output, + expected_rubric, score, reasoning, chaos_profile, created_at) + VALUES (?,?,?,?,?,?,?,?,?,?)""", + ( + sample.run_id, sample.sample_index, sample.eval_id, + sample.question, sample.agent_output, sample.expected_rubric, + sample.score, sample.reasoning, sample.chaos_profile, sample.created_at, + ), + ) + + def get_samples(self, run_id: str) -> List[SampleResult]: + with self._conn() as conn: + rows = conn.execute( + "SELECT * FROM sample_results WHERE run_id = ? 
ORDER BY sample_index", + (run_id,), + ).fetchall() + return [SampleResult(**dict(r)) for r in rows] + + def _row_to_run(self, d: dict) -> RunRecord: + d["details"] = json.loads(d.get("details") or "{}") + return RunRecord(**d) + + +# โ”€โ”€ Singleton accessor โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +_backend: Optional[StorageBackend] = None + + +def get_backend() -> StorageBackend: + global _backend + if _backend is None: + _backend = SQLiteBackend() + return _backend + + +def set_backend(backend: StorageBackend) -> None: + """Replace the default SQLite backend (e.g. for tests or Postgres).""" + global _backend + _backend = backend diff --git a/ui/backend/main.py b/ui/backend/main.py new file mode 100644 index 0000000..61ffc7f --- /dev/null +++ b/ui/backend/main.py @@ -0,0 +1,245 @@ +"""EvalMonkey UI โ€” FastAPI backend. + +Run with: + cd + uvicorn ui.backend.main:app --reload --port 8080 +""" +from __future__ import annotations + +# Load .env automatically โ€” EVAL_MODEL and all LLM API keys must be +# available before any evalmonkey modules are imported. +import os +try: + from dotenv import load_dotenv + load_dotenv() +except ImportError: + pass # python-dotenv not installed; user must export vars manually + + +import asyncio +import json +from datetime import datetime +from typing import AsyncGenerator, List, Optional + +from fastapi import BackgroundTasks, FastAPI, HTTPException +from fastapi.middleware.cors import CORSMiddleware +from fastapi.responses import StreamingResponse + +from evalmonkey.reporting.history import calculate_production_reliability, get_history +from evalmonkey.scenarios.standard_benchmarks import SUPPORTED_BENCHMARKS + +from .db import get_backend +from .run_engine import execute_run, get_queue +from .schemas import ( + BenchmarkInfo, + RunRecord, + RunSummary, + StartBenchmarkRequest, + StartChaosRequest, +) + +app = FastAPI(title="EvalMonkey UI API", version="0.1.0") + +app.add_middleware( + CORSMiddleware, + allow_origins=["http://localhost:3000", "http://127.0.0.1:3000"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + + +# โ”€โ”€ Config (exposes env-based defaults to the frontend) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + +@app.get("/api/config") +def get_config(): + """Return environment-detected defaults so the UI can pre-select the right judge model.""" + return { + "default_eval_model": os.getenv("EVAL_MODEL", "gpt-4o"), + "has_bedrock_key": bool(os.getenv("BEDROCK_API_KEY")), + "has_openai_key": bool(os.getenv("OPENAI_API_KEY")), + "has_anthropic_key": bool(os.getenv("ANTHROPIC_API_KEY")), + } + + +# โ”€โ”€ Benchmarks โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + +@app.get("/api/benchmarks", response_model=List[BenchmarkInfo]) +def list_benchmarks(): + return [ + BenchmarkInfo(id=k, description=v["description"], category=v["agent_category"]) + for k, v in SUPPORTED_BENCHMARKS.items() + ] + + +# โ”€โ”€ Runs โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + +@app.get("/api/runs", response_model=List[RunSummary]) +def list_runs(limit: int = 50): + runs = 
get_backend().get_all_runs(limit=limit) + return [RunSummary(**r.model_dump()) for r in runs] + + +@app.get("/api/runs/{run_id}", response_model=RunSummary) +def get_run(run_id: str): + run = get_backend().get_run(run_id) + if not run: + raise HTTPException(status_code=404, detail="Run not found") + return RunSummary(**run.model_dump()) + + +@app.get("/api/runs/{run_id}/samples") +def get_run_samples(run_id: str): + samples = get_backend().get_samples(run_id) + return [s.model_dump() for s in samples] + + +# โ”€โ”€ Start runs โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + +@app.post("/api/run/benchmark") +async def start_benchmark(req: StartBenchmarkRequest, background_tasks: BackgroundTasks): + run = RunRecord( + scenario=req.scenario, + run_type="baseline", + target_url=req.target_url if not req.use_sample_agent else "http://127.0.0.1:8001/solve", + eval_model=req.eval_model, + request_key=req.request_key, + response_path=req.response_path, + sample_count=req.limit, + ) + get_backend().save_run(run) + # Initialize queue before background task + get_queue(run.id) + background_tasks.add_task(_run_benchmark_task, run.id, req) + return {"run_id": run.id, "status": "started"} + + +@app.post("/api/run/chaos") +async def start_chaos(req: StartChaosRequest, background_tasks: BackgroundTasks): + run = RunRecord( + scenario=req.scenario, + run_type="chaos", + target_url=req.target_url if not req.use_sample_agent else "http://127.0.0.1:8001/solve", + eval_model=req.eval_model, + request_key=req.request_key, + response_path=req.response_path, + chaos_profile=req.chaos_profile, + sample_count=req.limit, + ) + get_backend().save_run(run) + get_queue(run.id) + background_tasks.add_task(_run_chaos_task, run.id, req) + return {"run_id": run.id, "status": "started"} + + +async def _run_benchmark_task(run_id: str, req: StartBenchmarkRequest): + await execute_run(run_id, req, chaos_profile=None) + + +async def _run_chaos_task(run_id: str, req: StartChaosRequest): + await execute_run(run_id, req, chaos_profile=req.chaos_profile) + + +# โ”€โ”€ SSE Stream โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + +@app.get("/api/run/{run_id}/stream") +async def stream_run(run_id: str): + run = get_backend().get_run(run_id) + if not run: + raise HTTPException(status_code=404, detail="Run not found") + return StreamingResponse( + _event_generator(run_id, run.status), + media_type="text/event-stream", + headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no", "Connection": "keep-alive"}, + ) + + +async def _event_generator(run_id: str, initial_status: str) -> AsyncGenerator[str, None]: + # Flush already-stored samples immediately (handles page reload / late connect) + existing = get_backend().get_samples(run_id) + scores_so_far = [] + for s in existing: + scores_so_far.append(s.score or 0) + current_avg = int(sum(scores_so_far) / len(scores_so_far)) + event = { + "type": "sample", + "index": s.sample_index, + "eval_id": s.eval_id, + "question": s.question, + "agent_output": s.agent_output, + "expected_rubric": s.expected_rubric, + "score": s.score, + "reasoning": s.reasoning, + "current_score": current_avg, + } + yield f"data: {json.dumps(event)}\n\n" + + # If already finished, send complete event and stop + if initial_status 
in ("completed", "failed"): + run = get_backend().get_run(run_id) + if run and run.status == "completed": + yield f"data: {json.dumps({'type': 'complete', 'final_score': run.score, 'failure_count': run.details.get('failure_count', 0)})}\n\n" + elif run and run.status == "failed": + yield f"data: {json.dumps({'type': 'error', 'message': run.details.get('error', 'Run failed')})}\n\n" + return + + # Otherwise drain the live queue + queue = get_queue(run_id) + already_seen = {s.sample_index for s in existing} + + while True: + try: + event = await asyncio.wait_for(queue.get(), timeout=60.0) + # Skip sample events we already sent from DB + if event.get("type") == "sample" and event.get("index") in already_seen: + continue + yield f"data: {json.dumps(event)}\n\n" + if event.get("type") in ("complete", "error"): + break + except asyncio.TimeoutError: + # Send heartbeat to keep connection alive + yield ": heartbeat\n\n" + + +# โ”€โ”€ History & Reliability โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + +@app.get("/api/history") +def get_all_history(): + """Get score history for all scenarios from the CLI history file.""" + from evalmonkey.reporting.history import get_history as _get_history + return _get_history() + + +@app.get("/api/history/{scenario}") +def get_scenario_history(scenario: str): + return get_history(scenario) + + +@app.get("/api/reliability") +def get_reliability(): + """Get production reliability for all scenarios that have history.""" + history = get_history() + scenarios = {h["scenario"] for h in history} + result = {} + for s in scenarios: + result[s] = { + "reliability": calculate_production_reliability(s), + "baseline_count": sum(1 for h in history if h["scenario"] == s and h["run_type"] == "baseline"), + "chaos_count": sum(1 for h in history if h["scenario"] == s and h["run_type"] == "chaos"), + } + return result + + +@app.get("/api/reliability/{scenario}") +def get_scenario_reliability(scenario: str): + return { + "scenario": scenario, + "reliability": calculate_production_reliability(scenario), + } + + +# โ”€โ”€ Health โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + +@app.get("/api/health") +def health(): + return {"status": "ok", "version": "0.1.0"} diff --git a/ui/backend/run_engine.py b/ui/backend/run_engine.py new file mode 100644 index 0000000..7d231be --- /dev/null +++ b/ui/backend/run_engine.py @@ -0,0 +1,188 @@ +"""Run engine โ€” orchestrates benchmark/chaos runs asynchronously. + +This mirrors the CLI logic in scripts/cli.py but is designed to be +called from the FastAPI backend. Results are persisted to SQLite and +pushed to a per-run asyncio.Queue for SSE streaming to the frontend. 
+""" +from __future__ import annotations + +import asyncio +import subprocess +import time +from datetime import datetime +from typing import Dict, Optional + +from evalmonkey.evals.asset_generator import EvalAssetGenerator, FailingTrace, build_output_dir +from evalmonkey.evals.local_assets import load_local_evals +from evalmonkey.evals.runner import LLMJudgeProvider +from evalmonkey.reporting.history import record_run +from evalmonkey.scenarios.standard_benchmarks import load_standard_benchmark +from evalmonkey.simulator.load_gen import LoadGenerator + +from .db import get_backend +from .schemas import RunRecord, SampleResult, StartBenchmarkRequest, StartChaosRequest + +# โ”€โ”€ Per-run event queues (run_id โ†’ asyncio.Queue) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +_run_queues: Dict[str, asyncio.Queue] = {} + + +def get_queue(run_id: str) -> asyncio.Queue: + if run_id not in _run_queues: + _run_queues[run_id] = asyncio.Queue() + return _run_queues[run_id] + + +def cleanup_queue(run_id: str) -> None: + _run_queues.pop(run_id, None) + + +# โ”€โ”€ Sample agent helpers โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +def _start_sample_agent(name: str): + if name == "rag_app": + import os + env = os.environ.copy() + env["PYTHONPATH"] = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) + proc = subprocess.Popen( + ["python3.11", "apps/rag_app/app.py"], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + env=env, + ) + time.sleep(3) + return proc, "http://127.0.0.1:8001/solve" + return None, None + + +# โ”€โ”€ Core run coroutine โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +async def execute_run(run_id: str, req: StartBenchmarkRequest, chaos_profile: Optional[str] = None) -> None: + """Background coroutine that runs a full benchmark and streams events.""" + db = get_backend() + queue = get_queue(run_id) + agent_process = None + + try: + # โ”€โ”€ Resolve target URL โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + effective_url = req.target_url + if req.use_sample_agent: + agent_process, effective_url = await asyncio.to_thread(_start_sample_agent, "rag_app") + if not effective_url: + raise ValueError("Failed to start sample agent") + + await queue.put({"type": "status", "message": f"Loading {req.scenario} benchmark..."}) + + # โ”€โ”€ Load benchmark scenarios โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + scenarios = await asyncio.to_thread(load_standard_benchmark, req.scenario, req.limit) + + if not scenarios: + # Try local evals + evals = load_local_evals("custom_evals.yaml") + target = next((e for e in evals if e.id == req.scenario), None) + if target: + scenarios = [target] + + if not scenarios: + raise ValueError(f"No scenarios found for benchmark: {req.scenario}") + + sample_count = len(scenarios) + db.update_run(run_id, sample_count=sample_count) + await queue.put({"type": "start", "sample_count": sample_count}) + + # โ”€โ”€ Run each sample โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + generator = 
LoadGenerator(effective_url, request_key=req.request_key, response_path=req.response_path) + judge = LLMJudgeProvider(model_name=req.eval_model) + asset_gen = EvalAssetGenerator(model_name=req.eval_model) + scores = [] + first_reasoning = "" + + for idx, eval_task in enumerate(scenarios): + await queue.put({"type": "progress", "index": idx, "total": sample_count, "message": f"Running sample {idx + 1}/{sample_count}..."}) + + # Fire request to agent + resp = await generator.run_scenario(req.scenario, eval_task.input_payload, chaos_profile=chaos_profile) + agent_output = str(resp.get("data", resp.get("error_message", "No output"))) + + # Score with LLM judge (sync โ†’ thread) + evaluation = await asyncio.to_thread(judge.score_run, eval_task.expected_behavior_rubric, agent_output) + score = evaluation.get("score", 0) + reasoning = evaluation.get("reasoning", "") + scores.append(score) + if idx == 0: + first_reasoning = reasoning + + # Persist sample result + sample = SampleResult( + run_id=run_id, + sample_index=idx, + eval_id=eval_task.id, + question=str(eval_task.input_payload.get("question", str(eval_task.input_payload)))[:2000], + agent_output=agent_output[:2000], + expected_rubric=eval_task.expected_behavior_rubric[:1000], + score=score, + reasoning=reasoning[:1000], + chaos_profile=chaos_profile, + ) + db.save_sample(sample) + db.update_run(run_id, completed_samples=idx + 1) + + # Record failure for asset generation + asset_gen.record_failure(FailingTrace( + scenario=req.scenario, + eval_id=eval_task.id, + input_payload=eval_task.input_payload, + agent_output=agent_output, + expected_rubric=eval_task.expected_behavior_rubric, + score=score, + reasoning=reasoning, + chaos_profile=chaos_profile, + )) + + current_avg = int(sum(scores) / len(scores)) + await queue.put({ + "type": "sample", + "index": idx, + "eval_id": eval_task.id, + "question": sample.question, + "agent_output": sample.agent_output, + "expected_rubric": sample.expected_rubric, + "score": score, + "reasoning": reasoning, + "current_score": current_avg, + }) + + # โ”€โ”€ Finalize โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + final_score = int(sum(scores) / len(scores)) if scores else 0 + run_type = "chaos" if chaos_profile else "baseline" + + # Persist to CLI history for continuity + record_run(req.scenario, run_type, final_score, details={"reasoning": first_reasoning, "sample_size": len(scores)}) + + # Save failure assets if needed + output_path = None + if asset_gen.has_failures: + output_dir = build_output_dir(req.scenario if not chaos_profile else f"{req.scenario}_{chaos_profile}") + output_path = await asyncio.to_thread(asset_gen.save, output_dir) + + db.update_run( + run_id, + status="completed", + score=final_score, + completed_at=datetime.now().isoformat(), + details={"reasoning": first_reasoning, "output_path": output_path, "failure_count": asset_gen.failure_count}, + ) + + await queue.put({ + "type": "complete", + "final_score": final_score, + "failure_count": asset_gen.failure_count, + "output_path": output_path, + }) + + except Exception as e: + db.update_run(run_id, status="failed", completed_at=datetime.now().isoformat(), details={"error": str(e)}) + await queue.put({"type": "error", "message": str(e)}) + finally: + if agent_process: + agent_process.terminate() + # Keep queue alive briefly for final reads then clean up + await asyncio.sleep(30) + cleanup_queue(run_id) diff --git 
a/ui/backend/schemas.py b/ui/backend/schemas.py new file mode 100644 index 0000000..392f538 --- /dev/null +++ b/ui/backend/schemas.py @@ -0,0 +1,75 @@ +"""Pydantic schemas for EvalMonkey UI API.""" +from __future__ import annotations + +import uuid +from datetime import datetime +from typing import Any, Dict, List, Optional + +from pydantic import BaseModel, Field + + +class RunRecord(BaseModel): + id: str = Field(default_factory=lambda: str(uuid.uuid4())) + scenario: str + run_type: str # 'baseline' | 'chaos' + status: str = "running" # 'running' | 'completed' | 'failed' + target_url: str + eval_model: str = "gpt-4o" + request_key: str = "question" + response_path: str = "data" + chaos_profile: Optional[str] = None + score: Optional[int] = None + sample_count: int = 0 + completed_samples: int = 0 + created_at: str = Field(default_factory=lambda: datetime.now().isoformat()) + completed_at: Optional[str] = None + details: Dict[str, Any] = {} + + +class SampleResult(BaseModel): + id: Optional[int] = None + run_id: str + sample_index: int + eval_id: str + question: str + agent_output: Optional[str] = None + expected_rubric: Optional[str] = None + score: Optional[int] = None + reasoning: Optional[str] = None + chaos_profile: Optional[str] = None + created_at: str = Field(default_factory=lambda: datetime.now().isoformat()) + + +class StartBenchmarkRequest(BaseModel): + scenario: str + target_url: str + eval_model: str = "gpt-4o" + request_key: str = "question" + response_path: str = "data" + limit: int = 5 + use_sample_agent: bool = False + + +class StartChaosRequest(StartBenchmarkRequest): + chaos_profile: str + + +class BenchmarkInfo(BaseModel): + id: str + description: str + category: str + + +class RunSummary(BaseModel): + id: str + scenario: str + run_type: str + status: str + score: Optional[int] + sample_count: int + completed_samples: int + eval_model: str + chaos_profile: Optional[str] + created_at: str + completed_at: Optional[str] + target_url: str diff --git a/ui/frontend/app/globals.css b/ui/frontend/app/globals.css new file mode 100644 index 0000000..2588b91 --- /dev/null +++ b/ui/frontend/app/globals.css @@ -0,0 +1,200 @@ +@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&family=JetBrains+Mono:wght@400;500&display=swap'); + +@tailwind base; +@tailwind components; +@tailwind utilities; + +:root { + --bg: #0a0a0a; + --surface: #111111; + --surface-2: #161616; + --surface-3: #1c1c1c; + --border: #222222; + --border-strong: #333333; + --text-1: #ffffff; + --text-2: #a1a1a1; + --text-3: #555555; + --accent: #22c55e; + --accent-dim: rgba(34, 197, 94, 0.10); + --accent-border: rgba(34, 197, 94, 0.25); + --red: #ef4444; + --amber: #f59e0b; + --radius: 6px; +} + +* { box-sizing: border-box; margin: 0; padding: 0; } + +html, body { + background: var(--bg); + color: var(--text-1); + font-family: 'Inter', system-ui, sans-serif; + font-size: 14px; + line-height: 1.5; + -webkit-font-smoothing: antialiased; + -moz-osx-font-smoothing: grayscale; +} + +/* Base card */ +.card { + background: var(--surface); + border: 1px solid var(--border); + border-radius: var(--radius); +} + +.card-hover:hover { + background: var(--surface-2); + border-color: var(--border-strong); +} + +/* Monospace for IDs, code, endpoints */ +.mono { font-family: 'JetBrains Mono', 'Fira Code', monospace; } + +/* Score colors */ +.score-red { color: #ef4444; } +.score-amber { color: #f59e0b; } +.score-green { color: #22c55e; } +.score-emerald{ color: #4ade80; } + +/* Scrollbar */ 
+::-webkit-scrollbar { width: 5px; height: 5px; } +::-webkit-scrollbar-track { background: transparent; } +::-webkit-scrollbar-thumb { background: #2a2a2a; border-radius: 3px; } +::-webkit-scrollbar-thumb:hover { background: #333; } + +/* Row animation โ€” subtle fade in */ +@keyframes fadeUp { + from { opacity: 0; transform: translateY(4px); } + to { opacity: 1; transform: translateY(0); } +} +.row-animate { animation: fadeUp 0.2s ease forwards; } + +/* Focus ring */ +:focus-visible { + outline: 1.5px solid var(--accent); + outline-offset: 2px; +} + +/* Input base */ +input, select { + background: var(--surface-2); + border: 1px solid var(--border); + border-radius: var(--radius); + color: var(--text-1); + padding: 8px 12px; + font-family: inherit; + font-size: 13px; + width: 100%; + transition: border-color 0.15s; +} +input:focus, select:focus { + outline: none; + border-color: var(--border-strong); +} +input::placeholder { color: var(--text-3); } + +/* Button base */ +.btn { + display: inline-flex; + align-items: center; + justify-content: center; + gap: 6px; + font-size: 13px; + font-weight: 500; + padding: 7px 14px; + border-radius: var(--radius); + border: 1px solid var(--border); + background: var(--surface-2); + color: var(--text-2); + cursor: pointer; + transition: background 0.15s, border-color 0.15s, color 0.15s; + white-space: nowrap; + font-family: inherit; +} +.btn:hover { + background: var(--surface-3); + border-color: var(--border-strong); + color: var(--text-1); +} +.btn-primary { + background: var(--accent); + border-color: var(--accent); + color: #000; + font-weight: 600; +} +.btn-primary:hover { + background: #16a34a; + border-color: #16a34a; + color: #000; +} +.btn-danger { + background: rgba(239, 68, 68, 0.12); + border-color: rgba(239, 68, 68, 0.3); + color: #ef4444; +} +.btn-danger:hover { + background: rgba(239, 68, 68, 0.2); + color: #f87171; +} +.btn:disabled { + opacity: 0.4; + cursor: not-allowed; +} + +/* Tag / badge */ +.tag { + display: inline-flex; + align-items: center; + gap: 4px; + font-size: 11px; + font-weight: 500; + padding: 2px 8px; + border-radius: 4px; + border: 1px solid var(--border); + color: var(--text-2); + background: var(--surface-2); + white-space: nowrap; +} + +/* Status dot */ +.dot-live { + width: 6px; height: 6px; + border-radius: 50%; + background: var(--accent); + animation: pulse 1.8s ease-in-out infinite; + flex-shrink: 0; +} +@keyframes pulse { + 0%, 100% { opacity: 1; } + 50% { opacity: 0.4; } +} + +/* Divider */ +hr { border: none; border-top: 1px solid var(--border); } + +/* Score ring transition */ +.score-ring-fill { + transition: stroke-dashoffset 0.7s cubic-bezier(0.4, 0, 0.2, 1); +} + +/* Toggle */ +.toggle { + width: 36px; height: 20px; + border-radius: 10px; + background: var(--surface-3); + border: 1px solid var(--border-strong); + position: relative; + cursor: pointer; + transition: background 0.15s, border-color 0.15s; + flex-shrink: 0; +} +.toggle.on { background: var(--accent); border-color: var(--accent); } +.toggle::after { + content: ''; + position: absolute; + top: 2px; left: 2px; + width: 14px; height: 14px; + border-radius: 50%; + background: #fff; + transition: left 0.15s; +} +.toggle.on::after { left: 18px; background: #000; } diff --git a/ui/frontend/app/history/page.tsx b/ui/frontend/app/history/page.tsx new file mode 100644 index 0000000..e65e998 --- /dev/null +++ b/ui/frontend/app/history/page.tsx @@ -0,0 +1,194 @@ +'use client' +import { useEffect, useState } from 'react' +import Link from 'next/link' 
+import { api } from '@/lib/api' +import { HistoryEntry, RunSummary, scoreColor } from '@/lib/types' +import { SUPPORTED_BENCHMARK_CATEGORIES } from '@/lib/benchmarks' +import { LineChart, Line, XAxis, YAxis, Tooltip, ResponsiveContainer, CartesianGrid, ReferenceLine } from 'recharts' +import { Plus } from 'lucide-react' + +const CustomTooltip = ({ active, payload }: any) => { + if (!active || !payload?.length) return null + return ( +
+
{new Date(payload[0]?.payload?.date).toLocaleDateString()}
+ {payload.map((p: any) => ( +
+ {p.value}/100 +
+ ))} +
+ ) +} + +export default function HistoryPage() { + const [runs, setRuns] = useState([]) + const [history, setHistory] = useState([]) + const [reliability, setReliability] = useState>({}) + const [selectedScenario, setSelectedScenario] = useState(null) + const [loading, setLoading] = useState(true) + + useEffect(() => { + Promise.all([api.listRuns(100), api.getHistory(), api.getReliability()]) + .then(([r, h, rel]) => { setRuns(r); setHistory(h); setReliability(rel) }) + .finally(() => setLoading(false)) + }, []) + + const completed = runs.filter(r => r.status === 'completed') + const scenarios = Array.from(new Set(completed.map(r => r.scenario))) + const displayScenario = selectedScenario ?? scenarios[0] ?? null + + const chartData = history + .filter(h => h.scenario === displayScenario) + .sort((a, b) => a.timestamp.localeCompare(b.timestamp)) + .map(h => ({ date: h.timestamp, score: h.score, type: h.run_type })) + + return ( +
+
+
+

History

+

Reliability trends over time

+
+ + New Run + +
+ + {loading ? ( +
+ {[...Array(4)].map((_, i) =>
)} +
+ ) : completed.length === 0 ? ( +
+
๐Ÿ“Š
+
No history yet
+
Complete your first run to see trends here
+ Run a Benchmark +
+ ) : ( + <> + {/* Scenario reliability tiles */} +
+
RELIABILITY BY SCENARIO
+
+ {scenarios.map(s => { + const rel = reliability[s] + const score = rel ? Math.round(rel.reliability) : null + const active = displayScenario === s + return ( + + ) + })} +
+
+ + {/* Chart */} + {displayScenario && chartData.length > 0 && ( +
+
+
+
{displayScenario}
+
Score over time
+
+ {reliability[displayScenario] && ( +
+
Reliability
+
+ {Math.round(reliability[displayScenario].reliability)} +
+
+ )} +
+ + + + new Date(d).toLocaleDateString('en-US', { month: 'short', day: 'numeric' })} + tick={{ fill: '#444', fontSize: 10 }} + axisLine={false} tickLine={false} + /> + + + } /> + + + +
+ โ€” 70% minimum threshold +
+
+ )} + + {/* Runs table */} +
+
ALL RUNS
+
+ + + + {['Scenario', 'Type', 'Score', 'Samples', 'Judge', 'Date', ''].map(h => ( + + ))} + + + + {completed.map((r, i) => ( + + + + + + + + + + ))} + +
{h}
{r.scenario} + + {r.run_type} + + + + {r.score ?? 'โ€”'} + + /100 + {r.sample_count}{r.eval_model.split('/').pop()} + {new Date(r.created_at).toLocaleDateString('en-US', { month: 'short', day: 'numeric', hour: '2-digit', minute: '2-digit' })} + + + Details โ†’ + +
+
+
+ + )} +
+ ) +} diff --git a/ui/frontend/app/layout.tsx b/ui/frontend/app/layout.tsx new file mode 100644 index 0000000..8503132 --- /dev/null +++ b/ui/frontend/app/layout.tsx @@ -0,0 +1,21 @@ +import type { Metadata } from 'next' +import './globals.css' +import Sidebar from '@/components/Sidebar' + +export const metadata: Metadata = { + title: 'EvalMonkey โ€” Agent Benchmarking', + description: 'Run standard benchmarks and chaos tests against your AI agents. Track reliability over time.', +} + +export default function RootLayout({ children }: { children: React.ReactNode }) { + return ( + + + +
+ {children} +
+ + + ) +} diff --git a/ui/frontend/app/page.tsx b/ui/frontend/app/page.tsx new file mode 100644 index 0000000..a23efcc --- /dev/null +++ b/ui/frontend/app/page.tsx @@ -0,0 +1,142 @@ +'use client' +import { useEffect, useState } from 'react' +import Link from 'next/link' +import { api } from '@/lib/api' +import { RunSummary, scoreColor } from '@/lib/types' +import RunCard from '@/components/RunCard' +import ScoreRing from '@/components/ScoreRing' +import { Plus, RefreshCw } from 'lucide-react' + +export default function DashboardPage() { + const [runs, setRuns] = useState([]) + const [reliability, setReliability] = useState>({}) + const [loading, setLoading] = useState(true) + + const refresh = async () => { + try { + const [runsData, relData] = await Promise.all([api.listRuns(30), api.getReliability()]) + setRuns(runsData) + setReliability(relData) + } catch {} + setLoading(false) + } + + useEffect(() => { + refresh() + const iv = setInterval(refresh, 5000) + return () => clearInterval(iv) + }, []) + + const overall = Object.values(reliability).length + ? Math.round(Object.values(reliability).reduce((s, v) => s + v.reliability, 0) / Object.values(reliability).length) + : null + + const running = runs.filter(r => r.status === 'running') + const completed = runs.filter(r => r.status === 'completed') + const avgScore = completed.length + ? Math.round(completed.reduce((s, r) => s + (r.score ?? 0), 0) / completed.length) + : null + + return ( +
+ {/* Page header */} +
+
+

Dashboard

+

Agent benchmark scores & reliability

+
+
+ + + New Run + +
+
+ + {/* Stat cards */} +
+ {/* Reliability */} +
+
PRODUCTION RELIABILITY
+
+ +
+
+ {overall !== null ? overall : 'โ€”'}/100 +
+
60% baseline ยท 40% chaos
+
+
+
+ + {/* Avg score */} +
+
AVG SCORE
+
+ {avgScore !== null ? avgScore : 'โ€”'} + /100 +
+
{completed.length} completed
+
+ + {/* Active */} +
+
ACTIVE RUNS
+
+
{running.length}
+ {running.length > 0 && } +
+
in progress
+
+
+ + {/* Live runs */} + {running.length > 0 && ( +
+
+ + LIVE RUNS +
+
+ {running.map(r => )} +
+
+ )} + + {/* Recent runs */} +
+
+ RECENT RUNS + + All history โ†’ + +
+ + {loading ? ( +
+ {[...Array(6)].map((_, i) => ( +
+ ))} +
+ ) : completed.length === 0 ? ( +
+
๐Ÿต
+
No runs yet
+
+ Run your first benchmark to track agent reliability +
+ + Run First Benchmark + +
+ ) : ( +
+ {completed.map(r => )} +
+ )} +
+
+ ) +} diff --git a/ui/frontend/app/run/[id]/page.tsx b/ui/frontend/app/run/[id]/page.tsx new file mode 100644 index 0000000..2c52b46 --- /dev/null +++ b/ui/frontend/app/run/[id]/page.tsx @@ -0,0 +1,258 @@ +'use client' +import { useEffect, useRef, useState } from 'react' +import { useParams } from 'next/navigation' +import { api } from '@/lib/api' +import { RunSummary, SampleResult, SSEEvent, scoreColor } from '@/lib/types' +import ScoreRing from '@/components/ScoreRing' +import { SUPPORTED_BENCHMARK_CATEGORIES } from '@/lib/benchmarks' +import { ArrowLeft, CheckCircle, XCircle, Zap } from 'lucide-react' +import Link from 'next/link' + +export default function RunDetailPage() { + const { id } = useParams<{ id: string }>() + const [run, setRun] = useState(null) + const [samples, setSamples] = useState([]) + const [currentScore, setCurrentScore] = useState(null) + const [status, setStatus] = useState('running') + const [statusMsg, setStatusMsg] = useState('Initializing...') + const [selected, setSelected] = useState(null) + const bottomRef = useRef(null) + + useEffect(() => { + api.getRun(id).then(r => { setRun(r); setStatus(r.status) }).catch(() => {}) + }, [id]) + + useEffect(() => { + // Connect directly to the backend โ€” bypasses the Next.js dev-server proxy + // which buffers SSE responses and causes "Waiting for first result..." to hang. + const backendUrl = process.env.NEXT_PUBLIC_BACKEND_URL ?? 'http://localhost:8080' + const es = new EventSource(`${backendUrl}/api/run/${id}/stream`) + es.onmessage = (e) => { + try { + const ev: SSEEvent = JSON.parse(e.data) + if (ev.type === 'status' || ev.type === 'progress') { + setStatusMsg(ev.message ?? '') + } else if (ev.type === 'sample') { + const s: SampleResult = { + id: null, run_id: id, + sample_index: ev.index ?? 0, + eval_id: ev.eval_id ?? '', + question: ev.question ?? '', + agent_output: ev.agent_output ?? null, + expected_rubric: ev.expected_rubric ?? null, + score: ev.score ?? null, + reasoning: ev.reasoning ?? null, + chaos_profile: null, + created_at: new Date().toISOString(), + } + setSamples(prev => prev.find(x => x.sample_index === s.sample_index) ? prev : [...prev, s]) + setCurrentScore(ev.current_score ?? null) + setTimeout(() => bottomRef.current?.scrollIntoView({ behavior: 'smooth' }), 80) + } else if (ev.type === 'complete') { + setStatus('completed'); setCurrentScore(ev.final_score ?? null) + setStatusMsg('Completed') + api.getRun(id).then(r => setRun(r)).catch(() => {}) + es.close() + } else if (ev.type === 'error') { + setStatus('failed'); setStatusMsg(ev.message ?? 'Run failed') + es.close() + } + } catch {} + } + es.onerror = () => { + // On connection error, poll once to get the final run state from DB + api.getRun(id).then(r => { + setRun(r) + setStatus(r.status) + if (r.status === 'failed') setStatusMsg((r as any).details?.error ?? 'Run failed') + if (r.status === 'completed') setStatusMsg('Completed') + }).catch(() => {}) + es.close() + } + return () => es.close() + }, [id]) + + const finalScore = status === 'completed' ? (run?.score ?? currentScore) : currentScore + const total = run?.sample_count ?? 0 + const progress = total > 0 ? (samples.length / total) * 100 : 0 + + return ( +
+ {/* Back */} + + Dashboard + + + {/* Run header card */} +
+
+
+
+ {run?.scenario ?? id} + {run?.run_type === 'chaos' && ( + + chaos + + )} +
+
+ {run?.eval_model ?? 'โ€”'} + {run?.chaos_profile && โšก {run.chaos_profile}} + {run?.target_url} +
+
+
+
+ {finalScore !== null ? finalScore : 'โ€”'} +
+
/ 100
+
+
+ + {/* Progress */} +
+
+
+ {status === 'running' && } + {status === 'completed' && } + {status === 'failed' && } + {statusMsg} +
+ {samples.length}/{total} +
+
+
+
+
+
+ + {/* Sample results */} +
+
+
SAMPLES
+ {samples.length === 0 ? ( +
+ {status === 'failed' ? ( + <> + +
Run failed
+
+ {statusMsg} +
+ Try Again + + ) : ( + <> +
+
{statusMsg || 'Waiting for first result...'}
+ + )} +
+ ) : ( +
+ {samples.map((s, i) => { + const col = scoreColor(s.score) + const open = selected === s.sample_index + return ( +
setSelected(open ? null : s.sample_index)} + > +
+ {/* Index */} +
+ {s.sample_index + 1} +
+ + {/* Question */} +
+
{s.question}
+ {s.agent_output && ( +
+ โ†ณ {s.agent_output} +
+ )} +
+ + {/* Score */} +
+
+ {s.score ?? 'โ€ฆ'} +
+
+
+ + {/* Expanded */} + {open && ( +
+
+
QUESTION
+
{s.question}
+
+
+
AGENT OUTPUT
+
{s.agent_output || '(none)'}
+
+
+
EXPECTED
+
{s.expected_rubric}
+
+
+
JUDGE REASONING
+
{s.reasoning || 'โ€”'}
+
+
+ )} +
+ ) + })} +
+
+ )} +
+ + {/* Breakdown (complete only) */} + {status === 'completed' && samples.length > 0 && ( +
+
BREAKDOWN
+
+ {[ + { label: '90โ€“100', min: 90, max: 101, col: '#4ade80' }, + { label: '75โ€“89', min: 75, max: 90, col: '#22c55e' }, + { label: '50โ€“74', min: 50, max: 75, col: '#f59e0b' }, + { label: '0โ€“49', min: 0, max: 50, col: '#ef4444' }, + ].map(({ label, min, max, col }) => { + const count = samples.filter(s => (s.score ?? 0) >= min && (s.score ?? 0) < max).length + return ( +
+
+ {label} + {count} +
+
+
+
+
+ ) + })} +
+ Run Again + History +
+ )} +
+
+ ) +} diff --git a/ui/frontend/app/run/new/page.tsx b/ui/frontend/app/run/new/page.tsx new file mode 100644 index 0000000..6037cff --- /dev/null +++ b/ui/frontend/app/run/new/page.tsx @@ -0,0 +1,354 @@ +'use client' +import { useEffect, useState } from 'react' +import { useRouter } from 'next/navigation' +import { api } from '@/lib/api' +import { BenchmarkInfo, CATEGORY_COLORS } from '@/lib/types' +import { CHAOS_PROFILES, EVAL_MODELS } from '@/lib/benchmarks' +import { ChevronRight, Zap, Bot, FlaskConical } from 'lucide-react' + +type Step = 1 | 2 | 3 + +export default function NewRunPage() { + const router = useRouter() + const [step, setStep] = useState(1) + const [benchmarks, setBenchmarks] = useState([]) + const [loading, setLoading] = useState(false) + + const [targetUrl, setTargetUrl] = useState('http://localhost:8000') + const [useSampleAgent, setUseSampleAgent] = useState(false) + const [requestKey, setRequestKey] = useState('question') + const [responsePath, setResponsePath] = useState('data') + const [selectedBenchmark, setSelectedBenchmark] = useState(null) + const [selectedCategory, setSelectedCategory] = useState('All') + const [evalModel, setEvalModel] = useState('gpt-4o') + const [limit, setLimit] = useState(5) + const [enableChaos, setEnableChaos] = useState(false) + const [chaosProfile, setChaosProfile] = useState('client_prompt_injection') + const [error, setError] = useState(null) + + useEffect(() => { + // Auto-select the judge model matching whatever is configured in .env + api.getConfig().then(cfg => setEvalModel(cfg.default_eval_model)).catch(() => {}) + api.listBenchmarks().then(setBenchmarks).catch(() => {}) + }, []) + + const categories = Array.from(new Set(benchmarks.map(b => b.category))) + + const handleLaunch = async () => { + if (!selectedBenchmark) return + setLoading(true); setError(null) + try { + const base = { + scenario: selectedBenchmark, + target_url: useSampleAgent ? 'http://127.0.0.1:8001/solve' : targetUrl, + eval_model: evalModel, request_key: requestKey, + response_path: responsePath, limit, use_sample_agent: useSampleAgent, + } + const result = enableChaos + ? await api.startChaos({ ...base, chaos_profile: chaosProfile }) + : await api.startBenchmark(base) + router.push(`/run/${result.run_id}`) + } catch (e: any) { setError(e.message); setLoading(false) } + } + + const stepLabels = ['Agent Setup', 'Benchmark', 'Configure'] + + return ( +
+
+

New Benchmark Run

+

Configure and launch an evaluation against your agent

+
+ + {/* Step indicator */} +
+ {([1, 2, 3] as Step[]).map((n, i) => ( +
+ + {n < 3 && } +
+ ))} +
+ + {/* Step 1: Agent Setup */} + {step === 1 && ( +
+ {/* Sample agent toggle */} +
{ setUseSampleAgent(!useSampleAgent); setSelectedBenchmark(null); }} + > +
+ +
+
Use Built-in Demo Agent
+
+ Auto-starts the sample RAG app on localhost:8001 +
+
+
+
+
+ + {!useSampleAgent && ( + <> +
+ + setTargetUrl(e.target.value)} + className="mono" + placeholder="http://localhost:8000/solve" + /> +
+
+
+ + setRequestKey(e.target.value)} + className="mono" placeholder="question" /> +

JSON key sent with the question

+
+
+ + setResponsePath(e.target.value)} + className="mono" placeholder="data" /> +

e.g. choices.0.message.content

+
+
+ + )} + + +
+ )} + + {/* Step 2: Benchmark Picker */} + {step === 2 && ( +
+
+

+ Select a standard benchmark dataset: +

+
+
+          {categories.map(cat => (
+
+          ))}
+
+          {categories.map(cat => {
+            if (selectedCategory !== 'All' && selectedCategory !== cat) return null
+
+            // Filter out benchmarks that don't make sense for the demo agent
+            const allowedForDemo = ['gsm8k', 'mmlu', 'truthfulqa', 'toxigen', 'hella-swag', 'winogrande', 'arc']
+            const catBenchmarks = benchmarks.filter(b => b.category === cat && (!useSampleAgent || allowedForDemo.includes(b.id)))
+
+            if (catBenchmarks.length === 0) return null
+
+            return (
+
{cat.toUpperCase()}
+
+ {catBenchmarks.map(b => { + const sel = selectedBenchmark === b.id + return ( + + ) + })} +
+
+ )})} +
+
+ + +
+
+ )} + + {/* Step 3: Configure */} + {step === 3 && ( +
+
+ {/* Judge model */} +
+ +
+ {EVAL_MODELS.map(m => ( + + ))} +
+
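+          {/* The judge model defaults to whatever EVAL_MODEL the backend reads from .env
+              (fetched via api.getConfig above); picking one here only overrides it for this run. */}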
+ + {/* Sample count */} +
+ + setLimit(+e.target.value)} + className="w-full" + style={{ accentColor: '#22c55e' }} + /> +
+ 1 (fast)50 (thorough) +
+
+ + {/* Chaos toggle */} +
+
+
+ + + CHAOS INJECTION + +
+
setEnableChaos(!enableChaos)} + /> +
+ {enableChaos && ( +
+ {CHAOS_PROFILES.map(p => ( + + ))} +
+ )} +
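+            {/* When enabled, the run is started via api.startChaos with the selected
+                chaos_profile, so the same benchmark is executed with that injection applied
+                (see handleLaunch above). */}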
+
+ + {/* Summary */} +
+
+ Benchmark + {selectedBenchmark} +
+
+ Target + {useSampleAgent ? 'sample rag_app' : targetUrl} +
+
+ Judge + {EVAL_MODELS.find(m => m.id === evalModel)?.label} +
+
+ Samples + {limit} +
+ {enableChaos && ( +
+ Chaos + {chaosProfile} +
+ )} +
+ + {error && ( +
+ {error} +
+ )} + +
+ + +
+
+ )} +
+    )
+}
diff --git a/ui/frontend/components/CategoryBadge.tsx b/ui/frontend/components/CategoryBadge.tsx
new file mode 100644
index 0000000..b8b4150
--- /dev/null
+++ b/ui/frontend/components/CategoryBadge.tsx
@@ -0,0 +1,25 @@
+import { CATEGORY_COLORS } from '@/lib/types'
+
+interface Props {
+  category: string
+  size?: 'sm' | 'md'
+}
+
+export default function CategoryBadge({ category, size = 'sm' }: Props) {
+  const c = CATEGORY_COLORS[category] ?? { bg: 'rgba(255,255,255,0.04)', text: '#666', border: '#252525' }
+  return (
+
+      {category}
+
+  )
+}
diff --git a/ui/frontend/components/RunCard.tsx b/ui/frontend/components/RunCard.tsx
new file mode 100644
index 0000000..32409ab
--- /dev/null
+++ b/ui/frontend/components/RunCard.tsx
@@ -0,0 +1,78 @@
+'use client'
+import Link from 'next/link'
+import { RunSummary, scoreColor } from '@/lib/types'
+import ScoreRing from './ScoreRing'
+import { SUPPORTED_BENCHMARK_CATEGORIES } from '@/lib/benchmarks'
+import { Zap } from 'lucide-react'
+
+interface Props { run: RunSummary }
+
+function timeAgo(iso: string): string {
+  const diff = Date.now() - new Date(iso).getTime()
+  const mins = Math.floor(diff / 60000)
+  if (mins < 1) return 'just now'
+  if (mins < 60) return `${mins}m ago`
+  const hrs = Math.floor(mins / 60)
+  if (hrs < 24) return `${hrs}h ago`
+  return `${Math.floor(hrs / 24)}d ago`
+}
+
+export default function RunCard({ run }: Props) {
+  const col = scoreColor(run.status === 'running' ? null : run.score)
+
+  return (
+
+      {/* Header */}
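+      {/* Each card links to the run's detail page (/run/[id]) and shows the scenario, its
+          category, a ScoreRing, a progress bar while the run is still executing, and either
+          the sample count or a failure state in the footer. */}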
+
+
+ {run.scenario} + {run.run_type === 'chaos' && ( + + chaos + + )} +
+ {SUPPORTED_BENCHMARK_CATEGORIES[run.scenario] ?? 'Research'} +
+ +
+ + {/* Progress bar (only when running) */} + {run.status === 'running' && ( +
+
+
+
+
+ )} + + {/* Footer */} +
+ {timeAgo(run.created_at)} + + {run.status === 'running' ? 'โ— running' + : run.status === 'failed' ? 'โœ• failed' + : `${run.sample_count} samples`} + +
+
+    )
+}
diff --git a/ui/frontend/components/ScoreRing.tsx b/ui/frontend/components/ScoreRing.tsx
new file mode 100644
index 0000000..ff2b852
--- /dev/null
+++ b/ui/frontend/components/ScoreRing.tsx
@@ -0,0 +1,59 @@
+'use client'
+import { scoreColor } from '@/lib/types'
+
+interface Props {
+  score: number | null
+  size?: number
+  strokeWidth?: number
+  showLabel?: boolean
+  animate?: boolean
+}
+
+export default function ScoreRing({
+  score,
+  size = 64,
+  strokeWidth = 5,
+  showLabel = true,
+  animate = true,
+}: Props) {
+  const radius = (size - strokeWidth) / 2
+  const circumference = 2 * Math.PI * radius
+  const pct = score !== null ? Math.max(0, Math.min(100, score)) : 0
+  const offset = circumference - (pct / 100) * circumference
+  const color = scoreColor(score)
+
+  return (
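+    // Ring fill: the circle's dash length is the full circumference and its dash offset is
+    // `offset`, so the visible arc covers (pct / 100) of the ring. A score of 75 draws three
+    // quarters of the circle; a null score (pct 0) shows an empty ring and the label falls
+    // back to an em dash.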
+ + {/* Track */} + + {/* Fill */} + + + {showLabel && ( + + {score !== null ? score : 'โ€”'} + + )} +
+    )
+}
diff --git a/ui/frontend/components/Sidebar.tsx b/ui/frontend/components/Sidebar.tsx
new file mode 100644
index 0000000..934fcaf
--- /dev/null
+++ b/ui/frontend/components/Sidebar.tsx
@@ -0,0 +1,57 @@
+'use client'
+import Link from 'next/link'
+import { usePathname } from 'next/navigation'
+import { BarChart3, History, Plus } from 'lucide-react'
+
+const navItems = [
+  { href: '/', label: 'Dashboard', icon: BarChart3 },
+  { href: '/run/new', label: 'New Run', icon: Plus },
+  { href: '/history', label: 'History', icon: History },
+]
+
+export default function Sidebar() {
+  const path = usePathname()
+
+  return (
+
+  )
+}
diff --git a/ui/frontend/next.config.js b/ui/frontend/next.config.js
new file mode 100644
index 0000000..77d993c
--- /dev/null
+++ b/ui/frontend/next.config.js
@@ -0,0 +1,13 @@
+/** @type {import('next').NextConfig} */
+const nextConfig = {
+  async rewrites() {
+    return [
+      {
+        source: '/api/:path*',
+        destination: 'http://localhost:8080/api/:path*',
+      },
+    ]
+  },
+}
+
+module.exports = nextConfig
diff --git a/ui/frontend/package.json b/ui/frontend/package.json
new file mode 100644
index 0000000..03187a8
--- /dev/null
+++ b/ui/frontend/package.json
@@ -0,0 +1,27 @@
+{
+  "name": "evalmonkey-ui",
+  "version": "0.1.0",
+  "private": true,
+  "scripts": {
+    "dev": "next dev",
+    "build": "next build",
+    "start": "next start"
+  },
+  "dependencies": {
+    "next": "14.2.3",
+    "react": "^18",
+    "react-dom": "^18",
+    "recharts": "^2.12.7",
+    "lucide-react": "^0.378.0",
+    "clsx": "^2.1.1"
+  },
+  "devDependencies": {
+    "typescript": "^5",
+    "@types/node": "^20",
+    "@types/react": "^18",
+    "@types/react-dom": "^18",
+    "tailwindcss": "^3.4.1",
+    "postcss": "^8",
+    "autoprefixer": "^10.0.1"
+  }
+}
diff --git a/ui/frontend/postcss.config.js b/ui/frontend/postcss.config.js
new file mode 100644
index 0000000..33ad091
--- /dev/null
+++ b/ui/frontend/postcss.config.js
@@ -0,0 +1,6 @@
+module.exports = {
+  plugins: {
+    tailwindcss: {},
+    autoprefixer: {},
+  },
+}
diff --git a/ui/frontend/tailwind.config.js b/ui/frontend/tailwind.config.js
new file mode 100644
index 0000000..fd55c9d
--- /dev/null
+++ b/ui/frontend/tailwind.config.js
@@ -0,0 +1,44 @@
+/** @type {import('tailwindcss').Config} */
+module.exports = {
+  content: [
+    './pages/**/*.{js,ts,jsx,tsx,mdx}',
+    './components/**/*.{js,ts,jsx,tsx,mdx}',
+    './app/**/*.{js,ts,jsx,tsx,mdx}',
+  ],
+  theme: {
+    extend: {
+      colors: {
+        bg: '#0a0a0a',
+        surface: '#111111',
+        s2: '#161616',
+        s3: '#1c1c1c',
+        border: '#222222',
+        bstrong: '#333333',
+        t1: '#ffffff',
+        t2: '#a1a1a1',
+        t3: '#555555',
+        accent: '#22c55e',
+      },
+      fontFamily: {
+        sans: ['Inter', 'system-ui', 'sans-serif'],
+        mono: ['JetBrains Mono', 'Fira Code', 'monospace'],
+      },
+      borderRadius: {
+        DEFAULT: '6px',
+        sm: '4px',
+        md: '6px',
+        lg: '8px',
+      },
+      animation: {
+        'fade-up': 'fadeUp 0.2s ease forwards',
+      },
+      keyframes: {
+        fadeUp: {
+          from: { opacity: '0', transform: 'translateY(4px)' },
+          to: { opacity: '1', transform: 'translateY(0)' },
+        },
+      },
+    },
+  },
+  plugins: [],
+}
diff --git a/ui/frontend/tsconfig.json b/ui/frontend/tsconfig.json
new file mode 100644
index 0000000..109b22f
--- /dev/null
+++ b/ui/frontend/tsconfig.json
@@ -0,0 +1,21 @@
+{
+  "compilerOptions": {
+    "target": "es5",
+    "lib": ["dom", "dom.iterable", "esnext"],
+    "allowJs": true,
+    "skipLibCheck": true,
+    "strict": true,
+    "noEmit": true,
+    "esModuleInterop": true,
+    "module": "esnext",
+    "moduleResolution": "bundler",
+    "resolveJsonModule": true,
+    "isolatedModules": true,
+    "jsx": "preserve",
+    "incremental": true,
+    "plugins": [{ "name": "next" }],
+    "paths": { "@/*": ["./*"] }
+  },
+  "include": ["next-env.d.ts", "**/*.ts", "**/*.tsx", ".next/types/**/*.ts"],
+  "exclude": ["node_modules"]
+}
diff --git a/ui/requirements-ui.txt b/ui/requirements-ui.txt
new file mode 100644
index 0000000..7c608d9
--- /dev/null
+++ b/ui/requirements-ui.txt
@@ -0,0 +1,2 @@
+fastapi>=0.110.0
+uvicorn[standard]>=0.29.0