From 1ef5d880341563d2f348284dfbe33637097c5aae Mon Sep 17 00:00:00 2001 From: himmi-01 Date: Tue, 5 May 2026 22:45:57 -0700 Subject: [PATCH] feat: evalmonkey web ui and benchmark stability fixes - Added Next.js & FastAPI Web UI for live benchmarking - Fixed macOS torch shared memory permission crash - Improved HuggingFace datasets loading logic (trust_remote_code=True) - Fixed Hellaswag and MMLU strict LLM judge options mapping - Updated UI to auto-detect LLM judge from environment --- .gitignore | 14 +- README.md | 81 +++-- evalmonkey/scenarios/standard_benchmarks.py | 32 +- ui/README.md | 57 ++++ ui/backend/__init__.py | 1 + ui/backend/db.py | 176 ++++++++++ ui/backend/main.py | 245 ++++++++++++++ ui/backend/run_engine.py | 188 +++++++++++ ui/backend/schemas.py | 75 +++++ ui/frontend/app/globals.css | 200 +++++++++++ ui/frontend/app/history/page.tsx | 194 +++++++++++ ui/frontend/app/layout.tsx | 21 ++ ui/frontend/app/page.tsx | 142 ++++++++ ui/frontend/app/run/[id]/page.tsx | 258 ++++++++++++++ ui/frontend/app/run/new/page.tsx | 354 ++++++++++++++++++++ ui/frontend/components/CategoryBadge.tsx | 25 ++ ui/frontend/components/RunCard.tsx | 78 +++++ ui/frontend/components/ScoreRing.tsx | 59 ++++ ui/frontend/components/Sidebar.tsx | 57 ++++ ui/frontend/next.config.js | 13 + ui/frontend/package.json | 27 ++ ui/frontend/postcss.config.js | 6 + ui/frontend/tailwind.config.js | 44 +++ ui/frontend/tsconfig.json | 21 ++ ui/requirements-ui.txt | 2 + 25 files changed, 2341 insertions(+), 29 deletions(-) create mode 100644 ui/README.md create mode 100644 ui/backend/__init__.py create mode 100644 ui/backend/db.py create mode 100644 ui/backend/main.py create mode 100644 ui/backend/run_engine.py create mode 100644 ui/backend/schemas.py create mode 100644 ui/frontend/app/globals.css create mode 100644 ui/frontend/app/history/page.tsx create mode 100644 ui/frontend/app/layout.tsx create mode 100644 ui/frontend/app/page.tsx create mode 100644 ui/frontend/app/run/[id]/page.tsx create mode 100644 ui/frontend/app/run/new/page.tsx create mode 100644 ui/frontend/components/CategoryBadge.tsx create mode 100644 ui/frontend/components/RunCard.tsx create mode 100644 ui/frontend/components/ScoreRing.tsx create mode 100644 ui/frontend/components/Sidebar.tsx create mode 100644 ui/frontend/next.config.js create mode 100644 ui/frontend/package.json create mode 100644 ui/frontend/postcss.config.js create mode 100644 ui/frontend/tailwind.config.js create mode 100644 ui/frontend/tsconfig.json create mode 100644 ui/requirements-ui.txt diff --git a/.gitignore b/.gitignore index ffb25e4..039ad6c 100644 --- a/.gitignore +++ b/.gitignore @@ -41,4 +41,16 @@ env/ history.json *.log demo_run.sh -output/ \ No newline at end of file +output/ + +# UI โ€” Node / Next.js +ui/frontend/node_modules/ +ui/frontend/.next/ +ui/frontend/.swc/ +ui/frontend/next-env.d.ts +ui/frontend/tsconfig.tsbuildinfo +ui/frontend/package-lock.json +ui/frontend/.env.local + +# UI โ€” SQLite database (local data only) +*.db \ No newline at end of file diff --git a/README.md b/README.md index 09d0bd3..965ef8d 100644 --- a/README.md +++ b/README.md @@ -28,51 +28,84 @@ EvalMonkey natively supports evaluating ANY LLM: **AWS Bedrock**, **Azure**, **G ## ๐Ÿš€ At a Glance - **8 Agent Frameworks natively supported**: CrewAI, LangChain, OpenAI Agents, Microsoft AutoGen, AWS Bedrock, Ollama, Strands, and custom HTTP endpoints. 
-- **20 Standard Benchmarks out-of-the-box**: GSM8K, BIG-Bench Hard, HotpotQA, ToxiGen, MT-Bench, MBPP, and more โ€” all categorised by the agent type they target. +- **19 Standard Benchmarks out-of-the-box**: GSM8K, BIG-Bench Hard, HotpotQA, ToxiGen, MT-Bench, MBPP, and more โ€” all categorised by the agent type they target. - **23 Chaos Injections ready to run**: 12 client-side payload mutations + 11 server-side middleware injections โ€” all text-based, no GPU or vision dependencies. - **Automatic Eval Asset Generation**: Poor benchmark scores automatically produce `traces.json`, `evals.json`, and `improvement_prompt.md` โ€” one `cat` command away from Claude Code or Cursor. +--- + ## โšก๏ธ Quick Start +### Option A โ€” Let Claude Code or Cursor set it up for you (30 seconds) + +Open Claude Code, Cursor, or any AI coding assistant and paste this prompt: + +``` +Set up EvalMonkey in my project so I can benchmark my AI agent. + +1. Clone https://github.com/Corbell-AI/evalmonkey into a sibling folder +2. Run: pip install -e . inside that folder +3. Copy .env.example to .env and ask me which LLM provider I want to use as the benchmark judge (OpenAI, Anthropic, Bedrock, or Ollama) โ€” then fill in the correct key +4. Run: evalmonkey init --framework --name "My Agent" --port + Use the framework my agent is built with (crewai / langchain / openai / bedrock / autogen / ollama / strands / custom) +5. Show me the generated evalmonkey.yaml and ask me to confirm the agent URL and response path are correct +6. Run a quick smoke test: evalmonkey run-benchmark --scenario gsm8k --sample-agent rag_app --limit 2 + to confirm everything is wired up correctly +7. Then run the real benchmark against my agent: evalmonkey run-benchmark --scenario mmlu --limit 5 +8. Show me the score and explain what it means +``` + +> The agent will handle cloning, installing, configuring your `.env`, and running the first benchmark โ€” all without you typing a single command. + +--- + +### Option B โ€” Manual Setup (5 minutes) + +**1. Install** ```bash git clone https://github.com/Corbell-AI/evalmonkey cd evalmonkey pip install -e . ``` -**Step 1 โ€” Run this once inside your agent's project folder:** +**2. Configure your LLM key** (used only as the evaluation judge โ€” never for your agent) ```bash -cd /your/crewai-project # wherever your agent lives -evalmonkey init --framework crewai --name "My Research Crew" --port 8000 +cp .env.example .env ``` -This auto-generates a pre-filled `evalmonkey.yaml` with the correct request/response format for your framework. Supported: `crewai`, `langchain`, `openai`, `bedrock`, `autogen`, `ollama`, `strands`, `custom`. +Open `.env` and set **one** of these depending on your LLM provider: +```bash +EVAL_MODEL=gpt-4o +OPENAI_API_KEY=sk-... # OpenAI + +# โ€” OR โ€” +EVAL_MODEL=anthropic/claude-haiku-4-5 +ANTHROPIC_API_KEY=sk-ant-... # Anthropic -**Step 2 โ€” Edit the two settings that matter:** -```yaml -# evalmonkey.yaml โ€” generated for CrewAI -agent: - name: "My Research Crew" - framework: crewai - url: http://localhost:8000/chat # โ† where your agent listens - request_key: message - response_path: reply +# โ€” OR โ€” +EVAL_MODEL=bedrock/anthropic.claude-3-haiku-20240307-v1:0 +AWS_ACCESS_KEY_ID=... # AWS Bedrock - # โ† EvalMonkey will start this for you automatically! - # It spawns the process, waits for it to turn on, benchmarks, then stops it. 
- agent_command: "python src/agent.py" # or: uvicorn src.agent:app --port 8000 - agent_startup_wait: 3 # seconds to wait after launch +# โ€” OR โ€” (no key needed) +EVAL_MODEL=ollama/llama3 # Local Ollama +``` -eval_model: "gpt-4o" # โ† the LLM used as benchmark judge +**3. Smoke test with the built-in sample agent** (no agent of your own needed yet) +```bash +evalmonkey run-benchmark --scenario gsm8k --sample-agent rag_app --limit 3 ``` +You should see 3 samples run and a score printed. โœ… -**Step 3 โ€” Run everything. EvalMonkey starts your agent, benchmarks it, then stops it:** +**4. Point it at your own agent** ```bash -evalmonkey run-benchmark --scenario mmlu -evalmonkey run-chaos --scenario mmlu --chaos-profile client_prompt_injection -evalmonkey history --scenario mmlu +cd /path/to/your/agent/project +evalmonkey init --framework crewai --name "My Agent" --port 8000 +# Edit the generated evalmonkey.yaml to set your agent's URL and response format +evalmonkey run-benchmark --scenario mmlu --limit 5 ``` -> EvalMonkey discovers `evalmonkey.yaml` from the **current working directory** โ€” the same convention used by `pytest`, `promptfoo`, and `docker-compose`. Run all commands from your agent's project folder. +> `evalmonkey.yaml` is discovered from the **current working directory** โ€” same convention as `pytest` and `docker-compose`. + +--- ## ๐Ÿค Works With Any Agent โ€” No Code Changes Required diff --git a/evalmonkey/scenarios/standard_benchmarks.py b/evalmonkey/scenarios/standard_benchmarks.py index c40e4be..ce1296c 100644 --- a/evalmonkey/scenarios/standard_benchmarks.py +++ b/evalmonkey/scenarios/standard_benchmarks.py @@ -101,7 +101,17 @@ def load_standard_benchmark(benchmark_name: str, limit: int = 5) -> List[EvalSce Automatically downloads datasets and converts them to standard HTTP scenarios! """ try: - from datasets import load_dataset + import os + # Prevent PyTorch shared-memory multiprocessing on Mac. + # Even with streaming=True, HuggingFace datasets can invoke torch_shm_manager + # for internal caching โ€” which fails on Mac with "Permission denied". + os.environ.setdefault("TOKENIZERS_PARALLELISM", "false") + os.environ.setdefault("OMP_NUM_THREADS", "1") + os.environ.setdefault("HF_DATASETS_OFFLINE", "0") + + from datasets import load_dataset, disable_progress_bar, disable_caching + disable_progress_bar() + disable_caching() # prevents torch_shm from being invoked for cache writes except ImportError: raise ImportError("The 'datasets' library is required to run standard benchmarks. 
Please run 'pip install datasets'.") @@ -132,7 +142,7 @@ def load_standard_benchmark(benchmark_name: str, limit: int = 5) -> List[EvalSce elif benchmark_name.lower() == "xlam": # A standard function calling benchmark try: - dataset = load_dataset("Salesforce/xlam-function-calling-60k", split="train", streaming=True) + dataset = load_dataset("Salesforce/xlam-function-calling-60k", split="train", streaming=True, trust_remote_code=True) for idx, item in enumerate(dataset): if idx >= limit: break @@ -172,20 +182,34 @@ def load_standard_benchmark(benchmark_name: str, limit: int = 5) -> List[EvalSce path, name, split, q_col, a_col = hf_map[benchmark_name.lower()] desc = SUPPORTED_BENCHMARKS[benchmark_name.lower()]["description"] print(f"Loading {benchmark_name} from HuggingFace Datasets ({path})...") - dataset = load_dataset(path, name, split=split, streaming=True) if name else load_dataset(path, split=split, streaming=True) + dataset = load_dataset(path, name, split=split, streaming=True, trust_remote_code=True) if name else load_dataset(path, split=split, streaming=True, trust_remote_code=True) for idx, item in enumerate(dataset): if idx >= limit: break question_text = str(item.get(q_col, "No question")) + expected_answer = str(item.get(a_col, 'Unknown')) + if benchmark_name.lower() == "mmlu" and "choices" in item: question_text += f"\nChoices: {item['choices']}" + try: + ans_idx = int(expected_answer) + expected_answer = f"Option {ans_idx}: {item['choices'][ans_idx]}" + except (ValueError, IndexError): + pass + elif benchmark_name.lower() == "hella-swag" and "endings" in item: + question_text += f"\nOptions:\n0: {item['endings'][0]}\n1: {item['endings'][1]}\n2: {item['endings'][2]}\n3: {item['endings'][3]}" + try: + ans_idx = int(expected_answer) + expected_answer = f"Option {ans_idx}: {item['endings'][ans_idx]}" + except (ValueError, IndexError): + pass scenarios.append(EvalScenario( id=f"{benchmark_name}_{idx}", description=desc, input_payload={"question": question_text}, - expected_behavior_rubric=f"Agent MUST deduce or output this answer: {item.get(a_col, 'Unknown')}" + expected_behavior_rubric=f"Agent MUST deduce or output this answer: {expected_answer}" )) else: print(f"Dataset mappings for {benchmark_name} are currently stubbed.") diff --git a/ui/README.md b/ui/README.md new file mode 100644 index 0000000..3fec255 --- /dev/null +++ b/ui/README.md @@ -0,0 +1,57 @@ + +# EvalMonkey UI + +A professional web interface for running benchmarks, chaos tests, and tracking agent reliability over time. + +## Quick Start + +**Terminal 1 โ€” Backend (FastAPI)** +```bash +cd +cp .env.example .env # add EVAL_MODEL + your LLM API key +uvicorn ui.backend.main:app --reload --port 8080 +``` + +**Terminal 2 โ€” Frontend (Next.js)** +```bash +cd /ui/frontend +npm run dev +``` + +Open **http://localhost:3000** in your browser. + +--- + +## Features + +| Page | Description | +|---|---| +| **Dashboard** | Production Reliability hero, live runs, recent results grid | +| **New Run** | 3-step wizard: agent URL โ†’ benchmark โ†’ configure & launch | +| **Live Run** | SSE-streamed real-time sample results with score rings | +| **History** | Recharts trend lines, reliability per scenario, all-runs table | + +## Architecture + +``` +FastAPI backend โ†’ SQLite (~/.evalmonkey/ui.db) + โ†• REST + SSE +Next.js frontend โ†’ http://localhost:3000 +``` + +The `StorageBackend` ABC in `ui/backend/db.py` makes the storage layer swappable โ€” replace `SQLiteBackend` with `PostgresBackend` in a single line. 
+ +## Extending Storage +```python +# In ui/backend/db.py โ€” implement this ABC: +class MyBackend(StorageBackend): + def save_run(self, run: RunRecord) -> None: ... + # ... 5 other methods + +# Then in your app startup: +from ui.backend.db import set_backend +set_backend(MyBackend()) +``` + +## CLI โ€” No Impact +The existing `evalmonkey` CLI continues to work exactly as before. The UI is a completely additive layer โ€” it imports from the same `evalmonkey.*` packages but adds no changes to them. diff --git a/ui/backend/__init__.py b/ui/backend/__init__.py new file mode 100644 index 0000000..f907ee5 --- /dev/null +++ b/ui/backend/__init__.py @@ -0,0 +1 @@ +# EvalMonkey UI Backend diff --git a/ui/backend/db.py b/ui/backend/db.py new file mode 100644 index 0000000..68dc816 --- /dev/null +++ b/ui/backend/db.py @@ -0,0 +1,176 @@ +"""Abstract storage backend + SQLite implementation for EvalMonkey UI. + +Swap to a different backend (Postgres, Redis, etc.) by implementing +StorageBackend and passing your instance to set_backend(). +""" +from __future__ import annotations + +import json +import sqlite3 +from abc import ABC, abstractmethod +from contextlib import contextmanager +from datetime import datetime +from pathlib import Path +from typing import List, Optional + +from .schemas import RunRecord, SampleResult + +DB_PATH = Path.home() / ".evalmonkey" / "ui.db" + + +class StorageBackend(ABC): + """Abstract interface โ€” implement this to swap storage layers.""" + + @abstractmethod + def save_run(self, run: RunRecord) -> None: ... + + @abstractmethod + def update_run(self, run_id: str, **kwargs) -> None: ... + + @abstractmethod + def get_run(self, run_id: str) -> Optional[RunRecord]: ... + + @abstractmethod + def get_all_runs(self, limit: int = 100) -> List[RunRecord]: ... + + @abstractmethod + def save_sample(self, sample: SampleResult) -> None: ... + + @abstractmethod + def get_samples(self, run_id: str) -> List[SampleResult]: ... + + +class SQLiteBackend(StorageBackend): + """SQLite-backed storage. 
Data lives at ~/.evalmonkey/ui.db โ€” zero setup.""" + + def __init__(self, db_path: Path = DB_PATH) -> None: + self.db_path = db_path + self.db_path.parent.mkdir(parents=True, exist_ok=True) + self._init_schema() + + @contextmanager + def _conn(self): + conn = sqlite3.connect(str(self.db_path), check_same_thread=False) + conn.row_factory = sqlite3.Row + try: + yield conn + conn.commit() + finally: + conn.close() + + def _init_schema(self) -> None: + with self._conn() as conn: + conn.executescript(""" + CREATE TABLE IF NOT EXISTS runs ( + id TEXT PRIMARY KEY, + scenario TEXT NOT NULL, + run_type TEXT NOT NULL, + status TEXT NOT NULL DEFAULT 'running', + target_url TEXT NOT NULL, + eval_model TEXT NOT NULL, + request_key TEXT NOT NULL DEFAULT 'question', + response_path TEXT NOT NULL DEFAULT 'data', + chaos_profile TEXT, + score INTEGER, + sample_count INTEGER DEFAULT 0, + completed_samples INTEGER DEFAULT 0, + created_at TEXT NOT NULL, + completed_at TEXT, + details TEXT DEFAULT '{}' + ); + CREATE TABLE IF NOT EXISTS sample_results ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + run_id TEXT NOT NULL, + sample_index INTEGER NOT NULL, + eval_id TEXT NOT NULL, + question TEXT NOT NULL, + agent_output TEXT, + expected_rubric TEXT, + score INTEGER, + reasoning TEXT, + chaos_profile TEXT, + created_at TEXT NOT NULL, + FOREIGN KEY (run_id) REFERENCES runs(id) + ); + """) + + def save_run(self, run: RunRecord) -> None: + with self._conn() as conn: + conn.execute( + """INSERT INTO runs + (id, scenario, run_type, status, target_url, eval_model, + request_key, response_path, chaos_profile, score, sample_count, + completed_samples, created_at, completed_at, details) + VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)""", + ( + run.id, run.scenario, run.run_type, run.status, run.target_url, + run.eval_model, run.request_key, run.response_path, run.chaos_profile, + run.score, run.sample_count, run.completed_samples, + run.created_at, run.completed_at, json.dumps(run.details), + ), + ) + + def update_run(self, run_id: str, **kwargs) -> None: + if not kwargs: + return + if "details" in kwargs: + kwargs["details"] = json.dumps(kwargs["details"]) + sets = ", ".join(f"{k} = ?" for k in kwargs) + values = list(kwargs.values()) + [run_id] + with self._conn() as conn: + conn.execute(f"UPDATE runs SET {sets} WHERE id = ?", values) + + def get_run(self, run_id: str) -> Optional[RunRecord]: + with self._conn() as conn: + row = conn.execute("SELECT * FROM runs WHERE id = ?", (run_id,)).fetchone() + return self._row_to_run(dict(row)) if row else None + + def get_all_runs(self, limit: int = 100) -> List[RunRecord]: + with self._conn() as conn: + rows = conn.execute( + "SELECT * FROM runs ORDER BY created_at DESC LIMIT ?", (limit,) + ).fetchall() + return [self._row_to_run(dict(r)) for r in rows] + + def save_sample(self, sample: SampleResult) -> None: + with self._conn() as conn: + conn.execute( + """INSERT INTO sample_results + (run_id, sample_index, eval_id, question, agent_output, + expected_rubric, score, reasoning, chaos_profile, created_at) + VALUES (?,?,?,?,?,?,?,?,?,?)""", + ( + sample.run_id, sample.sample_index, sample.eval_id, + sample.question, sample.agent_output, sample.expected_rubric, + sample.score, sample.reasoning, sample.chaos_profile, sample.created_at, + ), + ) + + def get_samples(self, run_id: str) -> List[SampleResult]: + with self._conn() as conn: + rows = conn.execute( + "SELECT * FROM sample_results WHERE run_id = ? 
ORDER BY sample_index", + (run_id,), + ).fetchall() + return [SampleResult(**dict(r)) for r in rows] + + def _row_to_run(self, d: dict) -> RunRecord: + d["details"] = json.loads(d.get("details") or "{}") + return RunRecord(**d) + + +# โ”€โ”€ Singleton accessor โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +_backend: Optional[StorageBackend] = None + + +def get_backend() -> StorageBackend: + global _backend + if _backend is None: + _backend = SQLiteBackend() + return _backend + + +def set_backend(backend: StorageBackend) -> None: + """Replace the default SQLite backend (e.g. for tests or Postgres).""" + global _backend + _backend = backend diff --git a/ui/backend/main.py b/ui/backend/main.py new file mode 100644 index 0000000..61ffc7f --- /dev/null +++ b/ui/backend/main.py @@ -0,0 +1,245 @@ +"""EvalMonkey UI โ€” FastAPI backend. + +Run with: + cd + uvicorn ui.backend.main:app --reload --port 8080 +""" +from __future__ import annotations + +# Load .env automatically โ€” EVAL_MODEL and all LLM API keys must be +# available before any evalmonkey modules are imported. +import os +try: + from dotenv import load_dotenv + load_dotenv() +except ImportError: + pass # python-dotenv not installed; user must export vars manually + + +import asyncio +import json +from datetime import datetime +from typing import AsyncGenerator, List, Optional + +from fastapi import BackgroundTasks, FastAPI, HTTPException +from fastapi.middleware.cors import CORSMiddleware +from fastapi.responses import StreamingResponse + +from evalmonkey.reporting.history import calculate_production_reliability, get_history +from evalmonkey.scenarios.standard_benchmarks import SUPPORTED_BENCHMARKS + +from .db import get_backend +from .run_engine import execute_run, get_queue +from .schemas import ( + BenchmarkInfo, + RunRecord, + RunSummary, + StartBenchmarkRequest, + StartChaosRequest, +) + +app = FastAPI(title="EvalMonkey UI API", version="0.1.0") + +app.add_middleware( + CORSMiddleware, + allow_origins=["http://localhost:3000", "http://127.0.0.1:3000"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + + +# โ”€โ”€ Config (exposes env-based defaults to the frontend) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + +@app.get("/api/config") +def get_config(): + """Return environment-detected defaults so the UI can pre-select the right judge model.""" + return { + "default_eval_model": os.getenv("EVAL_MODEL", "gpt-4o"), + "has_bedrock_key": bool(os.getenv("BEDROCK_API_KEY")), + "has_openai_key": bool(os.getenv("OPENAI_API_KEY")), + "has_anthropic_key": bool(os.getenv("ANTHROPIC_API_KEY")), + } + + +# โ”€โ”€ Benchmarks โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + +@app.get("/api/benchmarks", response_model=List[BenchmarkInfo]) +def list_benchmarks(): + return [ + BenchmarkInfo(id=k, description=v["description"], category=v["agent_category"]) + for k, v in SUPPORTED_BENCHMARKS.items() + ] + + +# โ”€โ”€ Runs โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + +@app.get("/api/runs", response_model=List[RunSummary]) +def list_runs(limit: int = 50): + runs = 
get_backend().get_all_runs(limit=limit) + return [RunSummary(**r.model_dump()) for r in runs] + + +@app.get("/api/runs/{run_id}", response_model=RunSummary) +def get_run(run_id: str): + run = get_backend().get_run(run_id) + if not run: + raise HTTPException(status_code=404, detail="Run not found") + return RunSummary(**run.model_dump()) + + +@app.get("/api/runs/{run_id}/samples") +def get_run_samples(run_id: str): + samples = get_backend().get_samples(run_id) + return [s.model_dump() for s in samples] + + +# โ”€โ”€ Start runs โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + +@app.post("/api/run/benchmark") +async def start_benchmark(req: StartBenchmarkRequest, background_tasks: BackgroundTasks): + run = RunRecord( + scenario=req.scenario, + run_type="baseline", + target_url=req.target_url if not req.use_sample_agent else "http://127.0.0.1:8001/solve", + eval_model=req.eval_model, + request_key=req.request_key, + response_path=req.response_path, + sample_count=req.limit, + ) + get_backend().save_run(run) + # Initialize queue before background task + get_queue(run.id) + background_tasks.add_task(_run_benchmark_task, run.id, req) + return {"run_id": run.id, "status": "started"} + + +@app.post("/api/run/chaos") +async def start_chaos(req: StartChaosRequest, background_tasks: BackgroundTasks): + run = RunRecord( + scenario=req.scenario, + run_type="chaos", + target_url=req.target_url if not req.use_sample_agent else "http://127.0.0.1:8001/solve", + eval_model=req.eval_model, + request_key=req.request_key, + response_path=req.response_path, + chaos_profile=req.chaos_profile, + sample_count=req.limit, + ) + get_backend().save_run(run) + get_queue(run.id) + background_tasks.add_task(_run_chaos_task, run.id, req) + return {"run_id": run.id, "status": "started"} + + +async def _run_benchmark_task(run_id: str, req: StartBenchmarkRequest): + await execute_run(run_id, req, chaos_profile=None) + + +async def _run_chaos_task(run_id: str, req: StartChaosRequest): + await execute_run(run_id, req, chaos_profile=req.chaos_profile) + + +# โ”€โ”€ SSE Stream โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + +@app.get("/api/run/{run_id}/stream") +async def stream_run(run_id: str): + run = get_backend().get_run(run_id) + if not run: + raise HTTPException(status_code=404, detail="Run not found") + return StreamingResponse( + _event_generator(run_id, run.status), + media_type="text/event-stream", + headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no", "Connection": "keep-alive"}, + ) + + +async def _event_generator(run_id: str, initial_status: str) -> AsyncGenerator[str, None]: + # Flush already-stored samples immediately (handles page reload / late connect) + existing = get_backend().get_samples(run_id) + scores_so_far = [] + for s in existing: + scores_so_far.append(s.score or 0) + current_avg = int(sum(scores_so_far) / len(scores_so_far)) + event = { + "type": "sample", + "index": s.sample_index, + "eval_id": s.eval_id, + "question": s.question, + "agent_output": s.agent_output, + "expected_rubric": s.expected_rubric, + "score": s.score, + "reasoning": s.reasoning, + "current_score": current_avg, + } + yield f"data: {json.dumps(event)}\n\n" + + # If already finished, send complete event and stop + if initial_status 
in ("completed", "failed"): + run = get_backend().get_run(run_id) + if run and run.status == "completed": + yield f"data: {json.dumps({'type': 'complete', 'final_score': run.score, 'failure_count': run.details.get('failure_count', 0)})}\n\n" + elif run and run.status == "failed": + yield f"data: {json.dumps({'type': 'error', 'message': run.details.get('error', 'Run failed')})}\n\n" + return + + # Otherwise drain the live queue + queue = get_queue(run_id) + already_seen = {s.sample_index for s in existing} + + while True: + try: + event = await asyncio.wait_for(queue.get(), timeout=60.0) + # Skip sample events we already sent from DB + if event.get("type") == "sample" and event.get("index") in already_seen: + continue + yield f"data: {json.dumps(event)}\n\n" + if event.get("type") in ("complete", "error"): + break + except asyncio.TimeoutError: + # Send heartbeat to keep connection alive + yield ": heartbeat\n\n" + + +# โ”€โ”€ History & Reliability โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + +@app.get("/api/history") +def get_all_history(): + """Get score history for all scenarios from the CLI history file.""" + from evalmonkey.reporting.history import get_history as _get_history + return _get_history() + + +@app.get("/api/history/{scenario}") +def get_scenario_history(scenario: str): + return get_history(scenario) + + +@app.get("/api/reliability") +def get_reliability(): + """Get production reliability for all scenarios that have history.""" + history = get_history() + scenarios = {h["scenario"] for h in history} + result = {} + for s in scenarios: + result[s] = { + "reliability": calculate_production_reliability(s), + "baseline_count": sum(1 for h in history if h["scenario"] == s and h["run_type"] == "baseline"), + "chaos_count": sum(1 for h in history if h["scenario"] == s and h["run_type"] == "chaos"), + } + return result + + +@app.get("/api/reliability/{scenario}") +def get_scenario_reliability(scenario: str): + return { + "scenario": scenario, + "reliability": calculate_production_reliability(scenario), + } + + +# โ”€โ”€ Health โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + +@app.get("/api/health") +def health(): + return {"status": "ok", "version": "0.1.0"} diff --git a/ui/backend/run_engine.py b/ui/backend/run_engine.py new file mode 100644 index 0000000..7d231be --- /dev/null +++ b/ui/backend/run_engine.py @@ -0,0 +1,188 @@ +"""Run engine โ€” orchestrates benchmark/chaos runs asynchronously. + +This mirrors the CLI logic in scripts/cli.py but is designed to be +called from the FastAPI backend. Results are persisted to SQLite and +pushed to a per-run asyncio.Queue for SSE streaming to the frontend. 
+""" +from __future__ import annotations + +import asyncio +import subprocess +import time +from datetime import datetime +from typing import Dict, Optional + +from evalmonkey.evals.asset_generator import EvalAssetGenerator, FailingTrace, build_output_dir +from evalmonkey.evals.local_assets import load_local_evals +from evalmonkey.evals.runner import LLMJudgeProvider +from evalmonkey.reporting.history import record_run +from evalmonkey.scenarios.standard_benchmarks import load_standard_benchmark +from evalmonkey.simulator.load_gen import LoadGenerator + +from .db import get_backend +from .schemas import RunRecord, SampleResult, StartBenchmarkRequest, StartChaosRequest + +# โ”€โ”€ Per-run event queues (run_id โ†’ asyncio.Queue) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +_run_queues: Dict[str, asyncio.Queue] = {} + + +def get_queue(run_id: str) -> asyncio.Queue: + if run_id not in _run_queues: + _run_queues[run_id] = asyncio.Queue() + return _run_queues[run_id] + + +def cleanup_queue(run_id: str) -> None: + _run_queues.pop(run_id, None) + + +# โ”€โ”€ Sample agent helpers โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +def _start_sample_agent(name: str): + if name == "rag_app": + import os + env = os.environ.copy() + env["PYTHONPATH"] = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) + proc = subprocess.Popen( + ["python3.11", "apps/rag_app/app.py"], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + env=env, + ) + time.sleep(3) + return proc, "http://127.0.0.1:8001/solve" + return None, None + + +# โ”€โ”€ Core run coroutine โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +async def execute_run(run_id: str, req: StartBenchmarkRequest, chaos_profile: Optional[str] = None) -> None: + """Background coroutine that runs a full benchmark and streams events.""" + db = get_backend() + queue = get_queue(run_id) + agent_process = None + + try: + # โ”€โ”€ Resolve target URL โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + effective_url = req.target_url + if req.use_sample_agent: + agent_process, effective_url = await asyncio.to_thread(_start_sample_agent, "rag_app") + if not effective_url: + raise ValueError("Failed to start sample agent") + + await queue.put({"type": "status", "message": f"Loading {req.scenario} benchmark..."}) + + # โ”€โ”€ Load benchmark scenarios โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + scenarios = await asyncio.to_thread(load_standard_benchmark, req.scenario, req.limit) + + if not scenarios: + # Try local evals + evals = load_local_evals("custom_evals.yaml") + target = next((e for e in evals if e.id == req.scenario), None) + if target: + scenarios = [target] + + if not scenarios: + raise ValueError(f"No scenarios found for benchmark: {req.scenario}") + + sample_count = len(scenarios) + db.update_run(run_id, sample_count=sample_count) + await queue.put({"type": "start", "sample_count": sample_count}) + + # โ”€โ”€ Run each sample โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + generator = 
LoadGenerator(effective_url, request_key=req.request_key, response_path=req.response_path) + judge = LLMJudgeProvider(model_name=req.eval_model) + asset_gen = EvalAssetGenerator(model_name=req.eval_model) + scores = [] + first_reasoning = "" + + for idx, eval_task in enumerate(scenarios): + await queue.put({"type": "progress", "index": idx, "total": sample_count, "message": f"Running sample {idx + 1}/{sample_count}..."}) + + # Fire request to agent + resp = await generator.run_scenario(req.scenario, eval_task.input_payload, chaos_profile=chaos_profile) + agent_output = str(resp.get("data", resp.get("error_message", "No output"))) + + # Score with LLM judge (sync โ†’ thread) + evaluation = await asyncio.to_thread(judge.score_run, eval_task.expected_behavior_rubric, agent_output) + score = evaluation.get("score", 0) + reasoning = evaluation.get("reasoning", "") + scores.append(score) + if idx == 0: + first_reasoning = reasoning + + # Persist sample result + sample = SampleResult( + run_id=run_id, + sample_index=idx, + eval_id=eval_task.id, + question=str(eval_task.input_payload.get("question", str(eval_task.input_payload)))[:2000], + agent_output=agent_output[:2000], + expected_rubric=eval_task.expected_behavior_rubric[:1000], + score=score, + reasoning=reasoning[:1000], + chaos_profile=chaos_profile, + ) + db.save_sample(sample) + db.update_run(run_id, completed_samples=idx + 1) + + # Record failure for asset generation + asset_gen.record_failure(FailingTrace( + scenario=req.scenario, + eval_id=eval_task.id, + input_payload=eval_task.input_payload, + agent_output=agent_output, + expected_rubric=eval_task.expected_behavior_rubric, + score=score, + reasoning=reasoning, + chaos_profile=chaos_profile, + )) + + current_avg = int(sum(scores) / len(scores)) + await queue.put({ + "type": "sample", + "index": idx, + "eval_id": eval_task.id, + "question": sample.question, + "agent_output": sample.agent_output, + "expected_rubric": sample.expected_rubric, + "score": score, + "reasoning": reasoning, + "current_score": current_avg, + }) + + # โ”€โ”€ Finalize โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + final_score = int(sum(scores) / len(scores)) if scores else 0 + run_type = "chaos" if chaos_profile else "baseline" + + # Persist to CLI history for continuity + record_run(req.scenario, run_type, final_score, details={"reasoning": first_reasoning, "sample_size": len(scores)}) + + # Save failure assets if needed + output_path = None + if asset_gen.has_failures: + output_dir = build_output_dir(req.scenario if not chaos_profile else f"{req.scenario}_{chaos_profile}") + output_path = await asyncio.to_thread(asset_gen.save, output_dir) + + db.update_run( + run_id, + status="completed", + score=final_score, + completed_at=datetime.now().isoformat(), + details={"reasoning": first_reasoning, "output_path": output_path, "failure_count": asset_gen.failure_count}, + ) + + await queue.put({ + "type": "complete", + "final_score": final_score, + "failure_count": asset_gen.failure_count, + "output_path": output_path, + }) + + except Exception as e: + db.update_run(run_id, status="failed", completed_at=datetime.now().isoformat(), details={"error": str(e)}) + await queue.put({"type": "error", "message": str(e)}) + finally: + if agent_process: + agent_process.terminate() + # Keep queue alive briefly for final reads then clean up + await asyncio.sleep(30) + cleanup_queue(run_id) diff --git 
a/ui/backend/schemas.py b/ui/backend/schemas.py new file mode 100644 index 0000000..392f538 --- /dev/null +++ b/ui/backend/schemas.py @@ -0,0 +1,75 @@ +"""Pydantic schemas for EvalMonkey UI API.""" +from __future__ import annotations + +import uuid +from datetime import datetime +from typing import Any, Dict, List, Optional + +from pydantic import BaseModel, Field + + +class RunRecord(BaseModel): + id: str = Field(default_factory=lambda: str(uuid.uuid4())) + scenario: str + run_type: str # 'baseline' | 'chaos' + status: str = "running" # 'running' | 'completed' | 'failed' + target_url: str + eval_model: str = "gpt-4o" + request_key: str = "question" + response_path: str = "data" + chaos_profile: Optional[str] = None + score: Optional[int] = None + sample_count: int = 0 + completed_samples: int = 0 + created_at: str = Field(default_factory=lambda: datetime.now().isoformat()) + completed_at: Optional[str] = None + details: Dict[str, Any] = {} + + +class SampleResult(BaseModel): + id: Optional[int] = None + run_id: str + sample_index: int + eval_id: str + question: str + agent_output: Optional[str] = None + expected_rubric: Optional[str] = None + score: Optional[int] = None + reasoning: Optional[str] = None + chaos_profile: Optional[str] = None + created_at: str = Field(default_factory=lambda: datetime.now().isoformat()) + + +class StartBenchmarkRequest(BaseModel): + scenario: str + target_url: str + eval_model: str = "gpt-4o" + request_key: str = "question" + response_path: str = "data" + limit: int = 5 + use_sample_agent: bool = False + + +class StartChaosRequest(StartBenchmarkRequest): + chaos_profile: str + + +class BenchmarkInfo(BaseModel): + id: str + description: str + category: str + + +class RunSummary(BaseModel): + id: str + scenario: str + run_type: str + status: str + score: Optional[int] + sample_count: int + completed_samples: int + eval_model: str + chaos_profile: Optional[str] + created_at: str + completed_at: Optional[str] + target_url: str diff --git a/ui/frontend/app/globals.css b/ui/frontend/app/globals.css new file mode 100644 index 0000000..2588b91 --- /dev/null +++ b/ui/frontend/app/globals.css @@ -0,0 +1,200 @@ +@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&family=JetBrains+Mono:wght@400;500&display=swap'); + +@tailwind base; +@tailwind components; +@tailwind utilities; + +:root { + --bg: #0a0a0a; + --surface: #111111; + --surface-2: #161616; + --surface-3: #1c1c1c; + --border: #222222; + --border-strong: #333333; + --text-1: #ffffff; + --text-2: #a1a1a1; + --text-3: #555555; + --accent: #22c55e; + --accent-dim: rgba(34, 197, 94, 0.10); + --accent-border: rgba(34, 197, 94, 0.25); + --red: #ef4444; + --amber: #f59e0b; + --radius: 6px; +} + +* { box-sizing: border-box; margin: 0; padding: 0; } + +html, body { + background: var(--bg); + color: var(--text-1); + font-family: 'Inter', system-ui, sans-serif; + font-size: 14px; + line-height: 1.5; + -webkit-font-smoothing: antialiased; + -moz-osx-font-smoothing: grayscale; +} + +/* Base card */ +.card { + background: var(--surface); + border: 1px solid var(--border); + border-radius: var(--radius); +} + +.card-hover:hover { + background: var(--surface-2); + border-color: var(--border-strong); +} + +/* Monospace for IDs, code, endpoints */ +.mono { font-family: 'JetBrains Mono', 'Fira Code', monospace; } + +/* Score colors */ +.score-red { color: #ef4444; } +.score-amber { color: #f59e0b; } +.score-green { color: #22c55e; } +.score-emerald{ color: #4ade80; } + +/* Scrollbar */ 
+::-webkit-scrollbar { width: 5px; height: 5px; } +::-webkit-scrollbar-track { background: transparent; } +::-webkit-scrollbar-thumb { background: #2a2a2a; border-radius: 3px; } +::-webkit-scrollbar-thumb:hover { background: #333; } + +/* Row animation โ€” subtle fade in */ +@keyframes fadeUp { + from { opacity: 0; transform: translateY(4px); } + to { opacity: 1; transform: translateY(0); } +} +.row-animate { animation: fadeUp 0.2s ease forwards; } + +/* Focus ring */ +:focus-visible { + outline: 1.5px solid var(--accent); + outline-offset: 2px; +} + +/* Input base */ +input, select { + background: var(--surface-2); + border: 1px solid var(--border); + border-radius: var(--radius); + color: var(--text-1); + padding: 8px 12px; + font-family: inherit; + font-size: 13px; + width: 100%; + transition: border-color 0.15s; +} +input:focus, select:focus { + outline: none; + border-color: var(--border-strong); +} +input::placeholder { color: var(--text-3); } + +/* Button base */ +.btn { + display: inline-flex; + align-items: center; + justify-content: center; + gap: 6px; + font-size: 13px; + font-weight: 500; + padding: 7px 14px; + border-radius: var(--radius); + border: 1px solid var(--border); + background: var(--surface-2); + color: var(--text-2); + cursor: pointer; + transition: background 0.15s, border-color 0.15s, color 0.15s; + white-space: nowrap; + font-family: inherit; +} +.btn:hover { + background: var(--surface-3); + border-color: var(--border-strong); + color: var(--text-1); +} +.btn-primary { + background: var(--accent); + border-color: var(--accent); + color: #000; + font-weight: 600; +} +.btn-primary:hover { + background: #16a34a; + border-color: #16a34a; + color: #000; +} +.btn-danger { + background: rgba(239, 68, 68, 0.12); + border-color: rgba(239, 68, 68, 0.3); + color: #ef4444; +} +.btn-danger:hover { + background: rgba(239, 68, 68, 0.2); + color: #f87171; +} +.btn:disabled { + opacity: 0.4; + cursor: not-allowed; +} + +/* Tag / badge */ +.tag { + display: inline-flex; + align-items: center; + gap: 4px; + font-size: 11px; + font-weight: 500; + padding: 2px 8px; + border-radius: 4px; + border: 1px solid var(--border); + color: var(--text-2); + background: var(--surface-2); + white-space: nowrap; +} + +/* Status dot */ +.dot-live { + width: 6px; height: 6px; + border-radius: 50%; + background: var(--accent); + animation: pulse 1.8s ease-in-out infinite; + flex-shrink: 0; +} +@keyframes pulse { + 0%, 100% { opacity: 1; } + 50% { opacity: 0.4; } +} + +/* Divider */ +hr { border: none; border-top: 1px solid var(--border); } + +/* Score ring transition */ +.score-ring-fill { + transition: stroke-dashoffset 0.7s cubic-bezier(0.4, 0, 0.2, 1); +} + +/* Toggle */ +.toggle { + width: 36px; height: 20px; + border-radius: 10px; + background: var(--surface-3); + border: 1px solid var(--border-strong); + position: relative; + cursor: pointer; + transition: background 0.15s, border-color 0.15s; + flex-shrink: 0; +} +.toggle.on { background: var(--accent); border-color: var(--accent); } +.toggle::after { + content: ''; + position: absolute; + top: 2px; left: 2px; + width: 14px; height: 14px; + border-radius: 50%; + background: #fff; + transition: left 0.15s; +} +.toggle.on::after { left: 18px; background: #000; } diff --git a/ui/frontend/app/history/page.tsx b/ui/frontend/app/history/page.tsx new file mode 100644 index 0000000..e65e998 --- /dev/null +++ b/ui/frontend/app/history/page.tsx @@ -0,0 +1,194 @@ +'use client' +import { useEffect, useState } from 'react' +import Link from 'next/link' 
+import { api } from '@/lib/api' +import { HistoryEntry, RunSummary, scoreColor } from '@/lib/types' +import { SUPPORTED_BENCHMARK_CATEGORIES } from '@/lib/benchmarks' +import { LineChart, Line, XAxis, YAxis, Tooltip, ResponsiveContainer, CartesianGrid, ReferenceLine } from 'recharts' +import { Plus } from 'lucide-react' + +const CustomTooltip = ({ active, payload }: any) => { + if (!active || !payload?.length) return null + return ( +
+
{new Date(payload[0]?.payload?.date).toLocaleDateString()}
+ {payload.map((p: any) => ( +
+ {p.value}/100 +
+ ))} +
+ ) +} + +export default function HistoryPage() { + const [runs, setRuns] = useState([]) + const [history, setHistory] = useState([]) + const [reliability, setReliability] = useState>({}) + const [selectedScenario, setSelectedScenario] = useState(null) + const [loading, setLoading] = useState(true) + + useEffect(() => { + Promise.all([api.listRuns(100), api.getHistory(), api.getReliability()]) + .then(([r, h, rel]) => { setRuns(r); setHistory(h); setReliability(rel) }) + .finally(() => setLoading(false)) + }, []) + + const completed = runs.filter(r => r.status === 'completed') + const scenarios = Array.from(new Set(completed.map(r => r.scenario))) + const displayScenario = selectedScenario ?? scenarios[0] ?? null + + const chartData = history + .filter(h => h.scenario === displayScenario) + .sort((a, b) => a.timestamp.localeCompare(b.timestamp)) + .map(h => ({ date: h.timestamp, score: h.score, type: h.run_type })) + + return ( +
+
+
+

History

+

Reliability trends over time

+
+ + New Run + +
+ + {loading ? ( +
+ {[...Array(4)].map((_, i) =>
)} +
+ ) : completed.length === 0 ? ( +
+
๐Ÿ“Š
+
No history yet
+
Complete your first run to see trends here
+ Run a Benchmark +
+ ) : ( + <> + {/* Scenario reliability tiles */} +
+
RELIABILITY BY SCENARIO
+
+ {scenarios.map(s => { + const rel = reliability[s] + const score = rel ? Math.round(rel.reliability) : null + const active = displayScenario === s + return ( + + ) + })} +
+
+ + {/* Chart */} + {displayScenario && chartData.length > 0 && ( +
+
+
+
{displayScenario}
+
Score over time
+
+ {reliability[displayScenario] && ( +
+
Reliability
+
+ {Math.round(reliability[displayScenario].reliability)} +
+
+ )} +
+ + + + new Date(d).toLocaleDateString('en-US', { month: 'short', day: 'numeric' })} + tick={{ fill: '#444', fontSize: 10 }} + axisLine={false} tickLine={false} + /> + + + } /> + + + +
+ โ€” 70% minimum threshold +
+
+ )} + + {/* Runs table */} +
+
ALL RUNS
+
+ + + + {['Scenario', 'Type', 'Score', 'Samples', 'Judge', 'Date', ''].map(h => ( + + ))} + + + + {completed.map((r, i) => ( + + + + + + + + + + ))} + +
{h}
{r.scenario} + + {r.run_type} + + + + {r.score ?? 'โ€”'} + + /100 + {r.sample_count}{r.eval_model.split('/').pop()} + {new Date(r.created_at).toLocaleDateString('en-US', { month: 'short', day: 'numeric', hour: '2-digit', minute: '2-digit' })} + + + Details โ†’ + +
+
+
+ + )} +
+ ) +} diff --git a/ui/frontend/app/layout.tsx b/ui/frontend/app/layout.tsx new file mode 100644 index 0000000..8503132 --- /dev/null +++ b/ui/frontend/app/layout.tsx @@ -0,0 +1,21 @@ +import type { Metadata } from 'next' +import './globals.css' +import Sidebar from '@/components/Sidebar' + +export const metadata: Metadata = { + title: 'EvalMonkey โ€” Agent Benchmarking', + description: 'Run standard benchmarks and chaos tests against your AI agents. Track reliability over time.', +} + +export default function RootLayout({ children }: { children: React.ReactNode }) { + return ( + + + +
+ {children} +
+ + + ) +} diff --git a/ui/frontend/app/page.tsx b/ui/frontend/app/page.tsx new file mode 100644 index 0000000..a23efcc --- /dev/null +++ b/ui/frontend/app/page.tsx @@ -0,0 +1,142 @@ +'use client' +import { useEffect, useState } from 'react' +import Link from 'next/link' +import { api } from '@/lib/api' +import { RunSummary, scoreColor } from '@/lib/types' +import RunCard from '@/components/RunCard' +import ScoreRing from '@/components/ScoreRing' +import { Plus, RefreshCw } from 'lucide-react' + +export default function DashboardPage() { + const [runs, setRuns] = useState([]) + const [reliability, setReliability] = useState>({}) + const [loading, setLoading] = useState(true) + + const refresh = async () => { + try { + const [runsData, relData] = await Promise.all([api.listRuns(30), api.getReliability()]) + setRuns(runsData) + setReliability(relData) + } catch {} + setLoading(false) + } + + useEffect(() => { + refresh() + const iv = setInterval(refresh, 5000) + return () => clearInterval(iv) + }, []) + + const overall = Object.values(reliability).length + ? Math.round(Object.values(reliability).reduce((s, v) => s + v.reliability, 0) / Object.values(reliability).length) + : null + + const running = runs.filter(r => r.status === 'running') + const completed = runs.filter(r => r.status === 'completed') + const avgScore = completed.length + ? Math.round(completed.reduce((s, r) => s + (r.score ?? 0), 0) / completed.length) + : null + + return ( +
+ {/* Page header */} +
+
+

Dashboard

+

Agent benchmark scores & reliability

+
+
+ + + New Run + +
+
+ + {/* Stat cards */} +
+ {/* Reliability */} +
+
PRODUCTION RELIABILITY
+
+ +
+
+ {overall !== null ? overall : 'โ€”'}/100 +
+
60% baseline ยท 40% chaos
+
+
+
+ + {/* Avg score */} +
+
AVG SCORE
+
+ {avgScore !== null ? avgScore : 'โ€”'} + /100 +
+
{completed.length} completed
+
+ + {/* Active */} +
+
ACTIVE RUNS
+
+
{running.length}
+ {running.length > 0 && } +
+
in progress
+
+
+ + {/* Live runs */} + {running.length > 0 && ( +
+
+ + LIVE RUNS +
+
+ {running.map(r => )} +
+
+ )} + + {/* Recent runs */} +
+
+ RECENT RUNS + + All history โ†’ + +
+ + {loading ? ( +
+ {[...Array(6)].map((_, i) => ( +
+ ))} +
+ ) : completed.length === 0 ? ( +
+
๐Ÿต
+
No runs yet
+
+ Run your first benchmark to track agent reliability +
+ + Run First Benchmark + +
+ ) : ( +
+ {completed.map(r => )} +
+ )} +
+
+ ) +} diff --git a/ui/frontend/app/run/[id]/page.tsx b/ui/frontend/app/run/[id]/page.tsx new file mode 100644 index 0000000..2c52b46 --- /dev/null +++ b/ui/frontend/app/run/[id]/page.tsx @@ -0,0 +1,258 @@ +'use client' +import { useEffect, useRef, useState } from 'react' +import { useParams } from 'next/navigation' +import { api } from '@/lib/api' +import { RunSummary, SampleResult, SSEEvent, scoreColor } from '@/lib/types' +import ScoreRing from '@/components/ScoreRing' +import { SUPPORTED_BENCHMARK_CATEGORIES } from '@/lib/benchmarks' +import { ArrowLeft, CheckCircle, XCircle, Zap } from 'lucide-react' +import Link from 'next/link' + +export default function RunDetailPage() { + const { id } = useParams<{ id: string }>() + const [run, setRun] = useState(null) + const [samples, setSamples] = useState([]) + const [currentScore, setCurrentScore] = useState(null) + const [status, setStatus] = useState('running') + const [statusMsg, setStatusMsg] = useState('Initializing...') + const [selected, setSelected] = useState(null) + const bottomRef = useRef(null) + + useEffect(() => { + api.getRun(id).then(r => { setRun(r); setStatus(r.status) }).catch(() => {}) + }, [id]) + + useEffect(() => { + // Connect directly to the backend โ€” bypasses the Next.js dev-server proxy + // which buffers SSE responses and causes "Waiting for first result..." to hang. + const backendUrl = process.env.NEXT_PUBLIC_BACKEND_URL ?? 'http://localhost:8080' + const es = new EventSource(`${backendUrl}/api/run/${id}/stream`) + es.onmessage = (e) => { + try { + const ev: SSEEvent = JSON.parse(e.data) + if (ev.type === 'status' || ev.type === 'progress') { + setStatusMsg(ev.message ?? '') + } else if (ev.type === 'sample') { + const s: SampleResult = { + id: null, run_id: id, + sample_index: ev.index ?? 0, + eval_id: ev.eval_id ?? '', + question: ev.question ?? '', + agent_output: ev.agent_output ?? null, + expected_rubric: ev.expected_rubric ?? null, + score: ev.score ?? null, + reasoning: ev.reasoning ?? null, + chaos_profile: null, + created_at: new Date().toISOString(), + } + setSamples(prev => prev.find(x => x.sample_index === s.sample_index) ? prev : [...prev, s]) + setCurrentScore(ev.current_score ?? null) + setTimeout(() => bottomRef.current?.scrollIntoView({ behavior: 'smooth' }), 80) + } else if (ev.type === 'complete') { + setStatus('completed'); setCurrentScore(ev.final_score ?? null) + setStatusMsg('Completed') + api.getRun(id).then(r => setRun(r)).catch(() => {}) + es.close() + } else if (ev.type === 'error') { + setStatus('failed'); setStatusMsg(ev.message ?? 'Run failed') + es.close() + } + } catch {} + } + es.onerror = () => { + // On connection error, poll once to get the final run state from DB + api.getRun(id).then(r => { + setRun(r) + setStatus(r.status) + if (r.status === 'failed') setStatusMsg((r as any).details?.error ?? 'Run failed') + if (r.status === 'completed') setStatusMsg('Completed') + }).catch(() => {}) + es.close() + } + return () => es.close() + }, [id]) + + const finalScore = status === 'completed' ? (run?.score ?? currentScore) : currentScore + const total = run?.sample_count ?? 0 + const progress = total > 0 ? (samples.length / total) * 100 : 0 + + return ( +
+ {/* Back */} + + Dashboard + + + {/* Run header card */} +
+
+
+
+ {run?.scenario ?? id} + {run?.run_type === 'chaos' && ( + + chaos + + )} +
+
+ {run?.eval_model ?? 'โ€”'} + {run?.chaos_profile && โšก {run.chaos_profile}} + {run?.target_url} +
+
+
+
+ {finalScore !== null ? finalScore : 'โ€”'} +
+
/ 100
+
+
+ + {/* Progress */} +
+
+
+ {status === 'running' && } + {status === 'completed' && } + {status === 'failed' && } + {statusMsg} +
+ {samples.length}/{total} +
+
+
+
+
+
+ + {/* Sample results */} +
+
+
SAMPLES
+ {samples.length === 0 ? ( +
+ {status === 'failed' ? ( + <> + +
Run failed
+
+ {statusMsg} +
+ Try Again + + ) : ( + <> +
+
{statusMsg || 'Waiting for first result...'}
+ + )} +
+ ) : ( +
+ {samples.map((s, i) => { + const col = scoreColor(s.score) + const open = selected === s.sample_index + return ( +
setSelected(open ? null : s.sample_index)} + > +
+ {/* Index */} +
+ {s.sample_index + 1} +
+ + {/* Question */} +
+
{s.question}
+ {s.agent_output && ( +
+ โ†ณ {s.agent_output} +
+ )} +
+ + {/* Score */} +
+
+ {s.score ?? 'โ€ฆ'} +
+
+
+ + {/* Expanded */} + {open && ( +
+
+
QUESTION
+
{s.question}
+
+
+
AGENT OUTPUT
+
{s.agent_output || '(none)'}
+
+
+
EXPECTED
+
{s.expected_rubric}
+
+
+
JUDGE REASONING
+
{s.reasoning || 'โ€”'}
+
+
+ )} +
+ ) + })} +
+
+ )} +
+ + {/* Breakdown (complete only) */} + {status === 'completed' && samples.length > 0 && ( +
+
BREAKDOWN
+
+ {[ + { label: '90โ€“100', min: 90, max: 101, col: '#4ade80' }, + { label: '75โ€“89', min: 75, max: 90, col: '#22c55e' }, + { label: '50โ€“74', min: 50, max: 75, col: '#f59e0b' }, + { label: '0โ€“49', min: 0, max: 50, col: '#ef4444' }, + ].map(({ label, min, max, col }) => { + const count = samples.filter(s => (s.score ?? 0) >= min && (s.score ?? 0) < max).length + return ( +
+
+ {label} + {count} +
+
+
+
+
+ ) + })} +
+ Run Again + History +
+ )} +
+
+ ) +} diff --git a/ui/frontend/app/run/new/page.tsx b/ui/frontend/app/run/new/page.tsx new file mode 100644 index 0000000..6037cff --- /dev/null +++ b/ui/frontend/app/run/new/page.tsx @@ -0,0 +1,354 @@ +'use client' +import { useEffect, useState } from 'react' +import { useRouter } from 'next/navigation' +import { api } from '@/lib/api' +import { BenchmarkInfo, CATEGORY_COLORS } from '@/lib/types' +import { CHAOS_PROFILES, EVAL_MODELS } from '@/lib/benchmarks' +import { ChevronRight, Zap, Bot, FlaskConical } from 'lucide-react' + +type Step = 1 | 2 | 3 + +export default function NewRunPage() { + const router = useRouter() + const [step, setStep] = useState(1) + const [benchmarks, setBenchmarks] = useState([]) + const [loading, setLoading] = useState(false) + + const [targetUrl, setTargetUrl] = useState('http://localhost:8000') + const [useSampleAgent, setUseSampleAgent] = useState(false) + const [requestKey, setRequestKey] = useState('question') + const [responsePath, setResponsePath] = useState('data') + const [selectedBenchmark, setSelectedBenchmark] = useState(null) + const [selectedCategory, setSelectedCategory] = useState('All') + const [evalModel, setEvalModel] = useState('gpt-4o') + const [limit, setLimit] = useState(5) + const [enableChaos, setEnableChaos] = useState(false) + const [chaosProfile, setChaosProfile] = useState('client_prompt_injection') + const [error, setError] = useState(null) + + useEffect(() => { + // Auto-select the judge model matching whatever is configured in .env + api.getConfig().then(cfg => setEvalModel(cfg.default_eval_model)).catch(() => {}) + api.listBenchmarks().then(setBenchmarks).catch(() => {}) + }, []) + + const categories = Array.from(new Set(benchmarks.map(b => b.category))) + + const handleLaunch = async () => { + if (!selectedBenchmark) return + setLoading(true); setError(null) + try { + const base = { + scenario: selectedBenchmark, + target_url: useSampleAgent ? 'http://127.0.0.1:8001/solve' : targetUrl, + eval_model: evalModel, request_key: requestKey, + response_path: responsePath, limit, use_sample_agent: useSampleAgent, + } + const result = enableChaos + ? await api.startChaos({ ...base, chaos_profile: chaosProfile }) + : await api.startBenchmark(base) + router.push(`/run/${result.run_id}`) + } catch (e: any) { setError(e.message); setLoading(false) } + } + + const stepLabels = ['Agent Setup', 'Benchmark', 'Configure'] + + return ( +
+
+

New Benchmark Run

+

Configure and launch an evaluation against your agent

+
+ + {/* Step indicator */} +
+ {([1, 2, 3] as Step[]).map((n, i) => ( +
+ + {n < 3 && } +
+ ))} +
+ + {/* Step 1: Agent Setup */} + {step === 1 && ( +
+ {/* Sample agent toggle */} +
{ setUseSampleAgent(!useSampleAgent); setSelectedBenchmark(null); }} + > +
+ +
+
Use Built-in Demo Agent
+
+ Auto-starts the sample RAG app on localhost:8001 +
+
+
+
+
+ + {!useSampleAgent && ( + <> +
+ + setTargetUrl(e.target.value)} + className="mono" + placeholder="http://localhost:8000/solve" + /> +
+
+
+ + setRequestKey(e.target.value)} + className="mono" placeholder="question" /> +

JSON key sent with the question

+
+
+ + setResponsePath(e.target.value)} + className="mono" placeholder="data" /> +

e.g. choices.0.message.content

+
+
+ + )} + + +
+ )} + + {/* Step 2: Benchmark Picker */} + {step === 2 && ( +
+
+

+ Select a standard benchmark dataset: +

+
+
+          {categories.map(cat => (
+
+          ))}
+
+          {categories.map(cat => {
+            if (selectedCategory !== 'All' && selectedCategory !== cat) return null
+
+            // Filter out benchmarks that don't make sense for the demo agent
+            const allowedForDemo = ['gsm8k', 'mmlu', 'truthfulqa', 'toxigen', 'hella-swag', 'winogrande', 'arc']
+            const catBenchmarks = benchmarks.filter(b => b.category === cat && (!useSampleAgent || allowedForDemo.includes(b.id)))
+
+            if (catBenchmarks.length === 0) return null
+
+            return (
+
{cat.toUpperCase()}
+
+ {catBenchmarks.map(b => { + const sel = selectedBenchmark === b.id + return ( + + ) + })} +
+
+ )})} +
+
+ + +
+
+ )} + + {/* Step 3: Configure */} + {step === 3 && ( +
+
+ {/* Judge model */} +
+ +
+ {EVAL_MODELS.map(m => ( + + ))} +
+
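+          {/* The judge model defaults to whatever EVAL_MODEL the backend reads from .env
+              (fetched via api.getConfig above); picking one here only overrides it for this run. */}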
+ + {/* Sample count */} +
+ + setLimit(+e.target.value)} + className="w-full" + style={{ accentColor: '#22c55e' }} + /> +
+ 1 (fast)50 (thorough) +
+
+ + {/* Chaos toggle */} +
+
+
+ + + CHAOS INJECTION + +
+
setEnableChaos(!enableChaos)} + /> +
+ {enableChaos && ( +
+ {CHAOS_PROFILES.map(p => ( + + ))} +
+ )} +
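+            {/* When enabled, the run is started via api.startChaos with the selected
+                chaos_profile, so the same benchmark is executed with that injection applied
+                (see handleLaunch above). */}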
+
+ + {/* Summary */} +
+
+ Benchmark + {selectedBenchmark} +
+
+ Target + {useSampleAgent ? 'sample rag_app' : targetUrl} +
+
+ Judge + {EVAL_MODELS.find(m => m.id === evalModel)?.label} +
+
+ Samples + {limit} +
+ {enableChaos && ( +
+ Chaos + {chaosProfile} +
+ )} +
+ + {error && ( +
+ {error} +
+ )} + +
+ + +
+
+ )} +
+    )
+}
diff --git a/ui/frontend/components/CategoryBadge.tsx b/ui/frontend/components/CategoryBadge.tsx
new file mode 100644
index 0000000..b8b4150
--- /dev/null
+++ b/ui/frontend/components/CategoryBadge.tsx
@@ -0,0 +1,25 @@
+import { CATEGORY_COLORS } from '@/lib/types'
+
+interface Props {
+  category: string
+  size?: 'sm' | 'md'
+}
+
+export default function CategoryBadge({ category, size = 'sm' }: Props) {
+  const c = CATEGORY_COLORS[category] ?? { bg: 'rgba(255,255,255,0.04)', text: '#666', border: '#252525' }
+  return (
+
+      {category}
+
+  )
+}
diff --git a/ui/frontend/components/RunCard.tsx b/ui/frontend/components/RunCard.tsx
new file mode 100644
index 0000000..32409ab
--- /dev/null
+++ b/ui/frontend/components/RunCard.tsx
@@ -0,0 +1,78 @@
+'use client'
+import Link from 'next/link'
+import { RunSummary, scoreColor } from '@/lib/types'
+import ScoreRing from './ScoreRing'
+import { SUPPORTED_BENCHMARK_CATEGORIES } from '@/lib/benchmarks'
+import { Zap } from 'lucide-react'
+
+interface Props { run: RunSummary }
+
+function timeAgo(iso: string): string {
+  const diff = Date.now() - new Date(iso).getTime()
+  const mins = Math.floor(diff / 60000)
+  if (mins < 1) return 'just now'
+  if (mins < 60) return `${mins}m ago`
+  const hrs = Math.floor(mins / 60)
+  if (hrs < 24) return `${hrs}h ago`
+  return `${Math.floor(hrs / 24)}d ago`
+}
+
+export default function RunCard({ run }: Props) {
+  const col = scoreColor(run.status === 'running' ? null : run.score)
+
+  return (
+
+      {/* Header */}
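+      {/* Each card links to the run's detail page (/run/[id]) and shows the scenario, its
+          category, a ScoreRing, a progress bar while the run is still executing, and either
+          the sample count or a failure state in the footer. */}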
+
+
+ {run.scenario} + {run.run_type === 'chaos' && ( + + chaos + + )} +
+ {SUPPORTED_BENCHMARK_CATEGORIES[run.scenario] ?? 'Research'} +
+ +
+ + {/* Progress bar (only when running) */} + {run.status === 'running' && ( +
+
+
+
+
+ )} + + {/* Footer */} +
+ {timeAgo(run.created_at)} + + {run.status === 'running' ? 'โ— running' + : run.status === 'failed' ? 'โœ• failed' + : `${run.sample_count} samples`} + +
+
+    )
+}
diff --git a/ui/frontend/components/ScoreRing.tsx b/ui/frontend/components/ScoreRing.tsx
new file mode 100644
index 0000000..ff2b852
--- /dev/null
+++ b/ui/frontend/components/ScoreRing.tsx
@@ -0,0 +1,59 @@
+'use client'
+import { scoreColor } from '@/lib/types'
+
+interface Props {
+  score: number | null
+  size?: number
+  strokeWidth?: number
+  showLabel?: boolean
+  animate?: boolean
+}
+
+export default function ScoreRing({
+  score,
+  size = 64,
+  strokeWidth = 5,
+  showLabel = true,
+  animate = true,
+}: Props) {
+  const radius = (size - strokeWidth) / 2
+  const circumference = 2 * Math.PI * radius
+  const pct = score !== null ? Math.max(0, Math.min(100, score)) : 0
+  const offset = circumference - (pct / 100) * circumference
+  const color = scoreColor(score)
+
+  return (
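+    // Ring fill: the circle's dash length is the full circumference and its dash offset is
+    // `offset`, so the visible arc covers (pct / 100) of the ring. A score of 75 draws three
+    // quarters of the circle; a null score (pct 0) shows an empty ring and the label falls
+    // back to an em dash.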
+ + {/* Track */} + + {/* Fill */} + + + {showLabel && ( + + {score !== null ? score : 'โ€”'} + + )} +
+    )
+}
diff --git a/ui/frontend/components/Sidebar.tsx b/ui/frontend/components/Sidebar.tsx
new file mode 100644
index 0000000..934fcaf
--- /dev/null
+++ b/ui/frontend/components/Sidebar.tsx
@@ -0,0 +1,57 @@
+'use client'
+import Link from 'next/link'
+import { usePathname } from 'next/navigation'
+import { BarChart3, History, Plus } from 'lucide-react'
+
+const navItems = [
+  { href: '/', label: 'Dashboard', icon: BarChart3 },
+  { href: '/run/new', label: 'New Run', icon: Plus },
+  { href: '/history', label: 'History', icon: History },
+]
+
+export default function Sidebar() {
+  const path = usePathname()
+
+  return (
+
+  )
+}
diff --git a/ui/frontend/next.config.js b/ui/frontend/next.config.js
new file mode 100644
index 0000000..77d993c
--- /dev/null
+++ b/ui/frontend/next.config.js
@@ -0,0 +1,13 @@
+/** @type {import('next').NextConfig} */
+const nextConfig = {
+  async rewrites() {
+    return [
+      {
+        source: '/api/:path*',
+        destination: 'http://localhost:8080/api/:path*',
+      },
+    ]
+  },
+}
+
+module.exports = nextConfig
diff --git a/ui/frontend/package.json b/ui/frontend/package.json
new file mode 100644
index 0000000..03187a8
--- /dev/null
+++ b/ui/frontend/package.json
@@ -0,0 +1,27 @@
+{
+  "name": "evalmonkey-ui",
+  "version": "0.1.0",
+  "private": true,
+  "scripts": {
+    "dev": "next dev",
+    "build": "next build",
+    "start": "next start"
+  },
+  "dependencies": {
+    "next": "14.2.3",
+    "react": "^18",
+    "react-dom": "^18",
+    "recharts": "^2.12.7",
+    "lucide-react": "^0.378.0",
+    "clsx": "^2.1.1"
+  },
+  "devDependencies": {
+    "typescript": "^5",
+    "@types/node": "^20",
+    "@types/react": "^18",
+    "@types/react-dom": "^18",
+    "tailwindcss": "^3.4.1",
+    "postcss": "^8",
+    "autoprefixer": "^10.0.1"
+  }
+}
diff --git a/ui/frontend/postcss.config.js b/ui/frontend/postcss.config.js
new file mode 100644
index 0000000..33ad091
--- /dev/null
+++ b/ui/frontend/postcss.config.js
@@ -0,0 +1,6 @@
+module.exports = {
+  plugins: {
+    tailwindcss: {},
+    autoprefixer: {},
+  },
+}
diff --git a/ui/frontend/tailwind.config.js b/ui/frontend/tailwind.config.js
new file mode 100644
index 0000000..fd55c9d
--- /dev/null
+++ b/ui/frontend/tailwind.config.js
@@ -0,0 +1,44 @@
+/** @type {import('tailwindcss').Config} */
+module.exports = {
+  content: [
+    './pages/**/*.{js,ts,jsx,tsx,mdx}',
+    './components/**/*.{js,ts,jsx,tsx,mdx}',
+    './app/**/*.{js,ts,jsx,tsx,mdx}',
+  ],
+  theme: {
+    extend: {
+      colors: {
+        bg: '#0a0a0a',
+        surface: '#111111',
+        s2: '#161616',
+        s3: '#1c1c1c',
+        border: '#222222',
+        bstrong: '#333333',
+        t1: '#ffffff',
+        t2: '#a1a1a1',
+        t3: '#555555',
+        accent: '#22c55e',
+      },
+      fontFamily: {
+        sans: ['Inter', 'system-ui', 'sans-serif'],
+        mono: ['JetBrains Mono', 'Fira Code', 'monospace'],
+      },
+      borderRadius: {
+        DEFAULT: '6px',
+        sm: '4px',
+        md: '6px',
+        lg: '8px',
+      },
+      animation: {
+        'fade-up': 'fadeUp 0.2s ease forwards',
+      },
+      keyframes: {
+        fadeUp: {
+          from: { opacity: '0', transform: 'translateY(4px)' },
+          to: { opacity: '1', transform: 'translateY(0)' },
+        },
+      },
+    },
+  },
+  plugins: [],
+}
diff --git a/ui/frontend/tsconfig.json b/ui/frontend/tsconfig.json
new file mode 100644
index 0000000..109b22f
--- /dev/null
+++ b/ui/frontend/tsconfig.json
@@ -0,0 +1,21 @@
+{
+  "compilerOptions": {
+    "target": "es5",
+    "lib": ["dom", "dom.iterable", "esnext"],
+    "allowJs": true,
+    "skipLibCheck": true,
+    "strict": true,
+    "noEmit": true,
+    "esModuleInterop": true,
+    "module": "esnext",
+    "moduleResolution": "bundler",
+    "resolveJsonModule": true,
+    "isolatedModules": true,
+    "jsx": "preserve",
+    "incremental": true,
+    "plugins": [{ "name": "next" }],
+    "paths": { "@/*": ["./*"] }
+  },
+  "include": ["next-env.d.ts", "**/*.ts", "**/*.tsx", ".next/types/**/*.ts"],
+  "exclude": ["node_modules"]
+}
diff --git a/ui/requirements-ui.txt b/ui/requirements-ui.txt
new file mode 100644
index 0000000..7c608d9
--- /dev/null
+++ b/ui/requirements-ui.txt
@@ -0,0 +1,2 @@
+fastapi>=0.110.0
+uvicorn[standard]>=0.29.0