diff --git a/README.md b/README.md
index 150fedde..44214e96 100644
--- a/README.md
+++ b/README.md
@@ -170,6 +170,14 @@ assembly init voice-agent && assembly deploy --prod
 assembly eval librispeech --speech-model universal-3-pro --limit 50
 ```
 
+Add `--llm` to run an LLM-Gateway chain over each transcript (the WER score still
+uses the raw transcript), and `--llm-reduce` to run one prompt over every item's
+result and summarize the errors across the whole run:
+
+```sh
+assembly eval tedlium --limit 50 --llm-reduce "Summarize the common error patterns"
+```
+
 ## 📦 Installation
 
 Requires Python 3.12+ (Homebrew brings its own; for pipx/uv see the `--python` hint below).
diff --git a/REFERENCE.md b/REFERENCE.md
index b45f7643..56b75ce3 100644
--- a/REFERENCE.md
+++ b/REFERENCE.md
@@ -91,3 +91,9 @@ output printed to stdout (the progress table is routed to stderr so stdout stays
 clean for piping). `--llm-reduce` is repeatable, each prompt running on the
 previous one's output; for a single source it extends the `--llm` chain over
 that transcript.
+
+`assembly eval` takes the same `--llm`/`--llm-reduce` flags but emits a single
+JSON object (not NDJSON): `--llm` runs a chain over each transcript and attaches
+`{"model","steps"}` under the row's `llm` key (the WER score still uses the raw
+transcript); `--llm-reduce` runs its chain once over every item's result and adds
+a top-level `reduce` (`{"model","prompts","output"}`), omitted if no item has text.
diff --git a/aai_cli/commands/evaluate/__init__.py b/aai_cli/commands/evaluate/__init__.py
index e860e944..dac5c3ea 100644
--- a/aai_cli/commands/evaluate/__init__.py
+++ b/aai_cli/commands/evaluate/__init__.py
@@ -13,6 +13,7 @@
 from aai_cli.app.context import run_with_options
 from aai_cli.commands.evaluate import _exec as evaluate_exec
 from aai_cli.commands.evaluate._exec import EvalSpeechModel
+from aai_cli.core import llm
 from aai_cli.ui.help_text import examples_epilog
 
 app = typer.Typer()
@@ -45,6 +46,10 @@
                 "Evaluate non-English audio",
                 "assembly eval commonvoice --subset fr --language-code fr",
             ),
+            (
+                "Summarize error patterns across the set",
+                'assembly eval tedlium --llm-reduce "Summarize the common error patterns"',
+            ),
         ]
     ),
 )
@@ -79,6 +84,34 @@ def evaluate(
         min=1,
         help="How many items to transcribe at once (sequential by default)",
     ),
+    llm_prompt: list[str] | None = typer.Option(
+        None,
+        "--llm",
+        help="Transform each transcript through LLM Gateway before reporting (the WER "
+        "score still uses the raw transcript). Repeatable: each prompt runs on the "
+        "previous one's response, the first on the transcript.",
+        rich_help_panel=help_panels.OPT_LLM,
+    ),
+    llm_reduce: list[str] | None = typer.Option(
+        None,
+        "--llm-reduce",
+        help="Run one LLM-Gateway prompt over every item's result (a reduce). "
+        "Repeatable: each runs on the previous one's output.",
+        rich_help_panel=help_panels.OPT_LLM,
+    ),
+    model: str = typer.Option(
+        llm.DEFAULT_MODEL,
+        "--model",
+        help="LLM Gateway model",
+        rich_help_panel=help_panels.OPT_LLM,
+        autocompletion=llm.complete_model,
+    ),
+    max_tokens: int = typer.Option(
+        llm.DEFAULT_MAX_TOKENS,
+        "--max-tokens",
+        help="Max tokens",
+        rich_help_panel=help_panels.OPT_LLM,
+    ),
     json_out: bool = options.json_option("Output the rows and summary as one JSON object"),
 ) -> None:
     """Transcribe a dataset and score WER against its reference texts
@@ -99,6 +132,10 @@ def evaluate(
     (English; --subset fr etc. for its 98 other locales), voxpopuli
     (parliament speech), switchboard (phone calls), expresso (expressive
     speech), loquacious, and callhome (phone calls).
+
+    --llm runs an LLM-Gateway chain over each transcript (the WER score still
+    uses the raw transcript); --llm-reduce then runs one prompt over every
+    item's result to summarize patterns across the run.
     """
     opts = evaluate_exec.EvalOptions(
         dataset=dataset,
@@ -110,5 +147,9 @@ def evaluate(
         speech_model=speech_model,
         language_code=language_code,
         concurrency=concurrency,
+        llm_prompt=llm_prompt,
+        llm_reduce=llm_reduce,
+        model=model,
+        max_tokens=max_tokens,
     )
     run_with_options(ctx, evaluate_exec.run_evaluate, opts, json=json_out)
diff --git a/aai_cli/commands/evaluate/_exec.py b/aai_cli/commands/evaluate/_exec.py
index 05f9e47a..e525723c 100644
--- a/aai_cli/commands/evaluate/_exec.py
+++ b/aai_cli/commands/evaluate/_exec.py
@@ -20,10 +20,12 @@
 
 import assemblyai as aai
 from rich.console import RenderableType
+from rich.markup import escape
 
 from aai_cli.app.context import AppState
 from aai_cli.commands.evaluate import _data as eval_data
 from aai_cli.core import client, jsonshape, wer
+from aai_cli.core import llm as gateway
 from aai_cli.core.errors import CLIError, NotAuthenticated
 from aai_cli.ui import output
 
@@ -50,6 +52,31 @@ class EvalOptions:
     speech_model: EvalSpeechModel | None
     language_code: str | None
     concurrency: int
+    llm_prompt: list[str] | None
+    llm_reduce: list[str] | None
+    model: str
+    max_tokens: int
+
+    def llm_options(self) -> _LlmOptions:
+        """The ``--llm`` / ``--llm-reduce`` chain settings as plain data."""
+        return _LlmOptions(
+            prompts=list(self.llm_prompt or []),  # None -> []; copied, not aliased
+            reduce_prompts=list(self.llm_reduce or []),  # None -> []; copied, not aliased
+            model=self.model,
+            max_tokens=self.max_tokens,
+        )
+
+
+@dataclass(frozen=True)
+class _LlmOptions:
+    """The post-transcription LLM-Gateway transform: the per-item ``--llm`` chain
+    (a *map*) and the across-items ``--llm-reduce`` chain (a *reduce*), plus the
+    gateway model + token budget both run under."""
+
+    prompts: list[str]  # per-item --llm chain; empty means the map is not run
+    reduce_prompts: list[str]  # --llm-reduce chain; empty means no reduce is run
+    model: str  # gateway model passed to both chains
+    max_tokens: int  # forwarded as max_tokens to both chains
 
 
 def _pct(value: object) -> str:
@@ -75,11 +102,16 @@ def _percentile(values: list[float], q: float) -> float:
 
 @dataclass(frozen=True)
 class _ItemResult:
-    """One scored row: the emitted dict plus the score and latency kept for pooling."""
+    """One scored row: the emitted dict plus the score and latency kept for pooling.
+
+    ``hypothesis`` is the transcript text (``None`` for a failed row) — kept so the
+    optional ``--llm`` map / ``--llm-reduce`` reduce can run over it after scoring.
+    """
 
     row: dict[str, object]
     words: wer.Score | None
     latency: float
+    hypothesis: str | None = None
 
 
 def _failed_result(item: eval_data.EvalItem, err: CLIError, latency: float) -> _ItemResult:
@@ -94,7 +126,8 @@ def _failed_result(item: eval_data.EvalItem, err: CLIError, latency: float) -> _
 def _score_item(
     item: eval_data.EvalItem, transcript: aai.Transcript, latency: float
 ) -> _ItemResult:
-    words = wer.score(item.reference, str(transcript.text or ""))
+    hypothesis = str(transcript.text or "")
+    words = wer.score(item.reference, hypothesis)
     row: dict[str, object] = {
         "item": item.item_id,
         "words": words.words,
@@ -102,7 +135,7 @@ def _score_item(
         "wer": words.wer,
         "latency": latency,
     }
-    return _ItemResult(row=row, words=words, latency=latency)
+    return _ItemResult(row=row, words=words, latency=latency, hypothesis=hypothesis)
 
 
 def _pooled_metrics(results: list[_ItemResult]) -> dict[str, object]:
@@ -204,6 +237,87 @@ def _transcripts(
         )
 
 
+def _run_llm_map(
+    api_key: str,
+    results: list[_ItemResult],
+    llm_opts: _LlmOptions,
+    *,
+    json_mode: bool,
+    quiet: bool,
+) -> None:
+    """Run the ``--llm`` chain over each transcribed row and attach it under ``llm``.
+
+    A *map*: the chain runs over the row's transcript text (inline, like
+    ``stream --llm``) and lands as ``{"model", "steps"}`` on the row — the WER score
+    is untouched. Failed rows (``hypothesis is None``) are skipped; blank transcripts still run.
+    """
+    scored = [result for result in results if result.hypothesis is not None]
+    with output.status(
+        f"Running --llm over {len(scored)} transcripts…", json_mode=json_mode, quiet=quiet
+    ):
+        for result in scored:  # one gateway chain per row, sequentially
+            steps = gateway.run_chain_steps(
+                api_key,
+                llm_opts.prompts,
+                transcript_text=result.hypothesis,
+                model=llm_opts.model,
+                max_tokens=llm_opts.max_tokens,
+            )
+            result.row["llm"] = {"model": llm_opts.model, "steps": steps}  # in-place update
+
+
+def _reduce_input(result: _ItemResult) -> str:
+    """A row's contribution to the reduce: its last ``--llm`` output, else its transcript."""
+    llm_data = jsonshape.as_mapping(result.row.get("llm"))
+    if llm_data is not None:
+        steps = jsonshape.mapping_list(llm_data.get("steps"))
+        if steps:  # a chain ran: its last output wins, even when that output is empty
+            return str(steps[-1].get("output", "") or "")
+    return result.hypothesis or ""  # no chain (or no steps): raw transcript, "" if None
+
+
+def _gather_reduce_inputs(results: list[_ItemResult]) -> str:
+    """Concatenate every transcribed row's reduce input under an item header."""
+    blocks: list[str] = []
+    for result in results:
+        if result.hypothesis is None:  # no transcript text on this row: skip it
+            continue
+        text = _reduce_input(result)
+        if text:  # rows whose reduce input is empty get no block at all
+            blocks.append(f"### Item: {result.row.get('item')}\n{text}")
+    return "\n\n".join(blocks)  # blank line between blocks; "" when none qualified
+
+
+def _run_reduce(
+    api_key: str,
+    results: list[_ItemResult],
+    llm_opts: _LlmOptions,
+    *,
+    json_mode: bool,
+    quiet: bool,
+) -> dict[str, object] | None:
+    """Run the ``--llm-reduce`` chain once over every row's result; return its payload entry.
+
+    Warns and returns ``None`` when there's nothing to aggregate (every row failed or
+    contributed no text): the (billable) gateway call is skipped and the caller omits the key.
+    """
+    combined = _gather_reduce_inputs(results)
+    if not combined:  # empty string: no row produced any text to aggregate
+        output.emit_warning(
+            "Nothing to reduce: no transcript text across items.", json_mode=json_mode
+        )
+        return None
+    with output.status("Running --llm-reduce over all items…", json_mode=json_mode, quiet=quiet):
+        result = gateway.run_chain(
+            api_key,
+            llm_opts.reduce_prompts,
+            transcript_text=combined,
+            model=llm_opts.model,
+            max_tokens=llm_opts.max_tokens,
+        )
+    return {"model": llm_opts.model, "prompts": llm_opts.reduce_prompts, "output": result}
+
+
 def _payload(
     label: str, speech_model: EvalSpeechModel | None, results: list[_ItemResult]
 ) -> dict[str, object]:
@@ -249,6 +363,36 @@ def _secs_cell(row: dict[str, object], key: str) -> str:
     return _secs(row[key]) if key in row else ""
 
 
+def _final_llm_output(row: dict[str, object]) -> str | None:
+    """A row's last ``--llm`` step output, or ``None`` when no chain ran on it."""
+    llm_data = jsonshape.as_mapping(row.get("llm"))
+    if llm_data is None:  # the row carries no usable "llm" entry
+        return None
+    steps = jsonshape.mapping_list(llm_data.get("steps"))
+    return str(steps[-1].get("output", "") or "") if steps else ""  # "" if no steps
+
+
+def _llm_block(payload: dict[str, object]) -> str | None:
+    """The per-item ``--llm`` outputs as a heading + one ``item: output`` line each,
+    both markup-escaped, or ``None`` when no ``--llm`` chain ran."""
+    lines: list[str] = []
+    for row in jsonshape.mapping_list(payload.get("rows")):
+        final = _final_llm_output(row)
+        if final is not None:  # "" still gets a line; only chain-less rows are dropped
+            lines.append(f"{escape(str(row.get('item')))}: {escape(final)}")
+    if not lines:  # no row had a chain result, so the heading is dropped too
+        return None
+    return "\n".join([output.heading("--llm"), *lines])
+
+
+def _reduce_block(payload: dict[str, object]) -> str | None:
+    """The ``--llm-reduce`` aggregate as a heading + the output, or ``None`` when unset."""
+    reduce = jsonshape.as_mapping(payload.get("reduce"))
+    if reduce is None:  # payload has no usable "reduce" entry
+        return None
+    return f"{output.heading('--llm-reduce')}\n{escape(str(reduce.get('output', '')))}"
+
+
 def _render(payload: dict[str, object]) -> RenderableType:
     has_wer = "wer" in payload
     has_failed = "failed" in payload
@@ -271,7 +415,11 @@ def _render(payload: dict[str, object]) -> RenderableType:
         table.add_row(*cells)
     model = payload.get("speech_model") or "default model"
     return output.stack(
-        output.muted(f"{payload.get('dataset')} · {model}"), table, _summary(payload)
+        output.muted(f"{payload.get('dataset')} · {model}"),
+        table,
+        _summary(payload),
+        _llm_block(payload),
+        _reduce_block(payload),
     )
 
 
@@ -310,7 +458,14 @@ def run_evaluate(opts: EvalOptions, state: AppState, *, json_mode: bool) -> None
             strict=True,  # pragma: no mutate (defensive invariant; _transcripts returns one outcome per item)
         )
     ]
+    llm_opts = opts.llm_options()
+    if llm_opts.prompts:
+        _run_llm_map(api_key, results, llm_opts, json_mode=json_mode, quiet=state.quiet)
     payload = _payload(data.label, opts.speech_model, results)
+    if llm_opts.reduce_prompts:
+        reduce = _run_reduce(api_key, results, llm_opts, json_mode=json_mode, quiet=state.quiet)
+        if reduce is not None:
+            payload["reduce"] = reduce
     output.emit(payload, _render, json_mode=json_mode)
     failed = jsonshape.as_int(payload.get("failed"))
     if failed:
diff --git a/tests/__snapshots__/test_snapshots_help_run.ambr b/tests/__snapshots__/test_snapshots_help_run.ambr
index 74c27476..573b2a78 100644
--- a/tests/__snapshots__/test_snapshots_help_run.ambr
+++ b/tests/__snapshots__/test_snapshots_help_run.ambr
@@ -496,6 +496,10 @@
    (parliament speech), switchboard (phone calls), expresso (expressive
    speech), loquacious, and callhome (phone calls).
   
+   --llm runs an LLM-Gateway chain over each transcript (the WER score still
+   uses the raw transcript); --llm-reduce then runs one prompt over every
+   item's result to summarize patterns across the run.
+  
   ╭─ Arguments ──────────────────────────────────────────────────────────────────╮
   │ *    dataset      TEXT  Hugging Face dataset id, or a local .csv/.jsonl      │
   │                         manifest with audio + text columns                   │
@@ -527,6 +531,19 @@
   │                                                    object                    │
   │ --help                                             Show this message and     │
   │                                                    exit.                     │
+  ╰──────────────────────────────────────────────────────────────────────────────╯
+  ╭─ LLM Transform ──────────────────────────────────────────────────────────────╮
+  │ --llm               TEXT     Transform each transcript through LLM Gateway   │
+  │                              before reporting (the WER score still uses the  │
+  │                              raw transcript). Repeatable: each prompt runs   │
+  │                              on the previous one's response, the first on    │
+  │                              the transcript.                                 │
+  │ --llm-reduce        TEXT     Run one LLM-Gateway prompt over every item's    │
+  │                              result (a reduce). Repeatable: each runs on the │
+  │                              previous one's output.                          │
+  │ --model             TEXT     LLM Gateway model                               │
+  │                              [default: claude-haiku-4-5-20251001]            │
+  │ --max-tokens        INTEGER  Max tokens [default: 1000]                      │
   ╰──────────────────────────────────────────────────────────────────────────────╯
   
    Examples
@@ -538,6 +555,8 @@
    $ assembly eval librispeech --limit 50 --concurrency 4
    Evaluate non-English audio
    $ assembly eval commonvoice --subset fr --language-code fr
+   Summarize error patterns across the set
+   $ assembly eval tedlium --llm-reduce "Summarize the common error patterns"
   
   
   
diff --git a/tests/test_eval_llm.py b/tests/test_eval_llm.py
new file mode 100644
index 00000000..a0c0c7aa
--- /dev/null
+++ b/tests/test_eval_llm.py
@@ -0,0 +1,294 @@
+"""`assembly eval --llm` / `--llm-reduce`: the per-item map, the across-items
+reduce, and how both ride alongside the (unchanged) WER score.
+
+The transcription boundary and the LLM Gateway chain helpers are mocked; the
+dataset is a real temp manifest so the loader runs end to end.
+"""
+
+import contextlib
+import json
+
+import pytest
+from typer.testing import CliRunner
+
+from aai_cli.commands.evaluate import _exec as evaluate_exec
+from aai_cli.core import config
+from aai_cli.main import app
+
+runner = CliRunner()
+
+
+@pytest.fixture(autouse=True)
+def workdir(tmp_path, monkeypatch):
+    monkeypatch.chdir(tmp_path)
+
+
+def _auth():
+    config.set_api_key("default", "sk_live")
+
+
+class _Transcript:
+    def __init__(self, text):
+        self.text = text
+
+
+def _write_manifest(tmp_path):
+    for name in ("a.wav", "b.wav"):
+        (tmp_path / name).write_bytes(b"fake-audio")
+    (tmp_path / "manifest.csv").write_text(
+        "audio,text\na.wav,hello there\nb.wav,goodbye now\n", encoding="utf-8"
+    )
+
+
+def _mock_transcribe(mocker, texts):
+    return mocker.patch(
+        "aai_cli.commands.evaluate._exec.client.transcribe",
+        autospec=True,
+        side_effect=[_Transcript(text) for text in texts],
+    )
+
+
+def _payload_of(result):
+    return next(  # skips non-payload lines in the output; StopIteration if no payload
+        json.loads(line) for line in result.output.splitlines() if line.startswith('{"dataset"')
+    )
+
+
+def _mock_chain_steps(mocker):
+    """Stand in for the per-item --llm chain: one step echoing the transcript text."""
+
+    def fake(api_key, prompts, *, transcript_text=None, transcript_id=None, model, max_tokens):
+        return [{"prompt": prompts[0], "output": f"MAP::{transcript_text}"}]
+
+    return mocker.patch("aai_cli.commands.evaluate._exec.gateway.run_chain_steps", side_effect=fake)
+
+
+# the per-item map step
+
+
+def test_llm_map_attaches_steps_and_leaves_wer_on_raw_transcript(tmp_path, mocker):
+    _auth()
+    _write_manifest(tmp_path)
+    # b.wav: 1 of 2 words wrong -> row 50%, pooled 1/4 -> 25%.
+    _mock_transcribe(mocker, ["hello there", "goodbye cow"])
+    steps = _mock_chain_steps(mocker)
+    payload = _payload_of(runner.invoke(app, ["eval", "manifest.csv", "--llm", "fix", "--json"]))
+    # WER still scores the raw transcript, not the --llm output.
+    assert payload["wer"] == 0.25
+    assert payload["rows"][0]["wer"] == 0.0
+    # Each scored row carries its chain under "llm".
+    assert payload["rows"][0]["llm"] == {
+        "model": "claude-haiku-4-5-20251001",
+        "steps": [{"prompt": "fix", "output": "MAP::hello there"}],
+    }
+    # The chain ran over the raw transcript text of each row.
+    texts = {call.kwargs["transcript_text"] for call in steps.call_args_list}
+    assert texts == {"hello there", "goodbye cow"}
+    assert all(call.args[1] == ["fix"] for call in steps.call_args_list)
+
+
+def test_llm_map_renders_in_human_mode(tmp_path, mocker):
+    _auth()
+    _write_manifest(tmp_path)
+    _mock_transcribe(mocker, ["hello there", "goodbye now"])
+    _mock_chain_steps(mocker)
+    result = runner.invoke(app, ["eval", "manifest.csv", "--llm", "fix"])
+    assert result.exit_code == 0
+    assert "--llm" in result.output
+    assert "a.wav: MAP::hello there" in result.output
+
+
+def test_llm_map_skips_failed_rows(tmp_path, mocker):
+    from aai_cli.core.errors import APIError
+
+    _auth()
+    _write_manifest(tmp_path)
+    mocker.patch(
+        "aai_cli.commands.evaluate._exec.client.transcribe",
+        autospec=True,
+        side_effect=[_Transcript("hello there"), APIError("boom")],
+    )
+    steps = _mock_chain_steps(mocker)
+    # Whole run exits 1 (a row failed) but the map still ran on the one good row.
+    result = runner.invoke(app, ["eval", "manifest.csv", "--llm", "fix", "--json"])
+    assert result.exit_code == 1
+    payload = _payload_of(result)
+    good = next(row for row in payload["rows"] if "wer" in row)
+    bad = next(row for row in payload["rows"] if "error" in row)
+    assert "llm" in good and "llm" not in bad
+    assert steps.call_count == 1
+
+
+def test_model_and_max_tokens_reach_the_gateway(tmp_path, mocker):
+    _auth()
+    _write_manifest(tmp_path)
+    _mock_transcribe(mocker, ["hello there", "goodbye now"])
+    steps = _mock_chain_steps(mocker)
+    argv = ["eval", "manifest.csv", "--llm", "x", "--model", "gpt-5", "--max-tokens", "42"]
+    assert runner.invoke(app, argv).exit_code == 0
+    assert steps.call_args.kwargs["model"] == "gpt-5"
+    assert steps.call_args.kwargs["max_tokens"] == 42
+
+
+# the across-items reduce step
+
+
+def test_reduce_combines_map_outputs_and_lands_in_payload(tmp_path, mocker):
+    _auth()
+    _write_manifest(tmp_path)
+    _mock_transcribe(mocker, ["hello there", "goodbye now"])
+    _mock_chain_steps(mocker)
+    captured = {}
+
+    def fake_reduce(api_key, prompts, *, transcript_text, model, max_tokens):
+        captured["text"] = transcript_text
+        captured["prompts"] = prompts
+        return "FINAL"
+
+    mocker.patch("aai_cli.commands.evaluate._exec.gateway.run_chain", side_effect=fake_reduce)
+    payload = _payload_of(
+        runner.invoke(
+            app, ["eval", "manifest.csv", "--llm", "judge", "--llm-reduce", "rank", "--json"]
+        )
+    )
+    assert payload["reduce"] == {
+        "model": "claude-haiku-4-5-20251001",
+        "prompts": ["rank"],
+        "output": "FINAL",
+    }
+    # The reduce saw each item's last --llm output under an item header.
+    assert "### Item: a.wav" in captured["text"]
+    assert "MAP::hello there" in captured["text"] and "MAP::goodbye now" in captured["text"]
+    assert captured["prompts"] == ["rank"]
+
+
+def test_reduce_falls_back_to_transcript_text(tmp_path, mocker):
+    _auth()
+    _write_manifest(tmp_path)
+    _mock_transcribe(mocker, ["hello there", "goodbye now"])
+    captured = {}
+
+    def fake_reduce(api_key, prompts, *, transcript_text, model, max_tokens):
+        captured["text"] = transcript_text
+        return "FINAL"
+
+    mocker.patch("aai_cli.commands.evaluate._exec.gateway.run_chain", side_effect=fake_reduce)
+    # No --llm map ran, so the raw transcript text feeds the reduce.
+    result = runner.invoke(app, ["eval", "manifest.csv", "--llm-reduce", "sum"])
+    assert result.exit_code == 0
+    assert "hello there" in captured["text"]
+    assert "--llm-reduce" in result.output and "FINAL" in result.output
+
+
+def test_reduce_skips_gateway_when_nothing_to_aggregate(tmp_path, mocker):
+    from aai_cli.core.errors import APIError
+
+    _auth()
+    _write_manifest(tmp_path)
+    mocker.patch(
+        "aai_cli.commands.evaluate._exec.client.transcribe",
+        autospec=True,
+        side_effect=[APIError("boom"), APIError("boom")],  # both manifest rows fail
+    )
+    reduce = mocker.patch("aai_cli.commands.evaluate._exec.gateway.run_chain")
+    result = runner.invoke(app, ["eval", "manifest.csv", "--llm-reduce", "sum", "--json"])
+    reduce.assert_not_called()  # nothing to combine, so no gateway call
+    assert "Nothing to reduce" in result.output
+    assert "reduce" not in _payload_of(result)  # and no "reduce" key in the JSON
+
+
+def test_llm_status_messages(tmp_path, mocker, monkeypatch):
+    _auth()
+    _write_manifest(tmp_path)
+    _mock_transcribe(mocker, ["hello there", "goodbye now"])
+    _mock_chain_steps(mocker)
+    mocker.patch(
+        "aai_cli.commands.evaluate._exec.gateway.run_chain",
+        side_effect=lambda *a, **k: "FINAL",
+    )
+    seen = []
+
+    @contextlib.contextmanager
+    def fake_status(message, *, json_mode, quiet):
+        seen.append(message)
+        yield
+
+    monkeypatch.setattr("aai_cli.commands.evaluate._exec.output.status", fake_status)
+    argv = ["eval", "manifest.csv", "--llm", "judge", "--llm-reduce", "rank"]
+    assert runner.invoke(app, argv).exit_code == 0
+    assert "Running --llm over 2 transcripts…" in seen
+    assert "Running --llm-reduce over all items…" in seen
+
+
+# ---------------------------------------------------------------- pure helpers
+
+
+def test_llm_options_maps_fields():
+    opts = evaluate_exec.EvalOptions(
+        dataset="d", split=None, subset=None, limit=10, audio_column=None,
+        text_column=None, speech_model=None, language_code=None, concurrency=1,
+        llm_prompt=["a"], llm_reduce=["b", "c"], model="m", max_tokens=5,
+    )  # fmt: skip
+    llm_opts = opts.llm_options()
+    assert llm_opts.prompts == ["a"]
+    assert llm_opts.reduce_prompts == ["b", "c"]
+    assert llm_opts.model == "m"
+    assert llm_opts.max_tokens == 5
+
+
+def test_llm_options_default_to_empty_chains():
+    opts = evaluate_exec.EvalOptions(
+        dataset="d", split=None, subset=None, limit=10, audio_column=None,
+        text_column=None, speech_model=None, language_code=None, concurrency=1,
+        llm_prompt=None, llm_reduce=None, model="m", max_tokens=5,
+    )  # fmt: skip
+    llm_opts = opts.llm_options()
+    assert llm_opts.prompts == [] and llm_opts.reduce_prompts == []
+
+
+def _result(row, hypothesis):
+    return evaluate_exec._ItemResult(row=row, words=None, latency=0.0, hypothesis=hypothesis)
+
+
+def test_reduce_input_prefers_last_llm_output_else_transcript():
+    with_llm = _result(
+        {"item": "a", "llm": {"steps": [{"output": "first"}, {"output": "last"}]}}, "raw"
+    )
+    assert evaluate_exec._reduce_input(with_llm) == "last"
+    no_llm = _result({"item": "b"}, "raw text")
+    assert evaluate_exec._reduce_input(no_llm) == "raw text"
+    empty_steps = _result({"item": "c", "llm": {"steps": []}}, "fallback")
+    assert evaluate_exec._reduce_input(empty_steps) == "fallback"
+
+
+def test_gather_reduce_inputs_headers_and_skips_failures_and_blanks():
+    results = [
+        _result({"item": "a"}, "hello"),
+        _result({"item": "fail", "error": "boom"}, None),  # failed: no hypothesis
+        _result({"item": "blank"}, ""),  # empty transcript contributes nothing
+    ]
+    combined = evaluate_exec._gather_reduce_inputs(results)
+    assert combined == "### Item: a\nhello"
+
+
+def test_final_llm_output_distinguishes_absent_from_empty():
+    assert evaluate_exec._final_llm_output({"item": "a"}) is None
+    assert evaluate_exec._final_llm_output({"llm": {"steps": []}}) == ""
+    assert evaluate_exec._final_llm_output({"llm": {"steps": [{"output": "x"}]}}) == "x"
+
+
+def test_render_blocks_drop_out_when_absent():
+    assert evaluate_exec._llm_block({"rows": [{"item": "a"}]}) is None
+    assert evaluate_exec._reduce_block({}) is None
+
+
+def _assign(obj, attribute, value):
+    setattr(obj, attribute, value)
+
+
+def test_llm_options_is_immutable():
+    import dataclasses
+
+    llm_opts = evaluate_exec._LlmOptions(prompts=[], reduce_prompts=[], model="m", max_tokens=1)
+    with pytest.raises(dataclasses.FrozenInstanceError):
+        _assign(llm_opts, "model", "other")