diff --git a/README.md b/README.md index 150fedde..44214e96 100644 --- a/README.md +++ b/README.md @@ -170,6 +170,14 @@ assembly init voice-agent && assembly deploy --prod assembly eval librispeech --speech-model universal-3-pro --limit 50 ``` +Add `--llm` to run an LLM-Gateway chain over each transcript (the WER score still +uses the raw transcript), and `--llm-reduce` to run one prompt over every item's +result and summarize the errors across the whole run: + +```sh +assembly eval tedlium --limit 50 --llm-reduce "Summarize the common error patterns" +``` + ## ๐Ÿ“ฆ Installation Requires Python 3.12+ (Homebrew brings its own; for pipx/uv see the `--python` hint below). diff --git a/REFERENCE.md b/REFERENCE.md index b45f7643..56b75ce3 100644 --- a/REFERENCE.md +++ b/REFERENCE.md @@ -91,3 +91,9 @@ output printed to stdout (the progress table is routed to stderr so stdout stays clean for piping). `--llm-reduce` is repeatable, each prompt running on the previous one's output; for a single source it extends the `--llm` chain over that transcript. + +`assembly eval` takes the same `--llm`/`--llm-reduce` flags but emits a single +JSON object (not NDJSON): `--llm` runs a chain over each transcript and attaches +`{"model","steps"}` under the row's `llm` key (the WER score still uses the raw +transcript), and `--llm-reduce` runs one prompt over every item's result and +adds a top-level `reduce` (`{"model","prompts","output"}`) to the object. diff --git a/aai_cli/commands/evaluate/__init__.py b/aai_cli/commands/evaluate/__init__.py index e860e944..dac5c3ea 100644 --- a/aai_cli/commands/evaluate/__init__.py +++ b/aai_cli/commands/evaluate/__init__.py @@ -13,6 +13,7 @@ from aai_cli.app.context import run_with_options from aai_cli.commands.evaluate import _exec as evaluate_exec from aai_cli.commands.evaluate._exec import EvalSpeechModel +from aai_cli.core import llm from aai_cli.ui.help_text import examples_epilog app = typer.Typer() @@ -45,6 +46,10 @@ "Evaluate non-English audio", "assembly eval commonvoice --subset fr --language-code fr", ), + ( + "Summarize error patterns across the set", + 'assembly eval tedlium --llm-reduce "Summarize the common error patterns"', + ), ] ), ) @@ -79,6 +84,34 @@ def evaluate( min=1, help="How many items to transcribe at once (sequential by default)", ), + llm_prompt: list[str] | None = typer.Option( + None, + "--llm", + help="Transform each transcript through LLM Gateway before reporting (the WER " + "score still uses the raw transcript). Repeatable: each prompt runs on the " + "previous one's response, the first on the transcript.", + rich_help_panel=help_panels.OPT_LLM, + ), + llm_reduce: list[str] | None = typer.Option( + None, + "--llm-reduce", + help="Run one LLM-Gateway prompt over every item's result (a reduce). " + "Repeatable: each runs on the previous one's output.", + rich_help_panel=help_panels.OPT_LLM, + ), + model: str = typer.Option( + llm.DEFAULT_MODEL, + "--model", + help="LLM Gateway model", + rich_help_panel=help_panels.OPT_LLM, + autocompletion=llm.complete_model, + ), + max_tokens: int = typer.Option( + llm.DEFAULT_MAX_TOKENS, + "--max-tokens", + help="Max tokens", + rich_help_panel=help_panels.OPT_LLM, + ), json_out: bool = options.json_option("Output the rows and summary as one JSON object"), ) -> None: """Transcribe a dataset and score WER against its reference texts @@ -99,6 +132,10 @@ def evaluate( (English; --subset fr etc. for its 98 other locales), voxpopuli (parliament speech), switchboard (phone calls), expresso (expressive speech), loquacious, and callhome (phone calls). + + --llm runs an LLM-Gateway chain over each transcript (the WER score still + uses the raw transcript); --llm-reduce then runs one prompt over every + item's result to summarize patterns across the run. """ opts = evaluate_exec.EvalOptions( dataset=dataset, @@ -110,5 +147,9 @@ def evaluate( speech_model=speech_model, language_code=language_code, concurrency=concurrency, + llm_prompt=llm_prompt, + llm_reduce=llm_reduce, + model=model, + max_tokens=max_tokens, ) run_with_options(ctx, evaluate_exec.run_evaluate, opts, json=json_out) diff --git a/aai_cli/commands/evaluate/_exec.py b/aai_cli/commands/evaluate/_exec.py index 05f9e47a..e525723c 100644 --- a/aai_cli/commands/evaluate/_exec.py +++ b/aai_cli/commands/evaluate/_exec.py @@ -20,10 +20,12 @@ import assemblyai as aai from rich.console import RenderableType +from rich.markup import escape from aai_cli.app.context import AppState from aai_cli.commands.evaluate import _data as eval_data from aai_cli.core import client, jsonshape, wer +from aai_cli.core import llm as gateway from aai_cli.core.errors import CLIError, NotAuthenticated from aai_cli.ui import output @@ -50,6 +52,31 @@ class EvalOptions: speech_model: EvalSpeechModel | None language_code: str | None concurrency: int + llm_prompt: list[str] | None + llm_reduce: list[str] | None + model: str + max_tokens: int + + def llm_options(self) -> _LlmOptions: + """The ``--llm`` / ``--llm-reduce`` chain settings as plain data.""" + return _LlmOptions( + prompts=list(self.llm_prompt or []), + reduce_prompts=list(self.llm_reduce or []), + model=self.model, + max_tokens=self.max_tokens, + ) + + +@dataclass(frozen=True) +class _LlmOptions: + """The post-transcription LLM-Gateway transform: the per-item ``--llm`` chain + (a *map*) and the across-items ``--llm-reduce`` chain (a *reduce*), plus the + gateway model + token budget both run under.""" + + prompts: list[str] + reduce_prompts: list[str] + model: str + max_tokens: int def _pct(value: object) -> str: @@ -75,11 +102,16 @@ def _percentile(values: list[float], q: float) -> float: @dataclass(frozen=True) class _ItemResult: - """One scored row: the emitted dict plus the score and latency kept for pooling.""" + """One scored row: the emitted dict plus the score and latency kept for pooling. + + ``hypothesis`` is the transcript text (``None`` for a failed row) โ€” kept so the + optional ``--llm`` map / ``--llm-reduce`` reduce can run over it after scoring. + """ row: dict[str, object] words: wer.Score | None latency: float + hypothesis: str | None = None def _failed_result(item: eval_data.EvalItem, err: CLIError, latency: float) -> _ItemResult: @@ -94,7 +126,8 @@ def _failed_result(item: eval_data.EvalItem, err: CLIError, latency: float) -> _ def _score_item( item: eval_data.EvalItem, transcript: aai.Transcript, latency: float ) -> _ItemResult: - words = wer.score(item.reference, str(transcript.text or "")) + hypothesis = str(transcript.text or "") + words = wer.score(item.reference, hypothesis) row: dict[str, object] = { "item": item.item_id, "words": words.words, @@ -102,7 +135,7 @@ def _score_item( "wer": words.wer, "latency": latency, } - return _ItemResult(row=row, words=words, latency=latency) + return _ItemResult(row=row, words=words, latency=latency, hypothesis=hypothesis) def _pooled_metrics(results: list[_ItemResult]) -> dict[str, object]: @@ -204,6 +237,87 @@ def _transcripts( ) +def _run_llm_map( + api_key: str, + results: list[_ItemResult], + llm_opts: _LlmOptions, + *, + json_mode: bool, + quiet: bool, +) -> None: + """Run the ``--llm`` chain over each transcribed row and attach it under ``llm``. + + A *map*: the chain runs over the row's transcript text (inline, like + ``stream --llm``) and lands as ``{"model", "steps"}`` on the row โ€” the WER score + is untouched. Failed rows have no transcript, so they're skipped. + """ + scored = [result for result in results if result.hypothesis is not None] + with output.status( + f"Running --llm over {len(scored)} transcriptsโ€ฆ", json_mode=json_mode, quiet=quiet + ): + for result in scored: + steps = gateway.run_chain_steps( + api_key, + llm_opts.prompts, + transcript_text=result.hypothesis, + model=llm_opts.model, + max_tokens=llm_opts.max_tokens, + ) + result.row["llm"] = {"model": llm_opts.model, "steps": steps} + + +def _reduce_input(result: _ItemResult) -> str: + """A row's contribution to the reduce: its last ``--llm`` output, else its transcript.""" + llm_data = jsonshape.as_mapping(result.row.get("llm")) + if llm_data is not None: + steps = jsonshape.mapping_list(llm_data.get("steps")) + if steps: + return str(steps[-1].get("output", "") or "") + return result.hypothesis or "" + + +def _gather_reduce_inputs(results: list[_ItemResult]) -> str: + """Concatenate every transcribed row's reduce input under an item header.""" + blocks: list[str] = [] + for result in results: + if result.hypothesis is None: + continue + text = _reduce_input(result) + if text: + blocks.append(f"### Item: {result.row.get('item')}\n{text}") + return "\n\n".join(blocks) + + +def _run_reduce( + api_key: str, + results: list[_ItemResult], + llm_opts: _LlmOptions, + *, + json_mode: bool, + quiet: bool, +) -> dict[str, object] | None: + """Run the ``--llm-reduce`` chain once over every row's result; the payload entry. + + ``None`` when there's nothing to aggregate (every row failed or transcribed to + empty text) so the caller skips the (billable) gateway call and the payload key. + """ + combined = _gather_reduce_inputs(results) + if not combined: + output.emit_warning( + "Nothing to reduce: no transcript text across items.", json_mode=json_mode + ) + return None + with output.status("Running --llm-reduce over all itemsโ€ฆ", json_mode=json_mode, quiet=quiet): + result = gateway.run_chain( + api_key, + llm_opts.reduce_prompts, + transcript_text=combined, + model=llm_opts.model, + max_tokens=llm_opts.max_tokens, + ) + return {"model": llm_opts.model, "prompts": llm_opts.reduce_prompts, "output": result} + + def _payload( label: str, speech_model: EvalSpeechModel | None, results: list[_ItemResult] ) -> dict[str, object]: @@ -249,6 +363,36 @@ def _secs_cell(row: dict[str, object], key: str) -> str: return _secs(row[key]) if key in row else "" +def _final_llm_output(row: dict[str, object]) -> str | None: + """A row's last ``--llm`` step output, or ``None`` when no chain ran on it.""" + llm_data = jsonshape.as_mapping(row.get("llm")) + if llm_data is None: + return None + steps = jsonshape.mapping_list(llm_data.get("steps")) + return str(steps[-1].get("output", "") or "") if steps else "" + + +def _llm_block(payload: dict[str, object]) -> str | None: + """The per-item ``--llm`` outputs as a heading + one ``item: output`` line each, + or ``None`` when no ``--llm`` chain ran.""" + lines: list[str] = [] + for row in jsonshape.mapping_list(payload.get("rows")): + final = _final_llm_output(row) + if final is not None: + lines.append(f"{escape(str(row.get('item')))}: {escape(final)}") + if not lines: + return None + return "\n".join([output.heading("--llm"), *lines]) + + +def _reduce_block(payload: dict[str, object]) -> str | None: + """The ``--llm-reduce`` aggregate as a heading + the output, or ``None`` when unset.""" + reduce = jsonshape.as_mapping(payload.get("reduce")) + if reduce is None: + return None + return f"{output.heading('--llm-reduce')}\n{escape(str(reduce.get('output', '')))}" + + def _render(payload: dict[str, object]) -> RenderableType: has_wer = "wer" in payload has_failed = "failed" in payload @@ -271,7 +415,11 @@ def _render(payload: dict[str, object]) -> RenderableType: table.add_row(*cells) model = payload.get("speech_model") or "default model" return output.stack( - output.muted(f"{payload.get('dataset')} ยท {model}"), table, _summary(payload) + output.muted(f"{payload.get('dataset')} ยท {model}"), + table, + _summary(payload), + _llm_block(payload), + _reduce_block(payload), ) @@ -310,7 +458,14 @@ def run_evaluate(opts: EvalOptions, state: AppState, *, json_mode: bool) -> None strict=True, # pragma: no mutate (defensive invariant; _transcripts returns one outcome per item) ) ] + llm_opts = opts.llm_options() + if llm_opts.prompts: + _run_llm_map(api_key, results, llm_opts, json_mode=json_mode, quiet=state.quiet) payload = _payload(data.label, opts.speech_model, results) + if llm_opts.reduce_prompts: + reduce = _run_reduce(api_key, results, llm_opts, json_mode=json_mode, quiet=state.quiet) + if reduce is not None: + payload["reduce"] = reduce output.emit(payload, _render, json_mode=json_mode) failed = jsonshape.as_int(payload.get("failed")) if failed: diff --git a/tests/__snapshots__/test_snapshots_help_run.ambr b/tests/__snapshots__/test_snapshots_help_run.ambr index 74c27476..573b2a78 100644 --- a/tests/__snapshots__/test_snapshots_help_run.ambr +++ b/tests/__snapshots__/test_snapshots_help_run.ambr @@ -496,6 +496,10 @@ (parliament speech), switchboard (phone calls), expresso (expressive speech), loquacious, and callhome (phone calls). + --llm runs an LLM-Gateway chain over each transcript (the WER score still + uses the raw transcript); --llm-reduce then runs one prompt over every + item's result to summarize patterns across the run. + โ•ญโ”€ Arguments โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ โ”‚ * dataset TEXT Hugging Face dataset id, or a local .csv/.jsonl โ”‚ โ”‚ manifest with audio + text columns โ”‚ @@ -527,6 +531,19 @@ โ”‚ object โ”‚ โ”‚ --help Show this message and โ”‚ โ”‚ exit. โ”‚ + โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ + โ•ญโ”€ LLM Transform โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ + โ”‚ --llm TEXT Transform each transcript through LLM Gateway โ”‚ + โ”‚ before reporting (the WER score still uses the โ”‚ + โ”‚ raw transcript). Repeatable: each prompt runs โ”‚ + โ”‚ on the previous one's response, the first on โ”‚ + โ”‚ the transcript. โ”‚ + โ”‚ --llm-reduce TEXT Run one LLM-Gateway prompt over every item's โ”‚ + โ”‚ result (a reduce). Repeatable: each runs on the โ”‚ + โ”‚ previous one's output. โ”‚ + โ”‚ --model TEXT LLM Gateway model โ”‚ + โ”‚ [default: claude-haiku-4-5-20251001] โ”‚ + โ”‚ --max-tokens INTEGER Max tokens [default: 1000] โ”‚ โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ Examples @@ -538,6 +555,8 @@ $ assembly eval librispeech --limit 50 --concurrency 4 Evaluate non-English audio $ assembly eval commonvoice --subset fr --language-code fr + Summarize error patterns across the set + $ assembly eval tedlium --llm-reduce "Summarize the common error patterns" diff --git a/tests/test_eval_llm.py b/tests/test_eval_llm.py new file mode 100644 index 00000000..a0c0c7aa --- /dev/null +++ b/tests/test_eval_llm.py @@ -0,0 +1,294 @@ +"""`assembly eval --llm` / `--llm-reduce`: the per-item map, the across-items +reduce, and how both ride alongside the (unchanged) WER score. + +The transcription boundary and the LLM Gateway chain helpers are mocked; the +dataset is a real temp manifest so the loader runs end to end. +""" + +import contextlib +import json + +import pytest +from typer.testing import CliRunner + +from aai_cli.commands.evaluate import _exec as evaluate_exec +from aai_cli.core import config +from aai_cli.main import app + +runner = CliRunner() + + +@pytest.fixture(autouse=True) +def workdir(tmp_path, monkeypatch): + monkeypatch.chdir(tmp_path) + + +def _auth(): + config.set_api_key("default", "sk_live") + + +class _Transcript: + def __init__(self, text): + self.text = text + + +def _write_manifest(tmp_path): + for name in ("a.wav", "b.wav"): + (tmp_path / name).write_bytes(b"fake-audio") + (tmp_path / "manifest.csv").write_text( + "audio,text\na.wav,hello there\nb.wav,goodbye now\n", encoding="utf-8" + ) + + +def _mock_transcribe(mocker, texts): + return mocker.patch( + "aai_cli.commands.evaluate._exec.client.transcribe", + autospec=True, + side_effect=[_Transcript(text) for text in texts], + ) + + +def _payload_of(result): + return next( + json.loads(line) for line in result.output.splitlines() if line.startswith('{"dataset"') + ) + + +def _mock_chain_steps(mocker): + """Stand in for the per-item --llm chain: one step echoing the transcript text.""" + + def fake(api_key, prompts, *, transcript_text=None, transcript_id=None, model, max_tokens): + return [{"prompt": prompts[0], "output": f"MAP::{transcript_text}"}] + + return mocker.patch("aai_cli.commands.evaluate._exec.gateway.run_chain_steps", side_effect=fake) + + +# the per-item map step + + +def test_llm_map_attaches_steps_and_leaves_wer_on_raw_transcript(tmp_path, mocker): + _auth() + _write_manifest(tmp_path) + # b.wav: 1 of 2 words wrong -> row 50%, pooled 1/4 -> 25%. + _mock_transcribe(mocker, ["hello there", "goodbye cow"]) + steps = _mock_chain_steps(mocker) + payload = _payload_of(runner.invoke(app, ["eval", "manifest.csv", "--llm", "fix", "--json"])) + # WER still scores the raw transcript, not the --llm output. + assert payload["wer"] == 0.25 + assert payload["rows"][0]["wer"] == 0.0 + # Each scored row carries its chain under "llm". + assert payload["rows"][0]["llm"] == { + "model": "claude-haiku-4-5-20251001", + "steps": [{"prompt": "fix", "output": "MAP::hello there"}], + } + # The chain ran over the raw transcript text of each row. + texts = {call.kwargs["transcript_text"] for call in steps.call_args_list} + assert texts == {"hello there", "goodbye cow"} + assert all(call.args[1] == ["fix"] for call in steps.call_args_list) + + +def test_llm_map_renders_in_human_mode(tmp_path, mocker): + _auth() + _write_manifest(tmp_path) + _mock_transcribe(mocker, ["hello there", "goodbye now"]) + _mock_chain_steps(mocker) + result = runner.invoke(app, ["eval", "manifest.csv", "--llm", "fix"]) + assert result.exit_code == 0 + assert "--llm" in result.output + assert "a.wav: MAP::hello there" in result.output + + +def test_llm_map_skips_failed_rows(tmp_path, mocker): + from aai_cli.core.errors import APIError + + _auth() + _write_manifest(tmp_path) + mocker.patch( + "aai_cli.commands.evaluate._exec.client.transcribe", + autospec=True, + side_effect=[_Transcript("hello there"), APIError("boom")], + ) + steps = _mock_chain_steps(mocker) + # Whole run exits 1 (a row failed) but the map still ran on the one good row. + result = runner.invoke(app, ["eval", "manifest.csv", "--llm", "fix", "--json"]) + assert result.exit_code == 1 + payload = _payload_of(result) + good = next(row for row in payload["rows"] if "wer" in row) + bad = next(row for row in payload["rows"] if "error" in row) + assert "llm" in good and "llm" not in bad + assert steps.call_count == 1 + + +def test_model_and_max_tokens_reach_the_gateway(tmp_path, mocker): + _auth() + _write_manifest(tmp_path) + _mock_transcribe(mocker, ["hello there", "goodbye now"]) + steps = _mock_chain_steps(mocker) + argv = ["eval", "manifest.csv", "--llm", "x", "--model", "gpt-5", "--max-tokens", "42"] + assert runner.invoke(app, argv).exit_code == 0 + assert steps.call_args.kwargs["model"] == "gpt-5" + assert steps.call_args.kwargs["max_tokens"] == 42 + + +# the across-items reduce step + + +def test_reduce_combines_map_outputs_and_lands_in_payload(tmp_path, mocker): + _auth() + _write_manifest(tmp_path) + _mock_transcribe(mocker, ["hello there", "goodbye now"]) + _mock_chain_steps(mocker) + captured = {} + + def fake_reduce(api_key, prompts, *, transcript_text, model, max_tokens): + captured["text"] = transcript_text + captured["prompts"] = prompts + return "FINAL" + + mocker.patch("aai_cli.commands.evaluate._exec.gateway.run_chain", side_effect=fake_reduce) + payload = _payload_of( + runner.invoke( + app, ["eval", "manifest.csv", "--llm", "judge", "--llm-reduce", "rank", "--json"] + ) + ) + assert payload["reduce"] == { + "model": "claude-haiku-4-5-20251001", + "prompts": ["rank"], + "output": "FINAL", + } + # The reduce saw each item's last --llm output under an item header. + assert "### Item: a.wav" in captured["text"] + assert "MAP::hello there" in captured["text"] and "MAP::goodbye now" in captured["text"] + assert captured["prompts"] == ["rank"] + + +def test_reduce_falls_back_to_transcript_text(tmp_path, mocker): + _auth() + _write_manifest(tmp_path) + _mock_transcribe(mocker, ["hello there", "goodbye now"]) + captured = {} + + def fake_reduce(api_key, prompts, *, transcript_text, model, max_tokens): + captured["text"] = transcript_text + return "FINAL" + + mocker.patch("aai_cli.commands.evaluate._exec.gateway.run_chain", side_effect=fake_reduce) + # No --llm map ran, so the raw transcript text feeds the reduce. + result = runner.invoke(app, ["eval", "manifest.csv", "--llm-reduce", "sum"]) + assert result.exit_code == 0 + assert "hello there" in captured["text"] + assert "--llm-reduce" in result.output and "FINAL" in result.output + + +def test_reduce_skips_gateway_when_nothing_to_aggregate(tmp_path, mocker): + from aai_cli.core.errors import APIError + + _auth() + _write_manifest(tmp_path) + mocker.patch( + "aai_cli.commands.evaluate._exec.client.transcribe", + autospec=True, + side_effect=[APIError("boom"), APIError("boom")], + ) + reduce = mocker.patch("aai_cli.commands.evaluate._exec.gateway.run_chain") + result = runner.invoke(app, ["eval", "manifest.csv", "--llm-reduce", "sum", "--json"]) + reduce.assert_not_called() + assert "Nothing to reduce" in result.output + assert "reduce" not in _payload_of(result) + + +def test_llm_status_messages(tmp_path, mocker, monkeypatch): + _auth() + _write_manifest(tmp_path) + _mock_transcribe(mocker, ["hello there", "goodbye now"]) + _mock_chain_steps(mocker) + mocker.patch( + "aai_cli.commands.evaluate._exec.gateway.run_chain", + side_effect=lambda *a, **k: "FINAL", + ) + seen = [] + + @contextlib.contextmanager + def fake_status(message, *, json_mode, quiet): + seen.append(message) + yield + + monkeypatch.setattr("aai_cli.commands.evaluate._exec.output.status", fake_status) + argv = ["eval", "manifest.csv", "--llm", "judge", "--llm-reduce", "rank"] + assert runner.invoke(app, argv).exit_code == 0 + assert "Running --llm over 2 transcriptsโ€ฆ" in seen + assert "Running --llm-reduce over all itemsโ€ฆ" in seen + + +# ---------------------------------------------------------------- pure helpers + + +def test_llm_options_maps_fields(): + opts = evaluate_exec.EvalOptions( + dataset="d", split=None, subset=None, limit=10, audio_column=None, + text_column=None, speech_model=None, language_code=None, concurrency=1, + llm_prompt=["a"], llm_reduce=["b", "c"], model="m", max_tokens=5, + ) # fmt: skip + llm_opts = opts.llm_options() + assert llm_opts.prompts == ["a"] + assert llm_opts.reduce_prompts == ["b", "c"] + assert llm_opts.model == "m" + assert llm_opts.max_tokens == 5 + + +def test_llm_options_default_to_empty_chains(): + opts = evaluate_exec.EvalOptions( + dataset="d", split=None, subset=None, limit=10, audio_column=None, + text_column=None, speech_model=None, language_code=None, concurrency=1, + llm_prompt=None, llm_reduce=None, model="m", max_tokens=5, + ) # fmt: skip + llm_opts = opts.llm_options() + assert llm_opts.prompts == [] and llm_opts.reduce_prompts == [] + + +def _result(row, hypothesis): + return evaluate_exec._ItemResult(row=row, words=None, latency=0.0, hypothesis=hypothesis) + + +def test_reduce_input_prefers_last_llm_output_else_transcript(): + with_llm = _result( + {"item": "a", "llm": {"steps": [{"output": "first"}, {"output": "last"}]}}, "raw" + ) + assert evaluate_exec._reduce_input(with_llm) == "last" + no_llm = _result({"item": "b"}, "raw text") + assert evaluate_exec._reduce_input(no_llm) == "raw text" + empty_steps = _result({"item": "c", "llm": {"steps": []}}, "fallback") + assert evaluate_exec._reduce_input(empty_steps) == "fallback" + + +def test_gather_reduce_inputs_headers_and_skips_failures_and_blanks(): + results = [ + _result({"item": "a"}, "hello"), + _result({"item": "fail", "error": "boom"}, None), # failed: no hypothesis + _result({"item": "blank"}, ""), # empty transcript contributes nothing + ] + combined = evaluate_exec._gather_reduce_inputs(results) + assert combined == "### Item: a\nhello" + + +def test_final_llm_output_distinguishes_absent_from_empty(): + assert evaluate_exec._final_llm_output({"item": "a"}) is None + assert evaluate_exec._final_llm_output({"llm": {"steps": []}}) == "" + assert evaluate_exec._final_llm_output({"llm": {"steps": [{"output": "x"}]}}) == "x" + + +def test_render_blocks_drop_out_when_absent(): + assert evaluate_exec._llm_block({"rows": [{"item": "a"}]}) is None + assert evaluate_exec._reduce_block({}) is None + + +def _assign(obj, attribute, value): + setattr(obj, attribute, value) + + +def test_llm_options_is_immutable(): + import dataclasses + + llm_opts = evaluate_exec._LlmOptions(prompts=[], reduce_prompts=[], model="m", max_tokens=1) + with pytest.raises(dataclasses.FrozenInstanceError): + _assign(llm_opts, "model", "other")