diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 00000000..f1900a66 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,8 @@ +{ + "python.defaultInterpreterPath": "${workspaceFolder}/.venv/bin/python", + "python.testing.pytestEnabled": true, + "python.testing.unittestEnabled": false, + "evenBetterToml.schema.enabled": false, + "mypy-type-checker.importStrategy": "fromEnvironment", + "mypy-type-checker.preferDaemon": true +} diff --git a/aai_cli/app/init_exec.py b/aai_cli/app/init_exec.py index bf1602ef..10bfd731 100644 --- a/aai_cli/app/init_exec.py +++ b/aai_cli/app/init_exec.py @@ -61,7 +61,11 @@ def _pick_template() -> str: choice = questionary.select( "Pick a template", choices=[ - questionary.Choice(title=templates.title_for(t), value=t) + questionary.Choice( + title=templates.title_for(t), + value=t, + description=templates.description_for(t), + ) for t in templates.TEMPLATE_ORDER ], ).ask() @@ -101,6 +105,10 @@ def _active_env_vars() -> dict[str, str]: "ASSEMBLYAI_STREAMING_HOST": env.streaming_host, # Voice Agent host mirrors the streaming host's naming across environments. "ASSEMBLYAI_AGENTS_HOST": env.streaming_host.replace("streaming", "agents", 1), + # Streaming-TTS host for the cascade (agent-framework) template. Empty in + # production, where streaming TTS has no host; that template then refuses to + # run and points at --sandbox. + "ASSEMBLYAI_TTS_HOST": env.streaming_tts_host, } diff --git a/aai_cli/init/scaffold.py b/aai_cli/init/scaffold.py index a25159ee..55d851df 100644 --- a/aai_cli/init/scaffold.py +++ b/aai_cli/init/scaffold.py @@ -39,7 +39,7 @@ def _template_root(template: str) -> Traversable: ) # Navigate from the `aai_cli.init` package (templates/ has no __init__.py, so it # is not itself an importable package). 
def _copy_tree(node: Traversable, dest: Path, *, top_level: bool = True) -> None:
    """Recursively copy the template tree rooted at ``node`` into ``dest``.

    Entries named in ``_SKIP_NAMES`` and compiled ``.pyc`` files are never copied.
    Names found in ``_DOTFILE_RENAMES`` are written under their mapped name
    (presumably the dot-prefixed form, e.g. ``gitignore`` -> ``.gitignore`` —
    confirm against the table).

    Args:
        node: Template directory (or sub-directory) to read from.
        dest: Directory on disk that receives the copies.
        top_level: True only for the template root. The root ``__init__.py`` is an
            in-repo marker that makes the template importable for type-checking; it
            is not part of the shipped app, so it is skipped there. Nested
            ``__init__.py`` files (e.g. ``api/__init__.py``) ARE copied, because the
            scaffolded app's relative imports need them.
    """
    for entry in node.iterdir():
        entry_name = entry.name
        unwanted = entry_name in _SKIP_NAMES or entry_name.endswith(".pyc")
        root_marker = top_level and entry_name == "__init__.py"
        if unwanted or root_marker:
            continue

        target = dest / _DOTFILE_RENAMES.get(entry_name, entry_name)
        if not entry.is_dir():
            target.parent.mkdir(parents=True, exist_ok=True)  # pragma: no mutate
            target.write_bytes(entry.read_bytes())
            continue

        # exist_ok keeps a re-scaffold into an existing project idempotent.
        target.mkdir(parents=True, exist_ok=True)  # pragma: no mutate
        _copy_tree(entry, target, top_level=False)
from __future__ import annotations

# Template registry: id -> human-facing title shown in the picker. Ids are
# Vercel-style project/example slugs rather than CLI command names.
#
# Every id here MUST have a directory under templates/<dir_for(id)>/ (a test enforces
# both directions) — the picker must never advertise a template that would crash on
# scaffold.
TEMPLATES: dict[str, str] = {
    "audio-transcription": "Audio Transcription",
    "live-captions": "Live Captions",
    "voice-agent": "Voice Agent",
    "agent-framework": "Agent Framework",
}

# Display order for the picker and `--help`.
TEMPLATE_ORDER: tuple[str, ...] = (
    "audio-transcription",
    "live-captions",
    "voice-agent",
    "agent-framework",
)


# One-line description shown beside each title in the interactive picker. Keys must
# match TEMPLATES exactly (a test enforces both directions).
DESCRIPTIONS: dict[str, str] = {
    "audio-transcription": "Transcribe audio & video files, URLs, and YouTube — speaker labels and audio intelligence",
    "live-captions": "Live real-time captions from your microphone over the Streaming API",
    "voice-agent": "Full-duplex voice agent (speech in, LLM reply, speech out) via the Voice Agent API",
    "agent-framework": "Cascaded voice agent you orchestrate: Streaming STT, the LLM Gateway, and sandbox TTS",
}


def dir_for(name: str) -> str:
    """Map a kebab-case template id to its underscore-separated package directory."""
    return "_".join(name.split("-"))


def is_template(name: str) -> bool:
    """Report whether ``name`` is a registered template id."""
    return name in TEMPLATES


def title_for(name: str) -> str:
    """Return the picker title for a template id, falling back to the raw id."""
    try:
        return TEMPLATES[name]
    except KeyError:
        return name


def description_for(name: str) -> str:
    """Return the one-line picker description for a template id ('' when unknown)."""
    return DESCRIPTIONS[name] if name in DESCRIPTIONS else ""
+- `static/audio.js`: microphone pipeline, PCM conversion, playback queue, barge-in. +- `static/styles.css`: visual styling only; the top `:root` block is the theme edit point. +- `static/index.html`: page structure and static asset links. + +## Change Points + +- Model, voice, prompt, greeting, sample rates: edit `api/settings.py`. +- Cascade behavior (turn detection, barge-in, LLM->TTS piping): edit `api/cascade.py`. +- Transcript log rendering: edit `addTurn` in `static/app.js`. +- Playback, barge-in, or PCM conversion: edit `static/audio.js`. + +## Invariants + +- Never expose `ASSEMBLYAI_API_KEY` or any server secret in `static/`. +- Streaming TTS is sandbox-only; keep this app pointed at the sandbox hosts. +- `reply.audio` carries base64 PCM on the `data` field. +- The browser <-> backend event protocol matches the `voice-agent` template — keep it + stable so `static/audio.js` and the UI stay reusable. +- Keep the app buildless unless the user explicitly asks for a frontend toolchain. diff --git a/aai_cli/init/templates/audio-transcription/Dockerfile b/aai_cli/init/templates/agent_framework/Dockerfile similarity index 100% rename from aai_cli/init/templates/audio-transcription/Dockerfile rename to aai_cli/init/templates/agent_framework/Dockerfile diff --git a/aai_cli/init/templates/audio-transcription/Procfile b/aai_cli/init/templates/agent_framework/Procfile similarity index 100% rename from aai_cli/init/templates/audio-transcription/Procfile rename to aai_cli/init/templates/agent_framework/Procfile diff --git a/aai_cli/init/templates/agent_framework/README.md b/aai_cli/init/templates/agent_framework/README.md new file mode 100644 index 00000000..019152a1 --- /dev/null +++ b/aai_cli/init/templates/agent_framework/README.md @@ -0,0 +1,49 @@ +# Talk to a cascaded voice agent — AssemblyAI agent-framework starter + +Click connect and talk. 
Unlike the `voice-agent` template (which uses AssemblyAI's +all-in-one Voice Agent API), this app is a **cascade your own backend orchestrates**: +Streaming STT transcribes you, the LLM Gateway generates a reply, and streaming TTS +speaks it back — with turn detection and barge-in handled server-side. The browser +holds one WebSocket to your backend, so your API key never reaches the client. + +## Sandbox-only + +Streaming TTS has no production host, so the whole cascade runs against the AssemblyAI +sandbox with a sandbox key. Scaffold it that way: + +```sh +assembly --sandbox init agent-framework +``` + +That pins the sandbox hosts in `.env`. Running against production exits with a hint. + +## Run locally + +```sh +assembly dev # opens http://localhost:3000 (allow microphone access; headphones recommended) +``` + +`ASSEMBLYAI_API_KEY` is read from `.env` (created for you by `assembly init`). + +## Deploy + +This app keeps a **long-running WebSocket**, so it needs a persistent process — not +Vercel's serverless functions. Use the shipped `Procfile`/`Dockerfile` on Render, +Railway, Fly.io, or Google Cloud Run (`gcloud run deploy --source .`): + +```sh +uvicorn api.index:app --host 0.0.0.0 --port $PORT +``` + +Set `ASSEMBLYAI_API_KEY` and the three sandbox host vars (`ASSEMBLYAI_STREAMING_HOST`, +`ASSEMBLYAI_TTS_HOST`, `ASSEMBLYAI_LLM_GATEWAY_URL`) in the platform's environment. + +## Ideas to extend + +- Change the `MODEL`, `VOICE`, `SYSTEM_PROMPT`, `GREETING`, or `MAX_HISTORY` in + `api/settings.py`. +- Replies already stream into TTS sentence-by-sentence as the LLM produces them + (`_generate_reply` flushes on each `.`/`!`/`?`), and a sliding window of + `MAX_HISTORY` messages gives the agent memory of the conversation. Tune the + sentence boundary or `MAX_HISTORY` to trade latency, cost, and recall. +- Add tools (function calling) on the LLM leg so the agent can look things up. 
diff --git a/aai_cli/init/templates/audio-transcription/api/__init__.py b/aai_cli/init/templates/agent_framework/__init__.py similarity index 100% rename from aai_cli/init/templates/audio-transcription/api/__init__.py rename to aai_cli/init/templates/agent_framework/__init__.py diff --git a/aai_cli/init/templates/live-captions/api/__init__.py b/aai_cli/init/templates/agent_framework/api/__init__.py similarity index 100% rename from aai_cli/init/templates/live-captions/api/__init__.py rename to aai_cli/init/templates/agent_framework/api/__init__.py diff --git a/aai_cli/init/templates/agent_framework/api/cascade.py b/aai_cli/init/templates/agent_framework/api/cascade.py new file mode 100644 index 00000000..e2b27bcf --- /dev/null +++ b/aai_cli/init/templates/agent_framework/api/cascade.py @@ -0,0 +1,404 @@ +"""Server-side cascade orchestrator for the agent-framework template. + +The browser opens one WebSocket to FastAPI and the backend wires three AssemblyAI +primitives together — Streaming STT, the LLM Gateway, and streaming TTS — so every +credential stays on the server. The orchestrator takes injected connect-factories and +an LLM callable (`Deps`) so it runs hermetically against fakes in tests, the same +seam `aai_cli/tts/session.py` uses. 
+ +Browser protocol (identical to the voice-agent template): + in : {"type": "input.audio", "audio": } + out: transcript.user / transcript.agent / reply.audio (base64 in `data`) / + input.speech.started / reply.done / session.error +""" + +from __future__ import annotations + +import asyncio +import base64 +import contextlib +import json +from collections.abc import AsyncIterator, Awaitable, Callable +from dataclasses import dataclass +from typing import TYPE_CHECKING, Protocol +from urllib.parse import urlencode + +from fastapi import WebSocket + +if TYPE_CHECKING: + from openai.types.chat import ChatCompletionMessageParam + from websockets.asyncio.client import ClientConnection + + +class _Settings(Protocol): + API_KEY: str + STREAMING_HOST: str + TTS_HOST: str + LLM_GATEWAY_URL: str + MODEL: str + VOICE: str + SYSTEM_PROMPT: str + GREETING: str + MAX_HISTORY: int + INPUT_SAMPLE_RATE: int + OUTPUT_SAMPLE_RATE: int + + +class _Browser(Protocol): + async def send(self, event: dict[str, object]) -> None: + """Send one protocol event to the browser.""" + + async def recv(self) -> dict[str, object] | None: + """Receive the next browser message, or None once the socket closes.""" + + +def unavailable_reason(settings: _Settings) -> str | None: + """Why the cascade can't run, or None when it can. + + Streaming TTS has no production host, so an empty TTS host means the user must + re-scaffold against the sandbox. + """ + if not settings.API_KEY: + return "ASSEMBLYAI_API_KEY is not set — configure it in your deployment's environment." + if not settings.TTS_HOST: + return ( + "Streaming TTS has no production host, so this cascade is sandbox-only. " + "Re-scaffold against the sandbox: assembly --sandbox init agent-framework." 
+ ) + return None + + +def stt_url(settings: _Settings) -> str: + """The Streaming v3 WebSocket URL with PCM + turn-formatting params.""" + params = urlencode( + { + "sample_rate": settings.INPUT_SAMPLE_RATE, + "encoding": "pcm_s16le", + "speech_model": "u3-rt-pro", + "format_turns": "true", + } + ) + return f"wss://{settings.STREAMING_HOST}/v3/ws?{params}" + + +def tts_url(settings: _Settings) -> str: + """The streaming-TTS WebSocket URL for the configured voice and sample rate.""" + params = urlencode({"voice": settings.VOICE, "sample_rate": settings.OUTPUT_SAMPLE_RATE}) + return f"wss://{settings.TTS_HOST}/v1/ws/?{params}" + + +def is_final_user_turn(msg: dict[str, object]) -> bool: + """True for a finalized, formatted end-of-turn (the cue to reply).""" + return bool(msg.get("end_of_turn")) and bool(msg.get("turn_is_formatted")) + + +def build_messages( + system_prompt: str, history: list[ChatCompletionMessageParam] +) -> list[ChatCompletionMessageParam]: + """The chat `messages` array: the system prompt followed by the conversation so far.""" + return [{"role": "system", "content": system_prompt}, *history] + + +def _trim_history(history: list[ChatCompletionMessageParam], max_messages: int) -> None: + """Cap the running history to the most recent ``max_messages`` (sliding window).""" + if len(history) > max_messages: + del history[: len(history) - max_messages] + + +def _split_sentences(buffer: str) -> tuple[list[str], str]: + """Split off complete sentences (each ending in . ! ?). Return (sentences, remainder).""" + sentences: list[str] = [] + start = 0 + for index, char in enumerate(buffer): + if char in ".!?": + sentence = buffer[start : index + 1].strip() + if sentence: + sentences.append(sentence) + start = index + 1 + return sentences, buffer[start:] + + +@dataclass +class Deps: + """Injected cascade dependencies. 
`Deps.real(settings)` wires the live clients; + tests pass fakes with the same shapes.""" + + connect_stt: Callable[[], Awaitable[ClientConnection]] + connect_tts: Callable[[], Awaitable[ClientConnection]] + llm_stream: Callable[[list[ChatCompletionMessageParam]], AsyncIterator[str]] + settings: _Settings + + @classmethod + def real(cls, settings: _Settings) -> Deps: + return cls( + connect_stt=lambda: _connect_stt(settings), + connect_tts=lambda: _connect_tts(settings), + llm_stream=lambda messages: _llm_stream(settings, messages), + settings=settings, + ) + + +class Session: + """Tracks the in-flight reply so a new user turn can barge in and cancel it.""" + + def __init__(self) -> None: + self.reply_task: asyncio.Task[None] | None = None + self.history: list[ChatCompletionMessageParam] = [] + + async def cancel_reply(self) -> None: + task, self.reply_task = self.reply_task, None + if task is not None and not task.done(): + task.cancel() + with contextlib.suppress(asyncio.CancelledError, Exception): + await task + + async def drain(self) -> None: + """Await the in-flight reply to natural completion (used when STT closes).""" + task = self.reply_task + if task is not None: + with contextlib.suppress(Exception): + await task + + +async def _connect_stt(settings: _Settings) -> ClientConnection: + import websockets + + return await websockets.connect( + stt_url(settings), additional_headers={"Authorization": settings.API_KEY} + ) + + +async def _connect_tts(settings: _Settings) -> ClientConnection: + import websockets + + # max_size=None: a synthesis's Audio frames can exceed the 1 MiB default. 
+ return await websockets.connect( + tts_url(settings), + additional_headers={"Authorization": settings.API_KEY}, + max_size=None, + ) + + +async def _llm_stream( + settings: _Settings, messages: list[ChatCompletionMessageParam] +) -> AsyncIterator[str]: + from openai import AsyncOpenAI + + client = AsyncOpenAI(base_url=settings.LLM_GATEWAY_URL, api_key=settings.API_KEY) + stream = await client.chat.completions.create( + model=settings.MODEL, messages=messages, stream=True + ) + async for chunk in stream: + # The gateway (Anthropic-backed, OpenAI-compatible) ends the stream with a + # usage/final chunk that carries no choices — skip it instead of IndexError-ing. + if not chunk.choices: + continue + delta = chunk.choices[0].delta.content + if delta: + yield delta + + +async def _safe_close(conn: ClientConnection) -> None: + with contextlib.suppress(Exception): + await conn.close() + + +async def _pump_mic(browser: _Browser, stt: ClientConnection) -> None: + """Forward each base64 mic frame from the browser to the STT socket.""" + while True: + msg = await browser.recv() + if msg is None: + return + audio = msg.get("audio") if msg.get("type") == "input.audio" else None + if isinstance(audio, str): + try: + pcm = base64.b64decode(audio) + except ValueError: + continue # ignore a malformed audio frame rather than kill the session + await stt.send(pcm) + + +async def _synthesize(browser: _Browser, tts: ClientConnection, text: str) -> None: + """Drive the TTS protocol on an open socket, forwarding Audio as reply.audio.""" + begin = json.loads(await tts.recv()) + if begin.get("type") != "Begin": + raise RuntimeError(f"TTS did not begin (got {begin.get('type')!r}).") + await tts.send(json.dumps({"type": "Generate", "text": text})) + await tts.send(json.dumps({"type": "Flush"})) + # Iterate the socket (like _pump_stt) so a close before the final Audio frame ends + # the loop cleanly instead of raising ConnectionClosed out of the reply. 
+ async for raw in tts: + frame = json.loads(raw) + kind = frame.get("type") + if kind == "Audio": + await browser.send({"type": "reply.audio", "data": frame.get("audio", "")}) + if frame.get("is_final"): + break + elif kind == "Error": + raise RuntimeError(frame.get("error") or "TTS error") + with contextlib.suppress(Exception): + await tts.send(json.dumps({"type": "Terminate"})) + await _safe_close(tts) + + +async def _speak(browser: _Browser, deps: Deps, text: str) -> None: + """Emit agent text, synthesize it, and mark the reply done. A synthesis failure + becomes one clean session.error (mirroring _generate_reply) — without this the + greeting runs as a bare task whose exception would only ever be swallowed by + cancel_reply/drain, leaving the user with no audio and no error.""" + await browser.send({"type": "transcript.agent", "text": text}) + try: + tts = await deps.connect_tts() + try: + await _synthesize(browser, tts, text) + finally: + await _safe_close(tts) + except asyncio.CancelledError: + raise + except Exception as exc: + await browser.send({"type": "session.error", "message": str(exc)}) + return + await browser.send({"type": "reply.done", "status": "completed"}) + + +async def _speak_sentence(browser: _Browser, deps: Deps, text: str) -> None: + """Show + synthesize one sentence of a streamed reply (no reply.done).""" + await browser.send({"type": "transcript.agent", "text": text}) + tts = await deps.connect_tts() + try: + await _synthesize(browser, tts, text) + finally: + await _safe_close(tts) + + +async def _generate_reply(browser: _Browser, deps: Deps, session: Session) -> None: + """Stream the LLM reply sentence-by-sentence into TTS (low perceived latency), then + record it in the conversation history. 
Errors surface as session.error.""" + messages = build_messages(deps.settings.SYSTEM_PROMPT, session.history) + spoken: list[str] = [] + try: + buffer = "" + async for delta in deps.llm_stream(messages): + buffer += delta + sentences, buffer = _split_sentences(buffer) + for sentence in sentences: + spoken.append(sentence) + await _speak_sentence(browser, deps, sentence) + tail = buffer.strip() + if tail: + spoken.append(tail) + await _speak_sentence(browser, deps, tail) + reply = " ".join(spoken).strip() + if not reply: + await browser.send({"type": "reply.done", "status": "empty"}) + return + session.history.append({"role": "assistant", "content": reply}) + _trim_history(session.history, deps.settings.MAX_HISTORY) + await browser.send({"type": "reply.done", "status": "completed"}) + except asyncio.CancelledError: + # Barged-in mid-reply: record what was actually spoken so history keeps its + # user/assistant alternation (otherwise the next user turn would follow this + # one with no assistant turn between them). + partial = " ".join(spoken).strip() + if partial: + session.history.append({"role": "assistant", "content": partial}) + _trim_history(session.history, deps.settings.MAX_HISTORY) + raise + except Exception as exc: # any leg failure becomes one clean session.error event + await browser.send({"type": "session.error", "message": str(exc)}) + + +async def maybe_barge_in(browser: _Browser, session: Session) -> None: + """If a reply is playing, tell the browser to stop and cancel it.""" + if session.reply_task is not None and not session.reply_task.done(): + await browser.send({"type": "input.speech.started"}) + await session.cancel_reply() + + +async def _pump_stt(browser: _Browser, stt: ClientConnection, deps: Deps, session: Session) -> None: + """Read STT turns: display only the finalized (formatted end-of-turn) user + transcript and reply to it. An interim turn isn't shown — it only barges in on a + playing reply. 
Drain the last reply when the socket closes.""" + async for raw in stt: + msg = json.loads(raw) + if msg.get("type") != "Turn": + continue + text = msg.get("transcript", "") + if not text: + continue + if is_final_user_turn(msg): + await browser.send({"type": "transcript.user", "text": text}) + # Stop any reply still playing AND tell the browser to flush its queued + # audio (cancel_reply alone is server-side only — the old reply keeps + # playing in the browser). + await maybe_barge_in(browser, session) + session.history.append({"role": "user", "content": text}) + _trim_history(session.history, deps.settings.MAX_HISTORY) + session.reply_task = asyncio.create_task(_generate_reply(browser, deps, session)) + else: + await maybe_barge_in(browser, session) + await session.drain() + + +class _SessionClosed(Exception): + """Sentinel that unwinds the session TaskGroup when one pump returns — i.e. the + browser disconnected or the STT socket closed. Raising it cancels the sibling pump.""" + + +async def _until_closed(pump: Awaitable[None]) -> None: + """Run a pump to its natural end, then raise to close the session TaskGroup.""" + await pump + raise _SessionClosed + + +async def run_session(browser: _Browser, deps: Deps) -> None: + """Run one browser session: greet, then cascade STT -> LLM -> TTS until either + side closes. All credentials stay server-side.""" + reason = unavailable_reason(deps.settings) + if reason is not None: + await browser.send({"type": "session.error", "message": reason}) + return + try: + stt = await deps.connect_stt() + except Exception as exc: # any connect/setup failure becomes one clean session.error + await browser.send( + {"type": "session.error", "message": f"Could not start the session: {exc}"} + ) + return + + session = Session() + # Seed history with the greeting so the model has a record of its opening line. 
+ session.history.append({"role": "assistant", "content": deps.settings.GREETING}) + session.reply_task = asyncio.create_task(_speak(browser, deps, deps.settings.GREETING)) + try: + # Race the two pumps: whichever returns first (browser hangs up → mic; STT + # socket closes → listen) raises _SessionClosed, and the TaskGroup cancels the + # other pump for us — no manual cancel/gather bookkeeping. + async with asyncio.TaskGroup() as tg: + tg.create_task(_until_closed(_pump_mic(browser, stt))) + tg.create_task(_until_closed(_pump_stt(browser, stt, deps, session))) + except* _SessionClosed: + pass + finally: + await session.cancel_reply() + await _safe_close(stt) + + +class FastAPIBrowser: + """Adapts a Starlette WebSocket to the (send, recv) shape run_session expects. + recv() returns None when the client disconnects, so the pumps exit cleanly.""" + + def __init__(self, websocket: WebSocket) -> None: + self._ws = websocket + + async def send(self, event: dict[str, object]) -> None: + await self._ws.send_json(event) + + async def recv(self) -> dict[str, object] | None: + from fastapi import WebSocketDisconnect + + try: + data: dict[str, object] = await self._ws.receive_json() + except WebSocketDisconnect: + return None + return data diff --git a/aai_cli/init/templates/agent_framework/api/index.py b/aai_cli/init/templates/agent_framework/api/index.py new file mode 100644 index 00000000..f65a6990 --- /dev/null +++ b/aai_cli/init/templates/agent_framework/api/index.py @@ -0,0 +1,37 @@ +"""Talk to a cascaded voice agent — AssemblyAI agent-framework starter (FastAPI). + +The browser opens one WebSocket to this backend, which runs the cascade itself — +Streaming STT -> LLM Gateway -> streaming TTS — so your API key never reaches the +client. Streaming TTS is sandbox-only, so scaffold with `assembly --sandbox init +agent-framework` and use a sandbox key. 
+ + WS /ws <- {"type":"input.audio","audio":} ; -> transcripts + reply.audio +""" + +from __future__ import annotations + +from pathlib import Path + +from fastapi import FastAPI, WebSocket +from fastapi.responses import FileResponse +from fastapi.staticfiles import StaticFiles + +from . import cascade, settings + +ROOT = Path(__file__).resolve().parent.parent +STATIC = ROOT / "static" +app = FastAPI() +app.mount("/static", StaticFiles(directory=STATIC), name="static") + + +@app.get("/") +def index() -> FileResponse: + return FileResponse(STATIC / "index.html") + + +@app.websocket("/ws") +async def ws(websocket: WebSocket) -> None: + """Accept the browser socket and run one cascade session over it.""" + await websocket.accept() + browser = cascade.FastAPIBrowser(websocket) + await cascade.run_session(browser, cascade.Deps.real(settings)) diff --git a/aai_cli/init/templates/agent_framework/api/settings.py b/aai_cli/init/templates/agent_framework/api/settings.py new file mode 100644 index 00000000..8cfda0a5 --- /dev/null +++ b/aai_cli/init/templates/agent_framework/api/settings.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +import os + +from dotenv import load_dotenv + +load_dotenv() + +API_KEY: str = os.environ.get("ASSEMBLYAI_API_KEY", "") + +# Hosts. `assembly init` pins these to the active environment. Streaming TTS only +# exists in the sandbox, so this whole cascade is sandbox-only (see README); the +# defaults point at the sandbox so a bare clone works with a sandbox key. +STREAMING_HOST: str = os.environ.get( + "ASSEMBLYAI_STREAMING_HOST", "streaming.sandbox000.assemblyai-labs.com" +) +TTS_HOST: str = os.environ.get( + "ASSEMBLYAI_TTS_HOST", "streaming-tts.sandbox000.assemblyai-labs.com" +) +LLM_GATEWAY_URL: str = os.environ.get( + "ASSEMBLYAI_LLM_GATEWAY_URL", "https://llm-gateway.sandbox000.assemblyai-labs.com/v1" +) + +# The cascade's knobs — edit these to change behavior. 
+MODEL: str = "claude-haiku-4-5-20251001" +VOICE: str = "jane" +SYSTEM_PROMPT: str = ( + "You are a friendly, concise voice assistant. Keep replies short and conversational. " + "Your reply is read aloud by a text-to-speech engine, so write plain spoken prose — " + "no markdown, emoji, bullet lists, or code." +) +GREETING: str = "Hi! I'm your AssemblyAI voice agent. What can I help you with?" +MAX_HISTORY: int = 40 # keep the last N messages of conversation context (sliding window) + +# 16 kHz PCM in (Streaming v3); 24 kHz PCM out (streaming TTS). +INPUT_SAMPLE_RATE: int = 16000 +OUTPUT_SAMPLE_RATE: int = 24000 diff --git a/aai_cli/init/templates/audio-transcription/dockerignore b/aai_cli/init/templates/agent_framework/dockerignore similarity index 100% rename from aai_cli/init/templates/audio-transcription/dockerignore rename to aai_cli/init/templates/agent_framework/dockerignore diff --git a/aai_cli/init/templates/agent_framework/env.example b/aai_cli/init/templates/agent_framework/env.example new file mode 100644 index 00000000..6a119b9e --- /dev/null +++ b/aai_cli/init/templates/agent_framework/env.example @@ -0,0 +1,6 @@ +ASSEMBLYAI_API_KEY=your_assemblyai_api_key_here +# This cascade uses streaming TTS, which is sandbox-only — use a sandbox key and the +# sandbox hosts (assembly --sandbox init agent-framework fills these in for you): +# ASSEMBLYAI_STREAMING_HOST=streaming.sandbox000.assemblyai-labs.com +# ASSEMBLYAI_TTS_HOST=streaming-tts.sandbox000.assemblyai-labs.com +# ASSEMBLYAI_LLM_GATEWAY_URL=https://llm-gateway.sandbox000.assemblyai-labs.com/v1 diff --git a/aai_cli/init/templates/audio-transcription/gitignore b/aai_cli/init/templates/agent_framework/gitignore similarity index 100% rename from aai_cli/init/templates/audio-transcription/gitignore rename to aai_cli/init/templates/agent_framework/gitignore diff --git a/aai_cli/init/templates/agent_framework/requirements.txt b/aai_cli/init/templates/agent_framework/requirements.txt new file mode 100644 
index 00000000..72fea20c --- /dev/null +++ b/aai_cli/init/templates/agent_framework/requirements.txt @@ -0,0 +1,8 @@ +fastapi>=0.136.3 +uvicorn>=0.30.0 +websockets>=14.1 +openai>=1.54.0 +python-dotenv>=1.2.2 +# Pin starlette directly: FastAPI's own floor still admits versions with known CVEs, +# so raise the transitive floor above them. +starlette>=1.2.1 diff --git a/aai_cli/init/templates/audio-transcription/runtime.txt b/aai_cli/init/templates/agent_framework/runtime.txt similarity index 100% rename from aai_cli/init/templates/audio-transcription/runtime.txt rename to aai_cli/init/templates/agent_framework/runtime.txt diff --git a/aai_cli/init/templates/agent_framework/static/app.js b/aai_cli/init/templates/agent_framework/static/app.js new file mode 100644 index 00000000..388edaea --- /dev/null +++ b/aai_cli/init/templates/agent_framework/static/app.js @@ -0,0 +1,130 @@ +const SESSION_CONFIG = { + inputSampleRate: 16000, + outputSampleRate: 24000, + processorBufferSize: 4096, + microphone: { audio: { echoCancellation: true, noiseSuppression: false } }, +}; + +const connBtn = document.getElementById("conn"); +const statusEl = document.getElementById("status"); +const logEl = document.getElementById("log"); + +let ws = null; +let micPipeline = null; +let player = null; +let connected = false; + +connBtn.addEventListener("click", () => + connected ? hangup() : connect().catch(fail), +); + +function setStatus(message, state) { + statusEl.textContent = message; + statusEl.dataset.state = state; +} + +function wsUrl() { + const scheme = location.protocol === "https:" ? 
"wss" : "ws"; + return `${scheme}://${location.host}/ws`; +} + +async function connect() { + setStatus("Connecting...", "idle"); + ws = new WebSocket(wsUrl()); + ws.onopen = () => startMic().catch(fail); + ws.onmessage = (event) => onEvent(JSON.parse(event.data)); + ws.onerror = () => fail("WebSocket error"); + ws.onclose = () => { + if (connected) hangup(); + }; +} + +async function startMic() { + // Create the player first: the server speaks the greeting the instant the + // socket opens, so `reply.audio` can arrive before getUserMedia's permission + // prompt resolves. Setting `player` synchronously here (before the first + // await) guarantees it exists when onEvent handles that first audio frame. + player = AudioHelpers.createPcmPlayer({ + sampleRate: SESSION_CONFIG.outputSampleRate, + }); + await player.resume(); + const stream = await navigator.mediaDevices.getUserMedia( + SESSION_CONFIG.microphone, + ); + micPipeline = AudioHelpers.createMicrophonePipeline(stream, { + bufferSize: SESSION_CONFIG.processorBufferSize, + }); + await micPipeline.start((frame, sampleRate) => { + if (!ws || ws.readyState !== WebSocket.OPEN) return; + const pcm = AudioHelpers.downsampleToPCM( + frame, + sampleRate, + SESSION_CONFIG.inputSampleRate, + ); + ws.send( + JSON.stringify({ + type: "input.audio", + audio: AudioHelpers.bytesToB64(pcm), + }), + ); + }); + + connected = true; + connBtn.textContent = "■ Hang up"; + connBtn.dataset.state = "connected"; + setStatus("● Connected - just talk", "live"); +} + +function onEvent(event) { + switch (event.type) { + case "transcript.user": + return addTurn("you", "You", event.text); + case "transcript.agent": + return addTurn("agent", "Agent", event.text); + case "reply.audio": + if (player) player.playBase64Chunk(event.data); + return; + case "input.speech.started": + return bargeIn(); + case "reply.done": + return; + case "session.error": + return fail(event.message || "session error"); + } +} + +function bargeIn() { + if (player) 
player.stopQueuedAudio(); +} + +function addTurn(speakerKind, speaker, text) { + if (!text) return; + const turn = document.createElement("div"); + turn.className = "conversation-turn"; + turn.dataset.speaker = speakerKind; + const who = document.createElement("span"); + who.className = "turn-speaker"; + who.textContent = speaker + ": "; + turn.append(who, document.createTextNode(text)); + logEl.appendChild(turn); + turn.scrollIntoView({ block: "end" }); +} + +function hangup() { + connected = false; + connBtn.textContent = "● Connect"; + connBtn.dataset.state = "idle"; + setStatus("Disconnected", "idle"); + bargeIn(); + if (ws && ws.readyState === WebSocket.OPEN) ws.close(); + if (micPipeline) micPipeline.close(); + if (player) player.close(); + ws = null; + micPipeline = null; + player = null; +} + +function fail(message) { + hangup(); // always tear down: a failure before `connected` is set (e.g. mic permission denied) would otherwise leave the socket and player open + setStatus("Error: " + message, "error"); // set last: hangup() resets the pill to "Disconnected" and would hide the error +} diff --git a/aai_cli/init/templates/voice-agent/static/audio.js b/aai_cli/init/templates/agent_framework/static/audio.js similarity index 100% rename from aai_cli/init/templates/voice-agent/static/audio.js rename to aai_cli/init/templates/agent_framework/static/audio.js diff --git a/aai_cli/init/templates/agent_framework/static/index.html b/aai_cli/init/templates/agent_framework/static/index.html new file mode 100644 index 00000000..f6809cfc --- /dev/null +++ b/aai_cli/init/templates/agent_framework/static/index.html @@ -0,0 +1,37 @@ + + + + + + Talk to a cascaded voice agent · AssemblyAI + + + +
+ + + + + + +
+ + +
+ +
+ + +
+ + + + + diff --git a/aai_cli/init/templates/voice-agent/static/styles.css b/aai_cli/init/templates/agent_framework/static/styles.css similarity index 100% rename from aai_cli/init/templates/voice-agent/static/styles.css rename to aai_cli/init/templates/agent_framework/static/styles.css diff --git a/aai_cli/init/templates/audio-transcription/vercel.json b/aai_cli/init/templates/agent_framework/vercel.json similarity index 100% rename from aai_cli/init/templates/audio-transcription/vercel.json rename to aai_cli/init/templates/agent_framework/vercel.json diff --git a/aai_cli/init/templates/audio-transcription/AGENTS.md b/aai_cli/init/templates/audio_transcription/AGENTS.md similarity index 100% rename from aai_cli/init/templates/audio-transcription/AGENTS.md rename to aai_cli/init/templates/audio_transcription/AGENTS.md diff --git a/aai_cli/init/templates/live-captions/Dockerfile b/aai_cli/init/templates/audio_transcription/Dockerfile similarity index 100% rename from aai_cli/init/templates/live-captions/Dockerfile rename to aai_cli/init/templates/audio_transcription/Dockerfile diff --git a/aai_cli/init/templates/live-captions/Procfile b/aai_cli/init/templates/audio_transcription/Procfile similarity index 100% rename from aai_cli/init/templates/live-captions/Procfile rename to aai_cli/init/templates/audio_transcription/Procfile diff --git a/aai_cli/init/templates/audio-transcription/README.md b/aai_cli/init/templates/audio_transcription/README.md similarity index 100% rename from aai_cli/init/templates/audio-transcription/README.md rename to aai_cli/init/templates/audio_transcription/README.md diff --git a/aai_cli/init/templates/voice-agent/api/__init__.py b/aai_cli/init/templates/audio_transcription/__init__.py similarity index 100% rename from aai_cli/init/templates/voice-agent/api/__init__.py rename to aai_cli/init/templates/audio_transcription/__init__.py diff --git a/aai_cli/init/templates/audio_transcription/api/__init__.py 
b/aai_cli/init/templates/audio_transcription/api/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/aai_cli/init/templates/audio-transcription/api/index.py b/aai_cli/init/templates/audio_transcription/api/index.py similarity index 81% rename from aai_cli/init/templates/audio-transcription/api/index.py rename to aai_cli/init/templates/audio_transcription/api/index.py index 6ab7dbb5..5a29fd14 100644 --- a/aai_cli/init/templates/audio-transcription/api/index.py +++ b/aai_cli/init/templates/audio_transcription/api/index.py @@ -16,6 +16,7 @@ import tempfile import uuid from pathlib import Path +from typing import Protocol import assemblyai as aai from assemblyai.api import get_transcript # single non-blocking GET (see status()) @@ -25,7 +26,7 @@ from fastapi.staticfiles import StaticFiles from openai import OpenAI # the LLM Gateway is OpenAI-compatible -from api import settings +from . import settings aai.settings.api_key = settings.API_KEY # Target the same AssemblyAI environment the key was minted for. `assembly init` writes @@ -33,7 +34,17 @@ if settings.ASSEMBLYAI_BASE_URL: aai.settings.base_url = settings.ASSEMBLYAI_BASE_URL -CONFIG = aai.TranscriptionConfig(**settings.TRANSCRIPTION_CONFIG_KWARGS) +# Build the config from settings.TRANSCRIPTION_CONFIG_KWARGS by reading each flag by name. +# (A bare `**dict[str, bool]` unpack can't type-check against the SDK's heterogeneous +# __init__, so we pass each boolean feature explicitly.) 
+_FEATURES = settings.TRANSCRIPTION_CONFIG_KWARGS +CONFIG = aai.TranscriptionConfig( + speaker_labels=_FEATURES.get("speaker_labels", False), + auto_chapters=_FEATURES.get("auto_chapters", False), + sentiment_analysis=_FEATURES.get("sentiment_analysis", False), + entity_detection=_FEATURES.get("entity_detection", False), + auto_highlights=_FEATURES.get("auto_highlights", False), +) ROOT = Path(__file__).resolve().parent.parent STATIC = ROOT / "static" @@ -104,6 +115,17 @@ def ask(transcript_id: str = Body(...), question: str = Body(...)) -> dict[str, return {"answer": resp.choices[0].message.content or ""} +class _Serializable(Protocol): + """The pydantic-model surface we use: a `.dict()` returning the full JSON.""" + + def dict(self) -> dict[str, object]: ... + + +def _to_payload(model: _Serializable) -> dict[str, object]: + """Serialize the transcript model to a JSON-ready dict (typed via the protocol).""" + return model.dict() + + @app.get("/api/status/{transcript_id}") def status(transcript_id: str) -> dict[str, object]: _require_key() @@ -116,5 +138,5 @@ def status(transcript_id: str) -> dict[str, object]: if t.status == aai.TranscriptStatus.error: raise HTTPException(status_code=502, detail=t.error or "Transcription failed") if t.status == aai.TranscriptStatus.completed: - return {"status": "completed", "transcript": t.dict()} + return {"status": "completed", "transcript": _to_payload(t)} return {"status": str(getattr(t.status, "value", t.status))} diff --git a/aai_cli/init/templates/audio-transcription/api/settings.py b/aai_cli/init/templates/audio_transcription/api/settings.py similarity index 74% rename from aai_cli/init/templates/audio-transcription/api/settings.py rename to aai_cli/init/templates/audio_transcription/api/settings.py index 6ee69e06..3ec9ecc9 100644 --- a/aai_cli/init/templates/audio-transcription/api/settings.py +++ b/aai_cli/init/templates/audio_transcription/api/settings.py @@ -16,8 +16,9 @@ # Public sample so the app works immediately 
without uploading a local file. SAMPLE_URL = "https://assembly.ai/wildfires.mp3" -# Main backend customization point. Add, remove, or tune AssemblyAI features here. -TRANSCRIPTION_CONFIG_KWARGS = { +# Main backend customization point. Toggle AssemblyAI audio-intelligence features here; +# api/index.py reads each flag by name when it builds the TranscriptionConfig. +TRANSCRIPTION_CONFIG_KWARGS: dict[str, bool] = { "speaker_labels": True, "auto_chapters": True, "sentiment_analysis": True, diff --git a/aai_cli/init/templates/live-captions/dockerignore b/aai_cli/init/templates/audio_transcription/dockerignore similarity index 100% rename from aai_cli/init/templates/live-captions/dockerignore rename to aai_cli/init/templates/audio_transcription/dockerignore diff --git a/aai_cli/init/templates/audio-transcription/env.example b/aai_cli/init/templates/audio_transcription/env.example similarity index 100% rename from aai_cli/init/templates/audio-transcription/env.example rename to aai_cli/init/templates/audio_transcription/env.example diff --git a/aai_cli/init/templates/live-captions/gitignore b/aai_cli/init/templates/audio_transcription/gitignore similarity index 100% rename from aai_cli/init/templates/live-captions/gitignore rename to aai_cli/init/templates/audio_transcription/gitignore diff --git a/aai_cli/init/templates/audio-transcription/requirements.txt b/aai_cli/init/templates/audio_transcription/requirements.txt similarity index 100% rename from aai_cli/init/templates/audio-transcription/requirements.txt rename to aai_cli/init/templates/audio_transcription/requirements.txt diff --git a/aai_cli/init/templates/live-captions/runtime.txt b/aai_cli/init/templates/audio_transcription/runtime.txt similarity index 100% rename from aai_cli/init/templates/live-captions/runtime.txt rename to aai_cli/init/templates/audio_transcription/runtime.txt diff --git a/aai_cli/init/templates/audio-transcription/static/app.js b/aai_cli/init/templates/audio_transcription/static/app.js 
similarity index 100% rename from aai_cli/init/templates/audio-transcription/static/app.js rename to aai_cli/init/templates/audio_transcription/static/app.js diff --git a/aai_cli/init/templates/audio-transcription/static/index.html b/aai_cli/init/templates/audio_transcription/static/index.html similarity index 100% rename from aai_cli/init/templates/audio-transcription/static/index.html rename to aai_cli/init/templates/audio_transcription/static/index.html diff --git a/aai_cli/init/templates/audio-transcription/static/styles.css b/aai_cli/init/templates/audio_transcription/static/styles.css similarity index 100% rename from aai_cli/init/templates/audio-transcription/static/styles.css rename to aai_cli/init/templates/audio_transcription/static/styles.css diff --git a/aai_cli/init/templates/live-captions/vercel.json b/aai_cli/init/templates/audio_transcription/vercel.json similarity index 100% rename from aai_cli/init/templates/live-captions/vercel.json rename to aai_cli/init/templates/audio_transcription/vercel.json diff --git a/aai_cli/init/templates/live-captions/AGENTS.md b/aai_cli/init/templates/live_captions/AGENTS.md similarity index 100% rename from aai_cli/init/templates/live-captions/AGENTS.md rename to aai_cli/init/templates/live_captions/AGENTS.md diff --git a/aai_cli/init/templates/voice-agent/Dockerfile b/aai_cli/init/templates/live_captions/Dockerfile similarity index 100% rename from aai_cli/init/templates/voice-agent/Dockerfile rename to aai_cli/init/templates/live_captions/Dockerfile diff --git a/aai_cli/init/templates/voice-agent/Procfile b/aai_cli/init/templates/live_captions/Procfile similarity index 100% rename from aai_cli/init/templates/voice-agent/Procfile rename to aai_cli/init/templates/live_captions/Procfile diff --git a/aai_cli/init/templates/live-captions/README.md b/aai_cli/init/templates/live_captions/README.md similarity index 100% rename from aai_cli/init/templates/live-captions/README.md rename to 
aai_cli/init/templates/live_captions/README.md diff --git a/aai_cli/init/templates/live_captions/__init__.py b/aai_cli/init/templates/live_captions/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/aai_cli/init/templates/live_captions/api/__init__.py b/aai_cli/init/templates/live_captions/api/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/aai_cli/init/templates/live-captions/api/index.py b/aai_cli/init/templates/live_captions/api/index.py similarity index 69% rename from aai_cli/init/templates/live-captions/api/index.py rename to aai_cli/init/templates/live_captions/api/index.py index 0b91b3ce..81848273 100644 --- a/aai_cli/init/templates/live-captions/api/index.py +++ b/aai_cli/init/templates/live_captions/api/index.py @@ -11,14 +11,12 @@ from pathlib import Path -# httpx2 is Pydantic's maintained fork of httpx (API-identical, just renamed) — not a -# typo. Keep the "2"; see requirements.txt. -import httpx2 +from assemblyai.streaming.v3 import StreamingClient, StreamingClientOptions from fastapi import FastAPI, HTTPException from fastapi.responses import FileResponse from fastapi.staticfiles import StaticFiles -from api import settings +from . import settings ROOT = Path(__file__).resolve().parent.parent STATIC = ROOT / "static" @@ -42,21 +40,20 @@ def index() -> FileResponse: @app.post("/api/token") def token() -> dict[str, str]: - """Mint a one-time streaming token. The browser uses it to open the WebSocket.""" + """Mint a one-time streaming token via the AssemblyAI SDK. The browser uses it to open the WebSocket.""" _require_key() - # NOTE: the streaming token uses the raw API key as Authorization (no 'Bearer'). 
try: - resp = httpx2.get( - f"https://{settings.STREAMING_HOST}{settings.TOKEN_PATH}", - params={"expires_in_seconds": settings.TOKEN_EXPIRES_IN_SECONDS}, - headers={"Authorization": settings.API_KEY}, + client = StreamingClient( + StreamingClientOptions(api_key=settings.API_KEY, api_host=settings.STREAMING_HOST) + ) + streaming_token = client.create_temporary_token( + expires_in_seconds=settings.TOKEN_EXPIRES_IN_SECONDS ) - resp.raise_for_status() except Exception as exc: # missing/invalid key, network -> clean 502, not a 500 raise HTTPException( status_code=502, detail=f"Could not mint streaming token: {exc}" ) from exc return { - "token": resp.json()["token"], + "token": streaming_token, "ws_url": f"wss://{settings.STREAMING_HOST}{settings.WEBSOCKET_PATH}", } diff --git a/aai_cli/init/templates/live-captions/api/settings.py b/aai_cli/init/templates/live_captions/api/settings.py similarity index 93% rename from aai_cli/init/templates/live-captions/api/settings.py rename to aai_cli/init/templates/live_captions/api/settings.py index a0b61862..33ffd0fb 100644 --- a/aai_cli/init/templates/live-captions/api/settings.py +++ b/aai_cli/init/templates/live_captions/api/settings.py @@ -10,5 +10,4 @@ # `assembly init` writes this for you; defaults to production. Host only, no scheme. 
STREAMING_HOST = os.environ.get("ASSEMBLYAI_STREAMING_HOST", "streaming.assemblyai.com") TOKEN_EXPIRES_IN_SECONDS = 60 -TOKEN_PATH = "/v3/token" WEBSOCKET_PATH = "/v3/ws" diff --git a/aai_cli/init/templates/voice-agent/dockerignore b/aai_cli/init/templates/live_captions/dockerignore similarity index 100% rename from aai_cli/init/templates/voice-agent/dockerignore rename to aai_cli/init/templates/live_captions/dockerignore diff --git a/aai_cli/init/templates/live-captions/env.example b/aai_cli/init/templates/live_captions/env.example similarity index 100% rename from aai_cli/init/templates/live-captions/env.example rename to aai_cli/init/templates/live_captions/env.example diff --git a/aai_cli/init/templates/voice-agent/gitignore b/aai_cli/init/templates/live_captions/gitignore similarity index 100% rename from aai_cli/init/templates/voice-agent/gitignore rename to aai_cli/init/templates/live_captions/gitignore diff --git a/aai_cli/init/templates/voice-agent/requirements.txt b/aai_cli/init/templates/live_captions/requirements.txt similarity index 90% rename from aai_cli/init/templates/voice-agent/requirements.txt rename to aai_cli/init/templates/live_captions/requirements.txt index 7d5bb7a2..fe0f2b61 100644 --- a/aai_cli/init/templates/voice-agent/requirements.txt +++ b/aai_cli/init/templates/live_captions/requirements.txt @@ -1,6 +1,6 @@ fastapi>=0.136.3 uvicorn>=0.30.0 -httpx2>=2.3.0 +assemblyai>=0.64.4,<1 python-dotenv>=1.2.2 # Pin starlette directly: FastAPI's own floor (starlette>=0.46.0) still admits # versions with known CVEs, so raise the transitive floor above them. 
diff --git a/aai_cli/init/templates/voice-agent/runtime.txt b/aai_cli/init/templates/live_captions/runtime.txt similarity index 100% rename from aai_cli/init/templates/voice-agent/runtime.txt rename to aai_cli/init/templates/live_captions/runtime.txt diff --git a/aai_cli/init/templates/live-captions/static/app.js b/aai_cli/init/templates/live_captions/static/app.js similarity index 100% rename from aai_cli/init/templates/live-captions/static/app.js rename to aai_cli/init/templates/live_captions/static/app.js diff --git a/aai_cli/init/templates/live-captions/static/audio.js b/aai_cli/init/templates/live_captions/static/audio.js similarity index 100% rename from aai_cli/init/templates/live-captions/static/audio.js rename to aai_cli/init/templates/live_captions/static/audio.js diff --git a/aai_cli/init/templates/live-captions/static/index.html b/aai_cli/init/templates/live_captions/static/index.html similarity index 100% rename from aai_cli/init/templates/live-captions/static/index.html rename to aai_cli/init/templates/live_captions/static/index.html diff --git a/aai_cli/init/templates/live-captions/static/styles.css b/aai_cli/init/templates/live_captions/static/styles.css similarity index 100% rename from aai_cli/init/templates/live-captions/static/styles.css rename to aai_cli/init/templates/live_captions/static/styles.css diff --git a/aai_cli/init/templates/voice-agent/vercel.json b/aai_cli/init/templates/live_captions/vercel.json similarity index 100% rename from aai_cli/init/templates/voice-agent/vercel.json rename to aai_cli/init/templates/live_captions/vercel.json diff --git a/aai_cli/init/templates/voice-agent/AGENTS.md b/aai_cli/init/templates/voice_agent/AGENTS.md similarity index 100% rename from aai_cli/init/templates/voice-agent/AGENTS.md rename to aai_cli/init/templates/voice_agent/AGENTS.md diff --git a/aai_cli/init/templates/voice_agent/Dockerfile b/aai_cli/init/templates/voice_agent/Dockerfile new file mode 100644 index 00000000..73deb13c --- /dev/null 
+++ b/aai_cli/init/templates/voice_agent/Dockerfile @@ -0,0 +1,24 @@ +# Container image for Fly.io, Railway, Render (Docker), and Cloudflare Containers. +# Vercel ignores this and builds api/index.py as a serverless function instead. +FROM python:3.13-slim + +ENV PYTHONUNBUFFERED=1 \ + PYTHONDONTWRITEBYTECODE=1 +WORKDIR /app + +COPY requirements.txt ./ +RUN pip install --no-cache-dir -r requirements.txt + +COPY . . + +# Run as a non-root user (container hardening; the app only reads its files and binds +# a non-privileged port, so it needs no elevated privileges). python:3.13-slim is +# Debian-based, so `useradd` is available. +RUN useradd --create-home appuser +USER appuser + +# Fly reads EXPOSE to set its fly.toml internal_port; keep it in sync with the CMD +# default so the proxy and the app agree on the port. +EXPOSE 8080 +# Shell form so ${PORT} expands. Railway/Render inject $PORT; Fly maps to 8080. +CMD python -m uvicorn api.index:app --host 0.0.0.0 --port ${PORT:-8080} diff --git a/aai_cli/init/templates/voice_agent/Procfile b/aai_cli/init/templates/voice_agent/Procfile new file mode 100644 index 00000000..8837c118 --- /dev/null +++ b/aai_cli/init/templates/voice_agent/Procfile @@ -0,0 +1 @@ +web: python -m uvicorn api.index:app --host 0.0.0.0 --port ${PORT:-3000} diff --git a/aai_cli/init/templates/voice-agent/README.md b/aai_cli/init/templates/voice_agent/README.md similarity index 100% rename from aai_cli/init/templates/voice-agent/README.md rename to aai_cli/init/templates/voice_agent/README.md diff --git a/aai_cli/init/templates/voice_agent/__init__.py b/aai_cli/init/templates/voice_agent/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/aai_cli/init/templates/voice_agent/api/__init__.py b/aai_cli/init/templates/voice_agent/api/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/aai_cli/init/templates/voice-agent/api/index.py b/aai_cli/init/templates/voice_agent/api/index.py similarity index 98% rename from 
aai_cli/init/templates/voice-agent/api/index.py rename to aai_cli/init/templates/voice_agent/api/index.py index d6cda54b..d86f38a6 100644 --- a/aai_cli/init/templates/voice-agent/api/index.py +++ b/aai_cli/init/templates/voice_agent/api/index.py @@ -18,7 +18,7 @@ from fastapi.responses import FileResponse from fastapi.staticfiles import StaticFiles -from api import settings +from . import settings ROOT = Path(__file__).resolve().parent.parent STATIC = ROOT / "static" diff --git a/aai_cli/init/templates/voice-agent/api/settings.py b/aai_cli/init/templates/voice_agent/api/settings.py similarity index 100% rename from aai_cli/init/templates/voice-agent/api/settings.py rename to aai_cli/init/templates/voice_agent/api/settings.py diff --git a/aai_cli/init/templates/voice_agent/dockerignore b/aai_cli/init/templates/voice_agent/dockerignore new file mode 100644 index 00000000..c6c282ad --- /dev/null +++ b/aai_cli/init/templates/voice_agent/dockerignore @@ -0,0 +1,6 @@ +.env +.venv +__pycache__/ +*.pyc +.git/ +.gitignore diff --git a/aai_cli/init/templates/voice-agent/env.example b/aai_cli/init/templates/voice_agent/env.example similarity index 100% rename from aai_cli/init/templates/voice-agent/env.example rename to aai_cli/init/templates/voice_agent/env.example diff --git a/aai_cli/init/templates/voice_agent/gitignore b/aai_cli/init/templates/voice_agent/gitignore new file mode 100644 index 00000000..5b01785a --- /dev/null +++ b/aai_cli/init/templates/voice_agent/gitignore @@ -0,0 +1,3 @@ +.env +.venv +__pycache__/ diff --git a/aai_cli/init/templates/live-captions/requirements.txt b/aai_cli/init/templates/voice_agent/requirements.txt similarity index 100% rename from aai_cli/init/templates/live-captions/requirements.txt rename to aai_cli/init/templates/voice_agent/requirements.txt diff --git a/aai_cli/init/templates/voice_agent/runtime.txt b/aai_cli/init/templates/voice_agent/runtime.txt new file mode 100644 index 00000000..d2aca3a7 --- /dev/null +++ 
b/aai_cli/init/templates/voice_agent/runtime.txt @@ -0,0 +1 @@ +python-3.12 diff --git a/aai_cli/init/templates/voice-agent/static/app.js b/aai_cli/init/templates/voice_agent/static/app.js similarity index 100% rename from aai_cli/init/templates/voice-agent/static/app.js rename to aai_cli/init/templates/voice_agent/static/app.js diff --git a/aai_cli/init/templates/voice_agent/static/audio.js b/aai_cli/init/templates/voice_agent/static/audio.js new file mode 100644 index 00000000..bda694c9 --- /dev/null +++ b/aai_cli/init/templates/voice_agent/static/audio.js @@ -0,0 +1,101 @@ +function createMicrophonePipeline(stream, options = {}) { + const bufferSize = options.bufferSize || 4096; + const AudioContextClass = window.AudioContext || window.webkitAudioContext; + const audioCtx = new AudioContextClass(); + const source = audioCtx.createMediaStreamSource(stream); + const processor = audioCtx.createScriptProcessor(bufferSize, 1, 1); + + return { + audioCtx, + async start(onFrame) { + await audioCtx.resume(); + source.connect(processor); + processor.connect(audioCtx.destination); + processor.onaudioprocess = (event) => { + onFrame(event.inputBuffer.getChannelData(0), audioCtx.sampleRate); + }; + }, + close() { + processor.disconnect(); + stream.getTracks().forEach((track) => track.stop()); + audioCtx.close(); + }, + }; +} + +function createPcmPlayer(options = {}) { + const sampleRate = options.sampleRate || 24000; + const AudioContextClass = window.AudioContext || window.webkitAudioContext; + const playCtx = new AudioContextClass({ sampleRate }); + let playHead = 0; + let sources = []; + + return { + async resume() { + await playCtx.resume(); + }, + playBase64Chunk(base64Audio) { + const int16 = b64ToInt16(base64Audio); + const buffer = playCtx.createBuffer(1, int16.length, sampleRate); + const channel = buffer.getChannelData(0); + for (let i = 0; i < int16.length; i++) channel[i] = int16[i] / 0x8000; + + const source = playCtx.createBufferSource(); + source.buffer = 
buffer; + source.connect(playCtx.destination); + const startAt = Math.max(playCtx.currentTime, playHead); + source.start(startAt); + playHead = startAt + buffer.duration; + sources.push(source); + source.onended = () => { + sources = sources.filter((item) => item !== source); + }; + }, + stopQueuedAudio() { + sources.forEach((source) => { + try { + source.stop(); + } catch (_) {} + }); + sources = []; + playHead = 0; + }, + close() { + this.stopQueuedAudio(); + playCtx.close(); + }, + }; +} + +function downsampleToPCM(input, inputRate, outputRate) { + const ratio = inputRate / outputRate; + const outputLength = Math.floor(input.length / ratio); + const output = new Int16Array(outputLength); + for (let i = 0; i < outputLength; i++) { + const sample = Math.max(-1, Math.min(1, input[Math.floor(i * ratio)])); + output[i] = sample < 0 ? sample * 0x8000 : sample * 0x7fff; + } + return output.buffer; +} + +function bytesToB64(buffer) { + let binary = ""; + const bytes = new Uint8Array(buffer); + for (let i = 0; i < bytes.length; i++) + binary += String.fromCharCode(bytes[i]); + return btoa(binary); +} + +function b64ToInt16(base64Audio) { + const binary = atob(base64Audio); + const bytes = new Uint8Array(binary.length); + for (let i = 0; i < binary.length; i++) bytes[i] = binary.charCodeAt(i); + return new Int16Array(bytes.buffer); +} + +window.AudioHelpers = { + createMicrophonePipeline, + createPcmPlayer, + downsampleToPCM, + bytesToB64, +}; diff --git a/aai_cli/init/templates/voice-agent/static/index.html b/aai_cli/init/templates/voice_agent/static/index.html similarity index 100% rename from aai_cli/init/templates/voice-agent/static/index.html rename to aai_cli/init/templates/voice_agent/static/index.html diff --git a/aai_cli/init/templates/voice_agent/static/styles.css b/aai_cli/init/templates/voice_agent/static/styles.css new file mode 100644 index 00000000..2ea6cc6f --- /dev/null +++ b/aai_cli/init/templates/voice_agent/static/styles.css @@ -0,0 +1,329 @@ +/* THEME 
TOKENS — AssemblyAI design system. Edit this block first to retheme. + Brand fonts hotlink from assemblyai.com (CORS-open) with system fallbacks. */ +@font-face { + font-family: "Oceanic Text"; + src: url("https://www.assemblyai.com/_aai/fonts/rebrand/OceanicText-Regular.otf") + format("opentype"); + font-weight: 400; + font-display: swap; +} +@font-face { + font-family: "UN 11ST"; + src: url("https://www.assemblyai.com/_aai/fonts/rebrand/UN_11ST_Regular.woff2") + format("woff2"); + font-weight: 400; + font-display: swap; +} +@font-face { + font-family: "UN 11ST"; + src: url("https://www.assemblyai.com/_aai/fonts/rebrand/UN_11ST_Bold.woff2") + format("woff2"); + font-weight: 700; + font-display: swap; +} +@font-face { + font-family: "Modern Gothic Mono"; + src: url("https://www.assemblyai.com/_aai/fonts/rebrand/ModernGothicMono-Regular.woff2") + format("woff2"); + font-weight: 400; + font-display: swap; +} + +:root { + /* Brand fonts */ + --font-display: "Oceanic Text", Georgia, serif; + --font-body: "UN 11ST", system-ui, sans-serif; + --font-mono: "Modern Gothic Mono", "JetBrains Mono", monospace; + + /* Cobolt (brand purple — the only accent) */ + --color-cobolt-500: #3923c7; + --color-cobolt-300: #887bdd; + --color-cobolt-200: #b0a7e9; + --color-cobolt-100: #d7d3f4; + + /* Warm neutrals */ + --color-black-500: #1d1b16; + --color-black-400: #4a4945; + --color-black-300: #777673; + --color-white-100: #ffffff; + --color-white-200: #fdfcf8; + --color-white-300: #f5f3eb; + --color-neutral-100: #ecebe5; + --color-neutral-200: #dad7cb; + --color-neutral-300: #c7c3b2; + + /* Blush (error highlight only) */ + --color-blush-200: #f4d4d0; + --color-blush-500: #e39389; + + /* Semantic aliases — downstream rules use these */ + --color-bg: var(--color-white-200); + --color-surface: var(--color-white-100); + --color-text: var(--color-black-400); + --color-text-dark: var(--color-black-500); + --color-text-muted: var(--color-black-300); + --color-border: 
var(--color-neutral-300); + --color-accent: var(--color-cobolt-500); + --color-accent-hover: var(--color-cobolt-300); + --color-accent-contrast: var(--color-white-100); + --color-user: var(--color-black-400); + --color-connected: var(--color-black-500); + + --shadow-focus: 0 0 0 3px var(--color-cobolt-100); + + --radius-cta: 4px; + --radius-control: 8px; + --radius-panel: 12px; + --radius-pill: 9999px; + --space-page-block: 64px; + --space-page-inline: 24px; + --content-width: 720px; +} + +/* BASE */ +* { + box-sizing: border-box; +} + +body { + min-height: 100vh; + margin: 0; + padding: var(--space-page-block) var(--space-page-inline); + background: var(--color-bg); + color: var(--color-text); + font: 16px/1.3 var(--font-body); + -moz-osx-font-smoothing: grayscale; + -webkit-font-smoothing: antialiased; + text-rendering: optimizeLegibility; +} + +h1, +h2, +h3, +h4, +h5, +h6 { + font-family: var(--font-display); + font-weight: 400; + line-height: 1; + color: var(--color-text-dark); +} + +button { + appearance: none; + font: inherit; +} + +/* LAYOUT */ +.app-shell { + max-width: var(--content-width); + margin: 0 auto; +} + +/* BRAND CHROME */ +.brand { + display: inline-flex; + align-items: center; + margin-bottom: 40px; +} + +.brand-logo { + display: block; + width: auto; + height: 20px; +} + +.eyebrow { + display: inline-flex; + align-items: center; + margin-bottom: 16px; + padding: 8px 16px; + border: 1px solid var(--color-border); + border-radius: var(--radius-cta); + color: var(--color-black-400); + font-family: var(--font-mono); + font-size: 12px; + letter-spacing: 1.2px; + text-transform: uppercase; + font-feature-settings: "ss09" 1; +} + +.page-header { + margin-bottom: 32px; +} + +.page-title { + margin: 0 0 12px; + font-size: 48px; + letter-spacing: -2.4px; +} + +.page-subtitle { + max-width: 60ch; + margin: 0; + font-size: 18px; + color: var(--color-text-muted); +} + +.page-footer { + display: flex; + flex-wrap: wrap; + align-items: center; + 
justify-content: space-between; + gap: 12px; + margin-top: 64px; + padding-top: 24px; + border-top: 1px solid var(--color-border); + color: var(--color-text-muted); + font-size: 14px; +} + +.footer-link { + color: var(--color-accent); + font-family: var(--font-mono); + font-size: 12px; + letter-spacing: 1.2px; + text-transform: uppercase; + text-decoration: none; + transition: color 0.2s ease; +} + +.footer-link:hover { + color: var(--color-accent-hover); +} + +.control-bar { + display: flex; + align-items: center; + flex-wrap: wrap; + gap: 12px; +} + +/* CONTROLS */ +.button { + --button-bg: var(--color-accent); + --button-fg: var(--color-accent-contrast); + display: inline-flex; + min-height: 40px; + align-items: center; + justify-content: center; + border: 0; + border-radius: var(--radius-cta); + background: var(--button-bg); + color: var(--button-fg); + cursor: pointer; + font-family: var(--font-mono); + font-size: 14px; + letter-spacing: 1.4px; + text-transform: uppercase; + padding: 0 18px; + white-space: nowrap; + transition: background-color 0.2s ease; +} + +.button:hover:not(:disabled) { + --button-bg: var(--color-accent-hover); +} + +.button:focus-visible { + box-shadow: var(--shadow-focus); + outline: none; +} + +.button:disabled { + cursor: default; + opacity: 0.55; +} + +.connection-button[data-state="connected"] { + --button-bg: var(--color-connected); +} + +.connection-button[data-state="connected"]:hover:not(:disabled) { + --button-bg: var(--color-black-400); +} + +/* STATUS STATES */ +.status-pill { + display: inline-flex; + align-items: center; + border: 1px solid var(--color-neutral-300); + border-radius: var(--radius-pill); + background: var(--color-neutral-200); + color: var(--color-black-300); + font-size: 14px; + padding: 5px 14px; +} + +.status-pill:empty { + display: none; +} + +.status-pill[data-state="live"] { + border-color: var(--color-cobolt-200); + background: var(--color-cobolt-100); + color: var(--color-cobolt-500); +} + 
+.status-pill[data-state="idle"] { + border-color: var(--color-neutral-300); + background: var(--color-neutral-200); + color: var(--color-black-300); +} + +.status-pill[data-state="error"] { + border-color: var(--color-blush-500); + background: var(--color-blush-200); + color: var(--color-black-500); +} + +/* CONVERSATION VIEW */ +.conversation-log { + margin-top: 24px; +} + +.conversation-turn { + margin: 8px 0; + border-left: 3px solid var(--color-border); + padding: 8px 0 8px 12px; + overflow-wrap: anywhere; +} + +.conversation-turn[data-speaker="you"] { + border-left-color: var(--color-user); +} + +.conversation-turn[data-speaker="agent"] { + border-left-color: var(--color-accent); +} + +.turn-speaker { + font-family: var(--font-mono); + font-size: 12px; + letter-spacing: 1.2px; + text-transform: uppercase; +} + +.conversation-turn[data-speaker="you"] .turn-speaker { + color: var(--color-user); +} + +.conversation-turn[data-speaker="agent"] .turn-speaker { + color: var(--color-accent); +} + +@media (max-width: 768px) { + :root { + --space-page-block: 40px; + } + + .page-title { + font-size: 30px; + letter-spacing: -1.5px; + } + + .button, + .status-pill { + width: 100%; + } +} diff --git a/aai_cli/init/templates/voice_agent/vercel.json b/aai_cli/init/templates/voice_agent/vercel.json new file mode 100644 index 00000000..10e8a7c1 --- /dev/null +++ b/aai_cli/init/templates/voice_agent/vercel.json @@ -0,0 +1,4 @@ +{ + "$schema": "https://openapi.vercel.sh/vercel.json", + "framework": "fastapi" +} diff --git a/docs/superpowers/plans/2026-06-15-agent-framework-template.md b/docs/superpowers/plans/2026-06-15-agent-framework-template.md new file mode 100644 index 00000000..56237064 --- /dev/null +++ b/docs/superpowers/plans/2026-06-15-agent-framework-template.md @@ -0,0 +1,1570 @@ +# `agent-framework` init template — Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or 
superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add a fourth `assembly init` starter template, `agent-framework`, with the same browser UI as `voice-agent` but built on a server-orchestrated **cascade** — Streaming STT → LLM Gateway → sandbox TTS — instead of the all-in-one Voice Agent endpoint. + +**Architecture:** The browser opens one same-origin WebSocket (`/ws`) to a FastAPI backend. The backend runs the cascade: forwards mic PCM to the Streaming v3 STT socket, detects end-of-turn, streams the transcript through the OpenAI-compatible LLM Gateway, synthesizes the reply over the sandbox streaming-TTS socket, and streams audio back. All three API credentials stay server-side. The orchestrator (`api/cascade.py`) is built with injected connect-factories + LLM callable so it is fully testable with fakes (mirroring `aai_cli/tts/session.py`). + +**Tech Stack:** FastAPI + Starlette WebSockets, `websockets` async client (STT + TTS), `openai.AsyncOpenAI` (streamed gateway completion), `uvicorn`. Buildless static HTML/CSS/JS frontend. + +--- + +## Key constraints discovered (read before starting) + +- **Sandbox-only.** `streaming_tts_host` is empty in production, so the whole cascade must target `sandbox000` with a sandbox key. The backend fails fast (a `session.error` event, *not* an import error) when the TTS host is empty. +- **Settings must not raise at import.** `tests/test_init_template_serve.py::test_serves_root_and_static_assets` is parametrized over every template dir; it imports `api.index` and hits `GET /`. The empty-TTS-host guard therefore lives in the WS handler, never at module import. +- **Template `.py` is coverage- and mutation-gated.** Confirmed: `coverage.xml` includes `init/templates/.../api/*.py`. diff-cover requires 100% patch coverage of new template lines and the mutation gate mutates them, so `cascade.py` needs real, asserting tests. 
+- **The contract `_STDLIB` set is incomplete.** `tests/test_init_template_contract.py::test_requirements_cover_backend_imports` treats any import not in its `_STDLIB` set as third-party and demands it in `requirements.txt`. `asyncio`, `base64`, `contextlib`, `dataclasses`, `collections`, `urllib` are stdlib we use — extend `_STDLIB` (Task 1). +- **Two hard-coded test assertions break when the registry grows:** `tests/test_init_command.py::test_init_template_arg_help_is_derived_from_registry` (exact help string) and the `assembly init --help` snapshot `tests/__snapshots__/test_snapshots_help_build.ambr`. Update the first by hand (Task 1); regenerate the second with `--snapshot-update` (Task 9). +- `openai>=2.41.0` and `websockets>=16.0` are **main project deps**, so the dev env can import `cascade.py` for the serve test. The template's own `requirements.txt` pins its independent floors. + +## File structure + +New template dir `aai_cli/init/templates/agent-framework/`: +- `api/__init__.py` — empty package marker. +- `api/settings.py` — env-derived config; no import-time raise. +- `api/cascade.py` — pure helpers + the injectable async orchestrator + the FastAPI browser adapter. +- `api/index.py` — FastAPI app: static mount, `GET /`, `@app.websocket("/ws")` adapter. +- `static/index.html` — copy of voice-agent's page, cascade-worded. +- `static/styles.css` — verbatim copy of voice-agent's. +- `static/audio.js` — verbatim copy of voice-agent's. +- `static/app.js` — same event handling as voice-agent; connects to `/ws` directly. +- `README.md`, `AGENTS.md`, `env.example`, `gitignore`, `requirements.txt`, `Procfile`, `Dockerfile`, `dockerignore`, `runtime.txt`, `vercel.json`. + +Shared CLI edits: +- `aai_cli/init/templates.py` — register the template. +- `aai_cli/app/init_exec.py` — inject `ASSEMBLYAI_TTS_HOST` into scaffolded `.env`. + +Test edits: +- `tests/test_init_template_contract.py` — extend `_STDLIB`. 
+- `tests/test_init_command.py` — update the exact help-string assertion. +- `tests/test_init_template_agent_framework.py` — NEW bespoke tests. +- `tests/__snapshots__/test_snapshots_help_build.ambr` — regenerated. + +--- + +## Task 1: CLI wiring (register template, inject TTS host, fix gated assertions) + +**Files:** +- Modify: `aai_cli/init/templates.py` +- Modify: `aai_cli/app/init_exec.py:91-104` +- Modify: `tests/test_init_template_contract.py` (the `_STDLIB` constant) +- Modify: `tests/test_init_command.py` (exact help string) +- Test: `tests/test_init_command.py`, `tests/test_init_templates.py` + +- [ ] **Step 1: Update the failing registry expectations first (TDD red)** + +In `tests/test_init_command.py`, update the exact-help assertion to include the new id (appended last): + +```python + assert default.help == ( + "Template to scaffold: audio-transcription, live-captions, voice-agent, " + "agent-framework (omit to pick interactively)" + ) +``` + +- [ ] **Step 2: Run it to confirm it now fails (registry not updated yet)** + +Run: `uv run pytest tests/test_init_command.py::test_init_template_arg_help_is_derived_from_registry tests/test_init_templates.py -q` +Expected: FAIL — `test_order_matches_registry`/`test_every_shipped_directory_is_registered` and the help-string test disagree with the registry. + +- [ ] **Step 3: Register the template** + +In `aai_cli/init/templates.py`, add the entry and order (append after `voice-agent`): + +```python +TEMPLATES: dict[str, str] = { + "audio-transcription": "Audio Transcription", + "live-captions": "Live Captions", + "voice-agent": "Voice Agent", + "agent-framework": "Agent Framework", +} + +# Display order for the picker and `--help`. +TEMPLATE_ORDER: tuple[str, ...] 
= ( + "audio-transcription", + "live-captions", + "voice-agent", + "agent-framework", +) +``` + +- [ ] **Step 4: Inject the TTS host into scaffolded `.env`** + +In `aai_cli/app/init_exec.py`, add the TTS host to `_active_env_vars()` (the cascade template reads it; empty in prod, which the template treats as "sandbox required"): + +```python + return { + "ASSEMBLYAI_BASE_URL": env.api_base, + "ASSEMBLYAI_LLM_GATEWAY_URL": env.llm_gateway_base, + "ASSEMBLYAI_STREAMING_HOST": env.streaming_host, + # Voice Agent host mirrors the streaming host's naming across environments. + "ASSEMBLYAI_AGENTS_HOST": env.streaming_host.replace("streaming", "agents", 1), + # Streaming-TTS host for the cascade (agent-framework) template. Empty in + # production, where streaming TTS has no host; that template then refuses to + # run and points at --sandbox. + "ASSEMBLYAI_TTS_HOST": env.streaming_tts_host, + } +``` + +- [ ] **Step 5: Extend the contract test's stdlib set** + +In `tests/test_init_template_contract.py`, widen `_STDLIB` so the cascade's stdlib imports aren't mistaken for third-party packages: + +```python +_STDLIB = { + "os", + "tempfile", + "uuid", + "pathlib", + "__future__", + "json", + "typing", + "asyncio", + "base64", + "contextlib", + "dataclasses", + "collections", + "urllib", +} +``` + +- [ ] **Step 6: Add an assertion pinning the new env var (mutation coverage for Step 4)** + +In `tests/test_init_command.py`, beside the existing `_active_env_vars` test (~line 312), add: + +```python +def test_active_env_vars_includes_streaming_tts_host(monkeypatch): + fake = SimpleNamespace( + api_base="https://api.x", + llm_gateway_base="https://llm.x/v1", + streaming_host="streaming.x", + streaming_tts_host="streaming-tts.x", + ) + monkeypatch.setattr(init_exec.environments, "active", lambda: fake) + assert init_exec._active_env_vars()["ASSEMBLYAI_TTS_HOST"] == "streaming-tts.x" +``` + +(Use the same `SimpleNamespace`/`monkeypatch` shape as the neighboring test; import 
`SimpleNamespace` from `types` if not already imported.) + +- [ ] **Step 7: Run the registry + command tests (they pass except for the missing dir)** + +Run: `uv run pytest tests/test_init_templates.py tests/test_init_command.py -q` +Expected: `test_every_registered_template_has_a_directory` FAILS (dir not created yet); everything else PASSES. This failure is resolved in Task 6 when `api/index.py` lands. Proceed. + +- [ ] **Step 8: Commit** + +```bash +git add aai_cli/init/templates.py aai_cli/app/init_exec.py tests/test_init_template_contract.py tests/test_init_command.py +git commit -m "feat(init): register agent-framework template + inject TTS host" +``` + +--- + +## Task 2: Template skeleton + verbatim static assets + +**Files:** +- Create: `aai_cli/init/templates/agent-framework/api/__init__.py` +- Create (copy): `aai_cli/init/templates/agent-framework/static/styles.css` +- Create (copy): `aai_cli/init/templates/agent-framework/static/audio.js` + +- [ ] **Step 1: Create the directory and copy the verbatim assets** + +Run: + +```bash +SRC=aai_cli/init/templates/voice-agent +DST=aai_cli/init/templates/agent-framework +mkdir -p "$DST/api" "$DST/static" +: > "$DST/api/__init__.py" +cp "$SRC/static/styles.css" "$DST/static/styles.css" +cp "$SRC/static/audio.js" "$DST/static/audio.js" +``` + +`styles.css` and `audio.js` are reused unchanged — the UI and the mic-pipeline/PCM-player/barge-in helpers are identical to `voice-agent`. 
+ +- [ ] **Step 2: Verify the copies are byte-identical** + +Run: `diff aai_cli/init/templates/voice-agent/static/styles.css aai_cli/init/templates/agent-framework/static/styles.css && diff aai_cli/init/templates/voice-agent/static/audio.js aai_cli/init/templates/agent-framework/static/audio.js && echo OK` +Expected: `OK` + +- [ ] **Step 3: Commit** + +```bash +git add aai_cli/init/templates/agent-framework/api/__init__.py aai_cli/init/templates/agent-framework/static/styles.css aai_cli/init/templates/agent-framework/static/audio.js +git commit -m "feat(agent-framework): skeleton + shared static assets" +``` + +--- + +## Task 3: `settings.py` + availability guard + +**Files:** +- Create: `aai_cli/init/templates/agent-framework/api/settings.py` +- Test: `tests/test_init_template_agent_framework.py` + +- [ ] **Step 1: Write the failing test** + +Create `tests/test_init_template_agent_framework.py`: + +```python +"""Hermetic tests for the agent-framework (cascaded voice agent) template. + +The template ships a standalone FastAPI app under api/; load it by path with its +own `api` package, evicting any other template's cached `api` modules so imports +stay collision-free under pytest-xdist / pytest-randomly. 
+""" + +from __future__ import annotations + +import asyncio +import base64 +import importlib +import json +import sys +from pathlib import Path +from types import ModuleType + +import pytest + +TEMPLATE_DIR = Path("aai_cli/init/templates/agent-framework") + + +def _load(module: str, monkeypatch: pytest.MonkeyPatch, **env: str) -> ModuleType: + for key, value in env.items(): + monkeypatch.setenv(key, value) + for name in ("api.index", "api.cascade", "api.settings", "api"): + sys.modules.pop(name, None) + monkeypatch.syspath_prepend(str(TEMPLATE_DIR)) + return importlib.import_module(module) + + +def test_settings_imports_without_key_or_tts_host(monkeypatch): + # isolate_env strips ambient vars; with nothing set the module must still import + # (the empty-host guard lives in the WS handler, not at import). + monkeypatch.delenv("ASSEMBLYAI_API_KEY", raising=False) + monkeypatch.delenv("ASSEMBLYAI_TTS_HOST", raising=False) + settings = _load("api.settings", monkeypatch) + assert settings.API_KEY == "" + assert settings.MODEL == "claude-haiku-4-5-20251001" + assert settings.VOICE == "ivy" + assert settings.INPUT_SAMPLE_RATE == 16000 + assert settings.OUTPUT_SAMPLE_RATE == 24000 + + +def test_settings_reads_env(monkeypatch): + settings = _load( + "api.settings", + monkeypatch, + ASSEMBLYAI_API_KEY="sk-test", + ASSEMBLYAI_STREAMING_HOST="streaming.example", + ASSEMBLYAI_TTS_HOST="tts.example", + ASSEMBLYAI_LLM_GATEWAY_URL="https://llm.example/v1", + ) + assert settings.API_KEY == "sk-test" + assert settings.STREAMING_HOST == "streaming.example" + assert settings.TTS_HOST == "tts.example" + assert settings.LLM_GATEWAY_URL == "https://llm.example/v1" +``` + +- [ ] **Step 2: Run to verify it fails** + +Run: `uv run pytest tests/test_init_template_agent_framework.py -q` +Expected: FAIL with `ModuleNotFoundError: No module named 'api.settings'`. 
+ +- [ ] **Step 3: Write `settings.py`** + +Create `aai_cli/init/templates/agent-framework/api/settings.py`: + +```python +from __future__ import annotations + +import os + +from dotenv import load_dotenv + +load_dotenv() + +API_KEY = os.environ.get("ASSEMBLYAI_API_KEY", "") + +# Hosts. `assembly init` pins these to the active environment. Streaming TTS only +# exists in the sandbox, so this whole cascade is sandbox-only (see README); the +# defaults point at the sandbox so a bare clone works with a sandbox key. +STREAMING_HOST = os.environ.get( + "ASSEMBLYAI_STREAMING_HOST", "streaming.sandbox000.assemblyai-labs.com" +) +TTS_HOST = os.environ.get("ASSEMBLYAI_TTS_HOST", "streaming-tts.sandbox000.assemblyai-labs.com") +LLM_GATEWAY_URL = os.environ.get( + "ASSEMBLYAI_LLM_GATEWAY_URL", "https://llm-gateway.sandbox000.assemblyai-labs.com/v1" +) + +# The cascade's three knobs — edit these to change behavior. +MODEL = "claude-haiku-4-5-20251001" +VOICE = "ivy" +SYSTEM_PROMPT = ( + "You are a friendly, concise voice assistant. Keep replies short and conversational." +) +GREETING = "Hi! I'm your AssemblyAI voice agent. What can I help you with?" + +# 16 kHz PCM in (Streaming v3); 24 kHz PCM out (streaming TTS). +INPUT_SAMPLE_RATE = 16000 +OUTPUT_SAMPLE_RATE = 24000 +``` + +- [ ] **Step 4: Run to verify it passes** + +Run: `uv run pytest tests/test_init_template_agent_framework.py -q` +Expected: PASS (2 tests). 
+ +- [ ] **Step 5: Commit** + +```bash +git add aai_cli/init/templates/agent-framework/api/settings.py tests/test_init_template_agent_framework.py +git commit -m "feat(agent-framework): settings module" +``` + +--- + +## Task 4: `cascade.py` pure helpers + +**Files:** +- Create: `aai_cli/init/templates/agent-framework/api/cascade.py` +- Test: `tests/test_init_template_agent_framework.py` (append) + +- [ ] **Step 1: Write the failing tests** + +Append to `tests/test_init_template_agent_framework.py`: + +```python +def _cascade(monkeypatch) -> ModuleType: + return _load("api.cascade", monkeypatch, ASSEMBLYAI_API_KEY="sk-test") + + +def test_unavailable_reason_missing_key(monkeypatch): + cascade = _cascade(monkeypatch) + settings = importlib.import_module("api.settings") + settings.API_KEY = "" + settings.TTS_HOST = "tts.example" + assert "ASSEMBLYAI_API_KEY" in cascade.unavailable_reason(settings) + + +def test_unavailable_reason_missing_tts_host(monkeypatch): + cascade = _cascade(monkeypatch) + settings = importlib.import_module("api.settings") + settings.API_KEY = "sk-test" + settings.TTS_HOST = "" + reason = cascade.unavailable_reason(settings) + assert "sandbox" in reason and "assembly --sandbox init agent-framework" in reason + + +def test_unavailable_reason_ok(monkeypatch): + cascade = _cascade(monkeypatch) + settings = importlib.import_module("api.settings") + settings.API_KEY = "sk-test" + settings.TTS_HOST = "tts.example" + assert cascade.unavailable_reason(settings) is None + + +def test_stt_url_carries_streaming_params(monkeypatch): + cascade = _cascade(monkeypatch) + settings = importlib.import_module("api.settings") + settings.STREAMING_HOST = "streaming.example" + settings.INPUT_SAMPLE_RATE = 16000 + url = cascade.stt_url(settings) + assert url.startswith("wss://streaming.example/v3/ws?") + assert "sample_rate=16000" in url + assert "encoding=pcm_s16le" in url + assert "format_turns=true" in url + + +def test_tts_url_carries_voice_and_rate(monkeypatch): 
+ cascade = _cascade(monkeypatch) + settings = importlib.import_module("api.settings") + settings.TTS_HOST = "tts.example" + settings.VOICE = "ivy" + settings.OUTPUT_SAMPLE_RATE = 24000 + url = cascade.tts_url(settings) + assert url.startswith("wss://tts.example/v1/ws/?") + assert "voice=ivy" in url + assert "sample_rate=24000" in url + + +def test_is_final_user_turn(monkeypatch): + cascade = _cascade(monkeypatch) + assert cascade.is_final_user_turn({"end_of_turn": True, "turn_is_formatted": True}) is True + assert cascade.is_final_user_turn({"end_of_turn": True, "turn_is_formatted": False}) is False + assert cascade.is_final_user_turn({"end_of_turn": False, "turn_is_formatted": True}) is False + assert cascade.is_final_user_turn({}) is False + + +def test_build_messages(monkeypatch): + cascade = _cascade(monkeypatch) + messages = cascade.build_messages("be brief", "hello there") + assert messages == [ + {"role": "system", "content": "be brief"}, + {"role": "user", "content": "hello there"}, + ] +``` + +- [ ] **Step 2: Run to verify it fails** + +Run: `uv run pytest tests/test_init_template_agent_framework.py -q -k "unavailable or url or final or build"` +Expected: FAIL with `ModuleNotFoundError: No module named 'api.cascade'`. + +- [ ] **Step 3: Write the helper section of `cascade.py`** + +Create `aai_cli/init/templates/agent-framework/api/cascade.py` with the imports and pure helpers (the orchestrator is added in Task 5, the adapter in Task 6 — write them as one growing file): + +```python +"""Server-side cascade orchestrator for the agent-framework template. + +The browser opens one WebSocket to FastAPI and the backend wires three AssemblyAI +primitives together — Streaming STT, the LLM Gateway, and streaming TTS — so every +credential stays on the server. The orchestrator takes injected connect-factories and +an LLM callable (`Deps`) so it runs hermetically against fakes in tests, the same +seam `aai_cli/tts/session.py` uses. 
+ +Browser protocol (identical to the voice-agent template): + in : {"type": "input.audio", "audio": <base64 PCM16>} + out: transcript.user / transcript.agent / reply.audio (base64 in `data`) / + input.speech.started / reply.done / session.error +""" + +from __future__ import annotations + +import asyncio +import base64 +import contextlib +import json +from collections.abc import AsyncIterator, Awaitable, Callable +from dataclasses import dataclass +from typing import Any +from urllib.parse import urlencode + + +def unavailable_reason(settings: Any) -> str | None: + """Why the cascade can't run, or None when it can. + + Streaming TTS has no production host, so an empty TTS host means the user must + re-scaffold against the sandbox. + """ + if not settings.API_KEY: + return "ASSEMBLYAI_API_KEY is not set — configure it in your deployment's environment." + if not settings.TTS_HOST: + return ( + "Streaming TTS has no production host, so this cascade is sandbox-only. " + "Re-scaffold against the sandbox: assembly --sandbox init agent-framework." 
+ ) + return None + + +def stt_url(settings: Any) -> str: + """The Streaming v3 WebSocket URL with PCM + turn-formatting params.""" + params = urlencode( + { + "sample_rate": settings.INPUT_SAMPLE_RATE, + "encoding": "pcm_s16le", + "speech_model": "u3-rt-pro", + "format_turns": "true", + } + ) + return f"wss://{settings.STREAMING_HOST}/v3/ws?{params}" + + +def tts_url(settings: Any) -> str: + """The streaming-TTS WebSocket URL for the configured voice and sample rate.""" + params = urlencode({"voice": settings.VOICE, "sample_rate": settings.OUTPUT_SAMPLE_RATE}) + return f"wss://{settings.TTS_HOST}/v1/ws/?{params}" + + +def is_final_user_turn(msg: dict[str, Any]) -> bool: + """True for a finalized, formatted end-of-turn (the cue to reply).""" + return bool(msg.get("end_of_turn")) and bool(msg.get("turn_is_formatted")) + + +def build_messages(system_prompt: str, user_text: str) -> list[dict[str, str]]: + """The chat `messages` array for one user turn.""" + return [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_text}, + ] +``` + +- [ ] **Step 4: Run to verify it passes** + +Run: `uv run pytest tests/test_init_template_agent_framework.py -q -k "unavailable or url or final or build"` +Expected: PASS (7 tests). 
+ +- [ ] **Step 5: Commit** + +```bash +git add aai_cli/init/templates/agent-framework/api/cascade.py tests/test_init_template_agent_framework.py +git commit -m "feat(agent-framework): cascade pure helpers" +``` + +--- + +## Task 5: `cascade.py` orchestrator (the cascade itself) + +**Files:** +- Modify: `aai_cli/init/templates/agent-framework/api/cascade.py` (append) +- Test: `tests/test_init_template_agent_framework.py` (append) + +- [ ] **Step 1: Write the failing tests (fakes + each stage + happy path)** + +Append to `tests/test_init_template_agent_framework.py`: + +```python +class FakeBrowser: + """A browser side: hands out queued inbound messages, then blocks forever so the + mic pump stays alive until the test cancels it (mirrors a still-connected client).""" + + def __init__(self, inbound: list[dict] | None = None): + self._inbound = list(inbound or []) + self.sent: list[dict] = [] + self._idle = asyncio.Event() # never set -> recv() blocks after the queue drains + + async def send(self, event: dict) -> None: + self.sent.append(event) + + async def recv(self) -> dict | None: + if self._inbound: + return self._inbound.pop(0) + await self._idle.wait() + return None + + def types(self) -> list[str]: + return [event["type"] for event in self.sent] + + +class FakeWS: + """A fake STT/TTS socket: yields the given frames as JSON strings, records sends.""" + + def __init__(self, frames: list[dict] | None = None): + self._frames = [json.dumps(f) for f in (frames or [])] + self.sent: list[Any] = [] + self.closed = False + + def __aiter__(self) -> "FakeWS": + return self + + async def __anext__(self) -> str: + if not self._frames: + raise StopAsyncIteration + return self._frames.pop(0) + + async def recv(self) -> str: + if not self._frames: + raise AssertionError("recv() past end of fake frames") + return self._frames.pop(0) + + async def send(self, data: Any) -> None: + self.sent.append(data) + + async def close(self) -> None: + self.closed = True + + +def 
_deps(monkeypatch, *, stt, tts_frames, llm_text): + cascade = _cascade(monkeypatch) + settings = importlib.import_module("api.settings") + settings.API_KEY = "sk-test" + settings.TTS_HOST = "tts.example" + settings.GREETING = "hello!" + settings.SYSTEM_PROMPT = "be brief" + + async def llm_stream(_messages): + for piece in llm_text: + yield piece + + deps = cascade.Deps( + connect_stt=_async_return(stt), + connect_tts=_async_return(FakeWS(tts_frames)), + llm_stream=llm_stream, + settings=settings, + ) + return cascade, deps + + +def _async_return(value): + async def factory(): + return value + + return factory + + +def test_pump_mic_forwards_decoded_audio(monkeypatch): + cascade = _cascade(monkeypatch) + pcm = b"\x01\x02\x03\x04" + browser = FakeBrowser([{"type": "input.audio", "audio": base64.b64encode(pcm).decode()}]) + stt = FakeWS() + + async def drive(): + # recv() returns the one message, then we cancel by feeding a disconnect. + browser._inbound.append(None) # type: ignore[arg-type] + await cascade._pump_mic(browser, stt) + + asyncio.run(drive()) + assert stt.sent == [pcm] + + +def test_pump_mic_ignores_non_audio_and_stops_on_disconnect(monkeypatch): + cascade = _cascade(monkeypatch) + browser = FakeBrowser([{"type": "noise"}, None]) # type: ignore[list-item] + stt = FakeWS() + asyncio.run(cascade._pump_mic(browser, stt)) + assert stt.sent == [] + + +def test_synthesize_streams_audio_frames(monkeypatch): + cascade, deps = _deps( + monkeypatch, + stt=FakeWS(), + tts_frames=[ + {"type": "Begin", "configuration": {"sample_rate": 24000}}, + {"type": "Audio", "audio": "AAA="}, + {"type": "Audio", "audio": "BBB=", "is_final": True}, + ], + llm_text=[], + ) + browser = FakeBrowser() + tts = FakeWS( + [ + {"type": "Begin", "configuration": {"sample_rate": 24000}}, + {"type": "Audio", "audio": "AAA="}, + {"type": "Audio", "audio": "BBB=", "is_final": True}, + ] + ) + asyncio.run(cascade._synthesize(browser, tts, "hi")) + assert browser.sent == [ + {"type": 
"reply.audio", "data": "AAA="}, + {"type": "reply.audio", "data": "BBB="}, + ] + # Generate + ForceFlushTextBuffer + Terminate were sent. + kinds = [json.loads(s)["type"] for s in tts.sent] + assert kinds == ["Generate", "ForceFlushTextBuffer", "Terminate"] + assert tts.closed is True + + +def test_synthesize_raises_on_error_frame(monkeypatch): + cascade = _cascade(monkeypatch) + browser = FakeBrowser() + tts = FakeWS( + [{"type": "Begin", "configuration": {}}, {"type": "Error", "error": "bad voice"}] + ) + with pytest.raises(RuntimeError, match="bad voice"): + asyncio.run(cascade._synthesize(browser, tts, "hi")) + + +def test_synthesize_raises_when_no_begin(monkeypatch): + cascade = _cascade(monkeypatch) + browser = FakeBrowser() + tts = FakeWS([{"type": "Audio", "audio": "AAA=", "is_final": True}]) + with pytest.raises(RuntimeError, match="did not begin"): + asyncio.run(cascade._synthesize(browser, tts, "hi")) + + +def test_generate_reply_speaks_llm_text(monkeypatch): + cascade, deps = _deps( + monkeypatch, + stt=FakeWS(), + tts_frames=[ + {"type": "Begin", "configuration": {}}, + {"type": "Audio", "audio": "AAA=", "is_final": True}, + ], + llm_text=["Hello", " world"], + ) + browser = FakeBrowser() + asyncio.run(cascade._generate_reply(browser, deps, cascade.build_messages("be brief", "hi"))) + assert {"type": "transcript.agent", "text": "Hello world"} in browser.sent + assert {"type": "reply.audio", "data": "AAA="} in browser.sent + assert browser.sent[-1] == {"type": "reply.done", "status": "completed"} + + +def test_generate_reply_empty_llm_emits_done(monkeypatch): + cascade, deps = _deps(monkeypatch, stt=FakeWS(), tts_frames=[], llm_text=[" "]) + browser = FakeBrowser() + asyncio.run(cascade._generate_reply(browser, deps, [])) + assert browser.sent == [{"type": "reply.done", "status": "empty"}] + + +def test_maybe_barge_in_cancels_active_reply(monkeypatch): + cascade = _cascade(monkeypatch) + browser = FakeBrowser() + + async def drive(): + session = 
cascade.Session() + started = asyncio.Event() + + async def never_ending(): + started.set() + await asyncio.Event().wait() + + session.reply_task = asyncio.create_task(never_ending()) + await started.wait() + await cascade.maybe_barge_in(browser, session) + return session + + session = asyncio.run(drive()) + assert browser.sent == [{"type": "input.speech.started"}] + assert session.reply_task is None + + +def test_maybe_barge_in_noop_without_reply(monkeypatch): + cascade = _cascade(monkeypatch) + browser = FakeBrowser() + asyncio.run(cascade.maybe_barge_in(browser, cascade.Session())) + assert browser.sent == [] + + +def test_run_session_unavailable_emits_error(monkeypatch): + cascade = _cascade(monkeypatch) + settings = importlib.import_module("api.settings") + settings.API_KEY = "" + browser = FakeBrowser() + deps = cascade.Deps( + connect_stt=_async_return(FakeWS()), + connect_tts=_async_return(FakeWS()), + llm_stream=lambda _m: iter(()), + settings=settings, + ) + asyncio.run(cascade.run_session(browser, deps)) + assert browser.types() == ["session.error"] + + +def test_run_session_happy_path(monkeypatch): + # STT yields one finalized user turn, then closes -> the reply drains, then the + # session tears down. The greeting speaks first. + stt = FakeWS( + [{"type": "Turn", "transcript": "what time is it", "end_of_turn": True, "turn_is_formatted": True}] + ) + + # Each connect_tts call returns a fresh socket (greeting + reply). + tts_sockets = [ + FakeWS([{"type": "Begin", "configuration": {}}, {"type": "Audio", "audio": "G=", "is_final": True}]), + FakeWS([{"type": "Begin", "configuration": {}}, {"type": "Audio", "audio": "R=", "is_final": True}]), + ] + cascade = _cascade(monkeypatch) + settings = importlib.import_module("api.settings") + settings.API_KEY = "sk-test" + settings.TTS_HOST = "tts.example" + settings.GREETING = "hello!" + settings.SYSTEM_PROMPT = "be brief" + + async def llm_stream(_messages): + yield "It is noon." 
+ + def connect_tts(): + async def factory(): + return tts_sockets.pop(0) + + return factory() + + deps = cascade.Deps( + connect_stt=_async_return(stt), + connect_tts=connect_tts, + llm_stream=llm_stream, + settings=settings, + ) + browser = FakeBrowser() + asyncio.run(asyncio.wait_for(cascade.run_session(browser, deps), timeout=5)) + + types = browser.types() + # Greeting (agent transcript + audio + done), then the user turn, then the reply. + assert types[0] == "transcript.agent" # greeting text + assert {"type": "transcript.user", "text": "what time is it"} in browser.sent + assert {"type": "transcript.agent", "text": "It is noon."} in browser.sent + assert {"type": "reply.audio", "data": "R="} in browser.sent + assert browser.sent[-1] == {"type": "reply.done", "status": "completed"} + assert stt.closed is True +``` + +- [ ] **Step 2: Run to verify it fails** + +Run: `uv run pytest tests/test_init_template_agent_framework.py -q -k "pump_mic or synthesize or generate_reply or barge or run_session"` +Expected: FAIL — `Deps`, `Session`, `run_session`, `_synthesize`, etc. don't exist yet. + +- [ ] **Step 3: Append the orchestrator to `cascade.py`** + +Add to `aai_cli/init/templates/agent-framework/api/cascade.py` (after the helpers): + +```python +@dataclass +class Deps: + """Injected cascade dependencies. 
`Deps.real(settings)` wires the live clients; + tests pass fakes with the same shapes.""" + + connect_stt: Callable[[], Awaitable[Any]] + connect_tts: Callable[[], Awaitable[Any]] + llm_stream: Callable[[list[dict[str, str]]], AsyncIterator[str]] + settings: Any + + @classmethod + def real(cls, settings: Any) -> "Deps": + return cls( + connect_stt=lambda: _connect_stt(settings), + connect_tts=lambda: _connect_tts(settings), + llm_stream=lambda messages: _llm_stream(settings, messages), + settings=settings, + ) + + +class Session: + """Tracks the in-flight reply so a new user turn can barge in and cancel it.""" + + def __init__(self) -> None: + self.reply_task: asyncio.Task[None] | None = None + + async def cancel_reply(self) -> None: + task, self.reply_task = self.reply_task, None + if task is not None and not task.done(): + task.cancel() + with contextlib.suppress(asyncio.CancelledError, Exception): + await task + + async def drain(self) -> None: + """Await the in-flight reply to natural completion (used when STT closes).""" + task = self.reply_task + if task is not None: + with contextlib.suppress(Exception): + await task + + +async def _connect_stt(settings: Any) -> Any: + import websockets + + return await websockets.connect( + stt_url(settings), additional_headers={"Authorization": settings.API_KEY} + ) + + +async def _connect_tts(settings: Any) -> Any: + import websockets + + # max_size=None: a synthesis's Audio frames can exceed the 1 MiB default. 
+ return await websockets.connect( + tts_url(settings), + additional_headers={"Authorization": settings.API_KEY}, + max_size=None, + ) + + +async def _llm_stream(settings: Any, messages: list[dict[str, str]]) -> AsyncIterator[str]: + from openai import AsyncOpenAI + + client = AsyncOpenAI(base_url=settings.LLM_GATEWAY_URL, api_key=settings.API_KEY) + stream = await client.chat.completions.create( + model=settings.MODEL, messages=messages, stream=True + ) + async for chunk in stream: + delta = chunk.choices[0].delta.content + if delta: + yield delta + + +async def _safe_close(conn: Any) -> None: + with contextlib.suppress(Exception): + await conn.close() + + +async def _pump_mic(browser: Any, stt: Any) -> None: + """Forward each base64 mic frame from the browser to the STT socket.""" + while True: + msg = await browser.recv() + if msg is None: + return + audio = msg.get("audio") if msg.get("type") == "input.audio" else None + if isinstance(audio, str): + await stt.send(base64.b64decode(audio)) + + +async def _synthesize(browser: Any, tts: Any, text: str) -> None: + """Drive the TTS protocol on an open socket, forwarding Audio as reply.audio.""" + begin = json.loads(await tts.recv()) + if begin.get("type") != "Begin": + raise RuntimeError(f"TTS did not begin (got {begin.get('type')!r}).") + await tts.send(json.dumps({"type": "Generate", "text": text})) + await tts.send(json.dumps({"type": "ForceFlushTextBuffer"})) + while True: + frame = json.loads(await tts.recv()) + kind = frame.get("type") + if kind == "Audio": + await browser.send({"type": "reply.audio", "data": frame.get("audio", "")}) + if frame.get("is_final"): + break + elif kind == "Error": + raise RuntimeError(frame.get("error") or "TTS error") + with contextlib.suppress(Exception): + await tts.send(json.dumps({"type": "Terminate"})) + + +async def _speak(browser: Any, deps: Deps, text: str) -> None: + """Emit agent text, synthesize it, and mark the reply done.""" + await browser.send({"type": 
"transcript.agent", "text": text}) + tts = await deps.connect_tts() + try: + await _synthesize(browser, tts, text) + finally: + await _safe_close(tts) + await browser.send({"type": "reply.done", "status": "completed"}) + + +async def _generate_reply(browser: Any, deps: Deps, messages: list[dict[str, str]]) -> None: + """Stream the LLM reply, then speak it. Errors surface as session.error.""" + try: + text = "".join([delta async for delta in deps.llm_stream(messages)]).strip() + if not text: + await browser.send({"type": "reply.done", "status": "empty"}) + return + await _speak(browser, deps, text) + except asyncio.CancelledError: + raise + except Exception as exc: # noqa: BLE001 — any leg failure becomes one clean event + await browser.send({"type": "session.error", "message": str(exc)}) + + +async def maybe_barge_in(browser: Any, session: Session) -> None: + """If a reply is playing, tell the browser to stop and cancel it.""" + if session.reply_task is not None and not session.reply_task.done(): + await browser.send({"type": "input.speech.started"}) + await session.cancel_reply() + + +async def _pump_stt(browser: Any, stt: Any, deps: Deps, session: Session) -> None: + """Read STT turns: emit user transcripts, reply on finalized turns, barge in on + interim speech, and drain the last reply when the socket closes.""" + async for raw in stt: + msg = json.loads(raw) + if msg.get("type") != "Turn": + continue + text = msg.get("transcript", "") + if not text: + continue + await browser.send({"type": "transcript.user", "text": text}) + if is_final_user_turn(msg): + await session.cancel_reply() + session.reply_task = asyncio.create_task( + _generate_reply(browser, deps, build_messages(deps.settings.SYSTEM_PROMPT, text)) + ) + else: + await maybe_barge_in(browser, session) + await session.drain() + + +async def run_session(browser: Any, deps: Deps) -> None: + """Run one browser session: greet, then cascade STT -> LLM -> TTS until either + side closes. 
All credentials stay server-side.""" + reason = unavailable_reason(deps.settings) + if reason is not None: + await browser.send({"type": "session.error", "message": reason}) + return + try: + stt = await deps.connect_stt() + except Exception as exc: # noqa: BLE001 + await browser.send({"type": "session.error", "message": f"Could not start the session: {exc}"}) + return + + session = Session() + session.reply_task = asyncio.create_task(_speak(browser, deps, deps.settings.GREETING)) + mic = asyncio.create_task(_pump_mic(browser, stt)) + listen = asyncio.create_task(_pump_stt(browser, stt, deps, session)) + try: + await asyncio.wait({mic, listen}, return_when=asyncio.FIRST_COMPLETED) + finally: + mic.cancel() + listen.cancel() + await asyncio.gather(mic, listen, return_exceptions=True) + await session.cancel_reply() + await _safe_close(stt) +``` + +- [ ] **Step 4: Run to verify it passes** + +Run: `uv run pytest tests/test_init_template_agent_framework.py -q -k "pump_mic or synthesize or generate_reply or barge or run_session"` +Expected: PASS. If `test_run_session_happy_path` is flaky on task scheduling, it should not be — the greeting reply is set before pumps start and `_pump_stt` drains the reply before returning, and the mic pump blocks on `FakeBrowser`'s idle event. If a hang occurs, the `asyncio.wait_for(..., timeout=5)` fails loudly rather than wedging. + +- [ ] **Step 5: Format + lint the template module** + +Run: `uv run ruff format aai_cli/init/templates/agent-framework/api/cascade.py && uv run ruff check aai_cli/init/templates/agent-framework/api/cascade.py` +Expected: clean (S105/TID251 are ignored for templates; the `# noqa: BLE001` keeps the broad-except lines clean). 
+ +- [ ] **Step 6: Commit** + +```bash +git add aai_cli/init/templates/agent-framework/api/cascade.py tests/test_init_template_agent_framework.py +git commit -m "feat(agent-framework): cascade orchestrator" +``` + +--- + +## Task 6: `api/index.py` — FastAPI app + WebSocket adapter + +**Files:** +- Create: `aai_cli/init/templates/agent-framework/api/index.py` +- Modify: `aai_cli/init/templates/agent-framework/api/cascade.py` (add `FastAPIBrowser`) +- Test: `tests/test_init_template_agent_framework.py` (append) + +- [ ] **Step 1: Write the failing tests** + +Append to `tests/test_init_template_agent_framework.py`: + +```python +def test_index_serves_page(monkeypatch): + index = _load("api.index", monkeypatch, ASSEMBLYAI_API_KEY="sk-test") + from fastapi.testclient import TestClient + + resp = TestClient(index.app).get("/") + assert resp.status_code == 200 + assert " None: + self._ws = websocket + + async def send(self, event: dict[str, Any]) -> None: + await self._ws.send_json(event) + + async def recv(self) -> dict[str, Any] | None: + from fastapi import WebSocketDisconnect + + try: + return await self._ws.receive_json() + except WebSocketDisconnect: + return None +``` + +- [ ] **Step 4: Write `api/index.py`** + +Create `aai_cli/init/templates/agent-framework/api/index.py`: + +```python +"""Talk to a cascaded voice agent — AssemblyAI agent-framework starter (FastAPI). + +The browser opens one WebSocket to this backend, which runs the cascade itself — +Streaming STT -> LLM Gateway -> streaming TTS — so your API key never reaches the +client. Streaming TTS is sandbox-only, so scaffold with `assembly --sandbox init +agent-framework` and use a sandbox key. 
+ + WS /ws <- {"type":"input.audio","audio":<base64>} ; -> transcripts + reply.audio +""" + +from __future__ import annotations + +from pathlib import Path + +from fastapi import FastAPI, WebSocket +from fastapi.responses import FileResponse +from fastapi.staticfiles import StaticFiles + +from api import cascade, settings + +ROOT = Path(__file__).resolve().parent.parent +STATIC = ROOT / "static" +app = FastAPI() +app.mount("/static", StaticFiles(directory=STATIC), name="static") + + +@app.get("/") +def index() -> FileResponse: + return FileResponse(STATIC / "index.html") + + +@app.websocket("/ws") +async def ws(websocket: WebSocket) -> None: + """Accept the browser socket and run one cascade session over it.""" + await websocket.accept() + browser = cascade.FastAPIBrowser(websocket) + await cascade.run_session(browser, cascade.Deps.real(settings)) +``` + +- [ ] **Step 5: Run to verify it passes** + +Run: `uv run pytest tests/test_init_template_agent_framework.py -q` +Expected: PASS (all tests). Also confirm the registry directory test now passes: +Run: `uv run pytest tests/test_init_templates.py -q` +Expected: PASS. + +- [ ] **Step 6: Format + lint** + +Run: `uv run ruff format aai_cli/init/templates/agent-framework/api/ && uv run ruff check aai_cli/init/templates/agent-framework/api/` +Expected: clean. 
+ +- [ ] **Step 7: Commit** + +```bash +git add aai_cli/init/templates/agent-framework/api/index.py aai_cli/init/templates/agent-framework/api/cascade.py tests/test_init_template_agent_framework.py +git commit -m "feat(agent-framework): FastAPI app + websocket adapter" +``` + +--- + +## Task 7: Frontend — `index.html` + `app.js` + +**Files:** +- Create: `aai_cli/init/templates/agent-framework/static/index.html` +- Create: `aai_cli/init/templates/agent-framework/static/app.js` + +- [ ] **Step 1: Write `static/index.html`** + +Create `aai_cli/init/templates/agent-framework/static/index.html` (same structure/IDs/classes as voice-agent, cascade-worded copy): + +```html + + + + + + Talk to a cascaded voice agent · AssemblyAI + + + +
+ + + + + + +
+ + +
+ +
+ + +
+ + + + + +``` + +- [ ] **Step 2: Write `static/app.js`** + +Create `aai_cli/init/templates/agent-framework/static/app.js`. Same event handling as voice-agent's `onEvent`/`addTurn`/`bargeIn` (so `audio.js` and the UI carry over), but it opens the same-origin `/ws` directly — no token fetch, no `session.update`: + +```javascript +const SESSION_CONFIG = { + inputSampleRate: 16000, + outputSampleRate: 24000, + processorBufferSize: 4096, + microphone: { audio: { echoCancellation: true, noiseSuppression: false } }, +}; + +const connBtn = document.getElementById("conn"); +const statusEl = document.getElementById("status"); +const logEl = document.getElementById("log"); + +let ws = null; +let micPipeline = null; +let player = null; +let connected = false; + +connBtn.addEventListener("click", () => + connected ? hangup() : connect().catch(fail), +); + +function setStatus(message, state) { + statusEl.textContent = message; + statusEl.dataset.state = state; +} + +function wsUrl() { + const scheme = location.protocol === "https:" ? 
"wss" : "ws"; + return `${scheme}://${location.host}/ws`; +} + +async function connect() { + setStatus("Connecting...", "idle"); + ws = new WebSocket(wsUrl()); + ws.onopen = () => startMic().catch(fail); + ws.onmessage = (event) => onEvent(JSON.parse(event.data)); + ws.onerror = () => fail("WebSocket error"); + ws.onclose = () => { + if (connected) hangup(); + }; +} + +async function startMic() { + const stream = await navigator.mediaDevices.getUserMedia( + SESSION_CONFIG.microphone, + ); + micPipeline = AudioHelpers.createMicrophonePipeline(stream, { + bufferSize: SESSION_CONFIG.processorBufferSize, + }); + player = AudioHelpers.createPcmPlayer({ + sampleRate: SESSION_CONFIG.outputSampleRate, + }); + await player.resume(); + await micPipeline.start((frame, sampleRate) => { + if (!ws || ws.readyState !== WebSocket.OPEN) return; + const pcm = AudioHelpers.downsampleToPCM( + frame, + sampleRate, + SESSION_CONFIG.inputSampleRate, + ); + ws.send( + JSON.stringify({ + type: "input.audio", + audio: AudioHelpers.bytesToB64(pcm), + }), + ); + }); + + connected = true; + connBtn.textContent = "■ Hang up"; + connBtn.dataset.state = "connected"; + setStatus("● Connected - just talk", "live"); +} + +function onEvent(event) { + switch (event.type) { + case "transcript.user": + return addTurn("you", "You", event.text); + case "transcript.agent": + return addTurn("agent", "Agent", event.text); + case "reply.audio": + return player.playBase64Chunk(event.data); + case "input.speech.started": + return bargeIn(); + case "reply.done": + if (event.status === "interrupted") bargeIn(); + return; + case "session.error": + return fail(event.message || "session error"); + } +} + +function bargeIn() { + if (player) player.stopQueuedAudio(); +} + +function addTurn(speakerKind, speaker, text) { + if (!text) return; + const turn = document.createElement("div"); + turn.className = "conversation-turn"; + turn.dataset.speaker = speakerKind; + const who = document.createElement("span"); + 
who.className = "turn-speaker"; + who.textContent = speaker + ": "; + turn.append(who, document.createTextNode(text)); + logEl.appendChild(turn); + turn.scrollIntoView({ block: "end" }); +} + +function hangup() { + connected = false; + connBtn.textContent = "● Connect"; + connBtn.dataset.state = "idle"; + setStatus("Disconnected", "idle"); + bargeIn(); + if (ws && ws.readyState === WebSocket.OPEN) ws.close(); + if (micPipeline) micPipeline.close(); + if (player) player.close(); + ws = null; + micPipeline = null; + player = null; +} + +function fail(message) { + setStatus("Error: " + message, "error"); + if (connected) hangup(); +} +``` + +- [ ] **Step 3: Prettier-format the JS/CSS (the gate runs `prettier --check`)** + +Run: `prettier --write "aai_cli/init/templates/agent-framework/static/*.js" "aai_cli/init/templates/agent-framework/static/*.css"` +Then verify: `prettier --check "aai_cli/init/templates/agent-framework/static/*.{js,css}"` +Expected: "All matched files use Prettier code style!" + +- [ ] **Step 4: Verify the frontend↔backend route contract + static refs** + +Run: `uv run pytest "tests/test_init_template_contract.py::test_static_assets_referenced_by_html_exist[agent-framework]" "tests/test_init_template_contract.py::test_frontend_routes_exist_in_backend[agent-framework]" -q` +Expected: PASS (the page references `styles.css`/`audio.js`/`app.js`, all present; it fetches no `/api/*` path — it uses a WebSocket — so the route check is satisfied trivially). 
+ +- [ ] **Step 5: Commit** + +```bash +git add aai_cli/init/templates/agent-framework/static/index.html aai_cli/init/templates/agent-framework/static/app.js +git commit -m "feat(agent-framework): frontend (cascade UI + /ws client)" +``` + +--- + +## Task 8: Scaffold parity files (deploy + docs + deps) + +**Files (all create under `aai_cli/init/templates/agent-framework/`):** +- `requirements.txt`, `env.example`, `gitignore`, `runtime.txt`, `vercel.json`, `Procfile`, `Dockerfile`, `dockerignore`, `README.md`, `AGENTS.md` + +- [ ] **Step 1: `requirements.txt`** + +```text +fastapi>=0.136.3 +uvicorn>=0.30.0 +websockets>=14.1 +openai>=1.54.0 +python-dotenv>=1.2.2 +# Pin starlette directly: FastAPI's own floor still admits versions with known CVEs, +# so raise the transitive floor above them. +starlette>=1.2.1 +``` + +(`websockets` uses `additional_headers`, supported from 14.x; if the `install` test reports it unsupported, bump the floor. `openai>=1.54.0` provides `AsyncOpenAI` + streamed `chat.completions`.) 
+ +- [ ] **Step 2: `env.example`** + +```text +ASSEMBLYAI_API_KEY=your_assemblyai_api_key_here +# This cascade uses streaming TTS, which is sandbox-only — use a sandbox key and the +# sandbox hosts (assembly --sandbox init agent-framework fills these in for you): +# ASSEMBLYAI_STREAMING_HOST=streaming.sandbox000.assemblyai-labs.com +# ASSEMBLYAI_TTS_HOST=streaming-tts.sandbox000.assemblyai-labs.com +# ASSEMBLYAI_LLM_GATEWAY_URL=https://llm-gateway.sandbox000.assemblyai-labs.com/v1 +``` + +- [ ] **Step 3: `gitignore`, `runtime.txt`, `vercel.json`, `dockerignore` (copy voice-agent's shapes)** + +Run: + +```bash +SRC=aai_cli/init/templates/voice-agent +DST=aai_cli/init/templates/agent-framework +cp "$SRC/gitignore" "$DST/gitignore" +cp "$SRC/runtime.txt" "$DST/runtime.txt" +cp "$SRC/vercel.json" "$DST/vercel.json" +cp "$SRC/dockerignore" "$DST/dockerignore" +``` + +- [ ] **Step 4: `Procfile`** + +```text +web: python -m uvicorn api.index:app --host 0.0.0.0 --port ${PORT:-3000} +``` + +- [ ] **Step 5: `Dockerfile` (copy voice-agent's — it already satisfies the contract: EXPOSE 8080, `${PORT:-8080}`, non-root USER)** + +Run: `cp aai_cli/init/templates/voice-agent/Dockerfile aai_cli/init/templates/agent-framework/Dockerfile` + +- [ ] **Step 6: `README.md`** + +```markdown +# Talk to a cascaded voice agent — AssemblyAI agent-framework starter + +Click connect and talk. Unlike the `voice-agent` template (which uses AssemblyAI's +all-in-one Voice Agent API), this app is a **cascade your own backend orchestrates**: +Streaming STT transcribes you, the LLM Gateway generates a reply, and streaming TTS +speaks it back — with turn detection and barge-in handled server-side. The browser +holds one WebSocket to your backend, so your API key never reaches the client. + +## Sandbox-only + +Streaming TTS has no production host, so the whole cascade runs against the AssemblyAI +sandbox with a sandbox key. 
Scaffold it that way: + +```sh +assembly --sandbox init agent-framework +``` + +That pins the sandbox hosts in `.env`. Running against production exits with a hint. + +## Run locally + +```sh +assembly dev # opens http://localhost:3000 (allow microphone access; headphones recommended) +``` + +`ASSEMBLYAI_API_KEY` is read from `.env` (created for you by `assembly init`). + +## Deploy + +This app keeps a **long-running WebSocket**, so it needs a persistent process — not +Vercel's serverless functions. Use the shipped `Procfile`/`Dockerfile` on Render, +Railway, Fly.io, or Google Cloud Run (`gcloud run deploy --source .`): + +```sh +uvicorn api.index:app --host 0.0.0.0 --port $PORT +``` + +Set `ASSEMBLYAI_API_KEY` and the three sandbox host vars (`ASSEMBLYAI_STREAMING_HOST`, +`ASSEMBLYAI_TTS_HOST`, `ASSEMBLYAI_LLM_GATEWAY_URL`) in the platform's environment. + +## Ideas to extend + +- Change the `MODEL`, `VOICE`, `SYSTEM_PROMPT`, or `GREETING` in `api/settings.py`. +- Stream each LLM sentence into TTS as it completes (lower latency) instead of + synthesizing the whole reply at once — see `_generate_reply` in `api/cascade.py`. +- Add tools (function calling) on the LLM leg so the agent can look things up. +``` + +- [ ] **Step 7: `AGENTS.md` (must contain `ASSEMBLYAI_API_KEY`, `buildless`, `static/app.js` for the contract)** + +```markdown +# Agent Notes + +This is a buildless FastAPI + browser starter for a **cascaded** voice agent +(Streaming STT -> LLM Gateway -> streaming TTS), orchestrated server-side. Run it with: + +```sh +assembly dev +``` + +## Map + +- `api/settings.py`: API key, hosts, model, voice, system prompt, greeting, sample rates. +- `api/cascade.py`: the orchestrator — STT/TTS socket helpers, the LLM stream, turn + detection, barge-in, and the `/ws` browser adapter. Built with injected `Deps` so it + is tested against fakes. +- `api/index.py`: FastAPI app — serves the page/assets and the `/ws` WebSocket. 
+- `static/app.js`: WebSocket lifecycle, mic capture, UI state, and event handling + (the `SESSION_CONFIG` block at the top is the primary edit point). +- `static/audio.js`: microphone pipeline, PCM conversion, playback queue, barge-in. +- `static/styles.css`: visual styling only; the top `:root` block is the theme edit point. +- `static/index.html`: page structure and static asset links. + +## Change Points + +- Model, voice, prompt, greeting, sample rates: edit `api/settings.py`. +- Cascade behavior (turn detection, barge-in, LLM->TTS piping): edit `api/cascade.py`. +- Transcript log rendering: edit `addTurn` in `static/app.js`. +- Playback, barge-in, or PCM conversion: edit `static/audio.js`. + +## Invariants + +- Never expose `ASSEMBLYAI_API_KEY` or any server secret in `static/`. +- Streaming TTS is sandbox-only; keep this app pointed at the sandbox hosts. +- `reply.audio` carries base64 PCM on the `data` field. +- The browser ↔ backend event protocol matches the `voice-agent` template — keep it + stable so `static/audio.js` and the UI stay reusable. +- Keep the app buildless unless the user explicitly asks for a frontend toolchain. +``` + +- [ ] **Step 8: Run the full parametrized contract suite for this template** + +Run: `uv run pytest tests/test_init_template_contract.py tests/test_init_template_serve.py -q -k agent-framework` +Expected: PASS for every parametrized case (`agent-framework`): required files, vercel framework pin, Dockerfile shape, dockerignore `.env`, no `public/`, Procfile, runtime pin, static refs, AGENTS edit points, no committed dotenv, requirements cover imports + pinned, root + static assets served. 
+ +- [ ] **Step 9: Commit** + +```bash +git add aai_cli/init/templates/agent-framework/requirements.txt aai_cli/init/templates/agent-framework/env.example aai_cli/init/templates/agent-framework/gitignore aai_cli/init/templates/agent-framework/runtime.txt aai_cli/init/templates/agent-framework/vercel.json aai_cli/init/templates/agent-framework/Procfile aai_cli/init/templates/agent-framework/Dockerfile aai_cli/init/templates/agent-framework/dockerignore aai_cli/init/templates/agent-framework/README.md aai_cli/init/templates/agent-framework/AGENTS.md +git commit -m "feat(agent-framework): deploy, docs, and dependency scaffold" +``` + +--- + +## Task 9: Regenerate snapshots + full gate + +**Files:** +- Modify: `tests/__snapshots__/test_snapshots_help_build.ambr` (regenerated) + +- [ ] **Step 1: Regenerate the `--help` snapshots (the init arg help now lists the new template)** + +Run: `uv run pytest tests/test_snapshots_help_build.py --snapshot-update -q` +Then review: `git diff tests/__snapshots__/test_snapshots_help_build.ambr` +Expected: the only change is `agent-framework` appended to the `init` template enumeration. If other help snapshot files changed, regenerate them too (`uv run pytest -k snapshots_help --snapshot-update`). + +- [ ] **Step 2: Run the targeted suites green** + +Run: `uv run pytest tests/test_init_template_agent_framework.py tests/test_init_templates.py tests/test_init_command.py tests/test_init_template_contract.py tests/test_init_template_serve.py -q` +Expected: all PASS. + +- [ ] **Step 3: Run the install smoke test for this template (network + uv required)** + +Run: `uv run pytest -m install -q -k agent-framework` +Expected: PASS (requirements install into a clean venv and `api.index` imports). If `websockets`/`openai` floors are wrong, bump them in `requirements.txt` and re-run. 
+ +- [ ] **Step 4: Run the full gate** + +Run: `./scripts/check.sh` +Expected: ends with `All checks passed.` Watch specifically for: +- `prettier` (template JS/CSS) — clean. +- `ruff`/`ruff format` over `api/*.py` — clean. +- `diff-cover` 100% patch coverage — every new `cascade.py`/`index.py`/`settings.py` line is covered by Task 3–6 tests. If a line is reported uncovered, add a direct assertion (do not add `pragma: no cover` for reachable orchestration lines). +- mutation gate — a surviving mutant means a changed line lacks a *failing-on-break* assertion; strengthen the relevant test. +- the init template contract gate + unused snapshot/fixture gate. + +- [ ] **Step 5: Commit the regenerated snapshot (only if not already committed) and finalize** + +```bash +git add tests/__snapshots__/test_snapshots_help_build.ambr +git commit -m "test(init): refresh --help snapshot for agent-framework template" +``` + +--- + +## Self-review notes (resolved) + +- **Spec coverage:** every spec section maps to a task — architecture/orchestrator (T4–T6), components (T2–T8), CLI wiring (T1), deploy/sandbox caveats (T8 README/AGENTS/settings guard), error handling (T5 `session.error` paths), testing (T3–T9). +- **Import-time safety:** `settings.py` never raises (T3 test `test_settings_imports_without_key_or_tts_host`); the availability guard is in `run_session` (T5). +- **Coverage/mutation burden:** orchestrator is decomposed into directly-testable units (`unavailable_reason`, `stt_url`, `tts_url`, `is_final_user_turn`, `build_messages`, `_pump_mic`, `_synthesize`, `_speak`, `_generate_reply`, `maybe_barge_in`, `_pump_stt`, `run_session`, `FastAPIBrowser`), each with an asserting test. +- **Naming consistency:** `Deps`, `Session`, `run_session`, `_synthesize`, `_generate_reply`, `_speak`, `_pump_mic`, `_pump_stt`, `maybe_barge_in`, `FastAPIBrowser` used identically across tasks and tests. 
+- **Gated assertions:** the exact help-string test (T1) and the `--help` snapshot (T9) are both updated for the new registry entry. +``` diff --git a/docs/superpowers/specs/2026-06-15-agent-framework-template-design.md b/docs/superpowers/specs/2026-06-15-agent-framework-template-design.md new file mode 100644 index 00000000..c12ba079 --- /dev/null +++ b/docs/superpowers/specs/2026-06-15-agent-framework-template-design.md @@ -0,0 +1,134 @@ +# `agent-framework` init template — design + +**Date:** 2026-06-15 +**Status:** Approved (design); pending implementation plan + +## Goal + +Add a fourth `assembly init` starter template, `agent-framework`, that delivers the +same browser UI/UX as the existing `voice-agent` template but is built on a +**cascaded** architecture instead of AssemblyAI's all-in-one Voice Agent endpoint. +The cascade wires three primitives together server-side: + +1. **Streaming STT** (v3 realtime WebSocket) — speech in, turn detection. +2. **LLM Gateway** (OpenAI-compatible HTTP) — reply generation. +3. **Streaming TTS** (sandbox WebSocket) — speech out. + +This is the "framework" you would build yourself if the managed Voice Agent did not +exist, so it is a useful, instructive starter for users who want control over each leg. + +## Architecture + +``` +Browser ──mic PCM (16k)──► FastAPI /ws ──audio bytes──► STT WS (v3) + ◄──transcripts──── │ ◄──Turn/end_of_turn──┘ + ◄──reply.audio (24k)── ├──finalized turn──► LLM Gateway (OpenAI-compatible, streamed) + └──reply text──► TTS WS (sandbox) ──Audio──► back to browser +``` + +The browser opens **one** same-origin WebSocket to our FastAPI backend. The backend +runs the full cascade and keeps all three API credentials server-side. No token mint +is needed (unlike `voice-agent`/`live-captions`, which mint short-lived tokens because +the browser connects directly to AssemblyAI). 
+ +### Browser ↔ backend protocol (identical to `voice-agent`) + +Reusing the existing event vocabulary keeps `static/audio.js` unchanged and reduces +`static/app.js` to a connection-setup change. + +- Browser → server: + - `{type: "input.audio", audio: <base64>}` — one mic frame. + - `{type: "session.update", session: {...}}` — optional; the backend may apply + `system_prompt`/`greeting`/`voice` overrides or ignore it. Kept for parity. +- Server → browser: + - `{type: "transcript.user", text}` — STT transcript (partial and final). + - `{type: "transcript.agent", text}` — the LLM reply text. + - `{type: "reply.audio", data: <base64>}` — a TTS audio chunk. + - `{type: "input.speech.started"}` — barge-in: user started talking; browser stops + queued audio. + - `{type: "reply.done", status}` — reply finished (or `interrupted`). + - `{type: "session.error", message}` — any leg failed; surfaced in the UI. + +## Components (template files) + +- `api/index.py` — FastAPI app. Serves `index.html` + `/static`, and exposes + `@app.websocket("/ws")` which hands each accepted connection to the orchestrator. +- `api/settings.py` — config from env: `ASSEMBLYAI_API_KEY`, `ASSEMBLYAI_STREAMING_HOST`, + `ASSEMBLYAI_TTS_HOST`, `ASSEMBLYAI_LLM_GATEWAY_URL`, model (`claude-haiku-4-5-20251001`), + voice (`ivy`), system prompt, greeting, sample rates (16 kHz in, 24 kHz out). Fails + fast with an actionable message when `ASSEMBLYAI_TTS_HOST` is empty (production has no + streaming-TTS host). +- `api/cascade.py` — per-session async orchestrator: + - Opens the STT WS (API key auth) and forwards mic bytes from the browser. + - Reads STT `Turn` events: emits `transcript.user` for partials; on `end_of_turn` + (formatted final) triggers the reply pipeline. 
+ - Reply pipeline: streams the LLM completion (emitting `transcript.agent`), pipes the + reply text into a TTS WS (Begin → Generate → ForceFlushTextBuffer → Audio frames → + Terminate, mirroring `aai_cli/tts/session.py`), and forwards each Audio frame as + `reply.audio`. + - Barge-in: a new non-empty user partial while a reply is in flight emits + `input.speech.started` and cancels the in-flight LLM/TTS task. + - Speaks the configured greeting on connect (greeting text → TTS → `reply.audio`). + - Tears down cleanly on browser disconnect / socket close / LLM error, cancelling + sibling tasks. +- `static/index.html` — copy of `voice-agent`'s page with the eyebrow/title/subtitle + reworded to describe the cascade; IDs/classes unchanged. +- `static/styles.css` — identical to `voice-agent`. +- `static/audio.js` — identical to `voice-agent` (mic pipeline, PCM player, downsample, + base64 helpers). +- `static/app.js` — same event handling as `voice-agent`; `connect()` opens a same-origin + `/ws` directly (no `/api/token` fetch). +- Scaffold parity files: `README.md`, `AGENTS.md`, `env.example`, `gitignore`, + `requirements.txt` (adds `websockets` + `openai` to the FastAPI/uvicorn base), + `Procfile`, `Dockerfile`, `dockerignore`, `runtime.txt`, `vercel.json`. + +## Stack + +Async throughout: the `websockets` async client (STT + TTS), `openai.AsyncOpenAI` +pointed at the gateway base (streamed completion), and FastAPI/Starlette WebSockets for +the browser side. Served as a long-lived process by `uvicorn`. + +## CLI wiring (shared edits — unavoidable for a new template) + +- `aai_cli/init/templates.py` — add `"agent-framework": "Agent Framework"` to `TEMPLATES` + and to `TEMPLATE_ORDER`. +- `aai_cli/app/init_exec.py` — add `"ASSEMBLYAI_TTS_HOST": env.streaming_tts_host` to + `_active_env_vars()`. This appends one extra (unused, empty-in-prod) var to every + template's `.env`; harmless to the others and required by `agent-framework`. 
+ +These are the standard registration touch-points for a template; the "a new command +edits no shared file" rule applies to commands, not templates. + +## Deploy / operational caveats + +- **Sandbox-only.** Streaming TTS has no production host (`streaming_tts_host` is empty + in `production`). A credential is valid only against the environment that minted it, + so the *entire* cascade must point at `sandbox000` with a sandbox key. The README + leads with `assembly --sandbox init agent-framework`, which pins all three hosts to + sandbox via `_active_env_vars()`. Running against production exits fast with a + `--sandbox` hint. +- **Not Vercel-serverless.** The persistent browser WebSocket needs a long-lived + process, so the primary deploy path is the shipped `Procfile`/`Dockerfile` (Render, + Railway, Fly, Cloud Run). `vercel.json` is retained for static parity, but the README + is explicit that the WebSocket requires a long-running host. + +## Error handling + +Every leg maps a failure to a single `session.error` event to the browser (mirroring +`voice-agent`). The orchestrator cancels sibling tasks on browser disconnect, STT/TTS +socket close, or LLM error, so a session never leaks tasks or sockets. + +## Testing + +The parametrized init-template contract tests (`tests/test_init_template_*.py`) cover +the new template automatically once it is in `TEMPLATE_ORDER`: required files present, +renamed dotfiles (`gitignore` → `.gitignore`, `env.example`), wheel packaging, and +ruff/prettier cleanliness. The plan will confirm exactly what those contracts assert and +add template-specific coverage where needed (notably the new `ASSEMBLYAI_TTS_HOST` env +var and the prod fail-fast path). + +## Out of scope (YAGNI) + +- Function calling / tools on the LLM leg (left as an "ideas to extend" note). +- Sentence-level TTS streaming tuning beyond what is needed for acceptable latency. +- A production TTS path (does not exist yet). 
diff --git a/pyproject.toml b/pyproject.toml index 73047a80..d614d42d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -161,8 +161,9 @@ markers = [ [tool.mypy] python_version = "3.12" files = ["aai_cli", "tests"] -# Init templates are packaged scaffold assets, not importable package modules. -exclude = ["^aai_cli/init/templates/"] +# Init templates ARE type-checked: they're importable packages +# (aai_cli.init.templates..api.*) whose api/ code must stay strict-clean against +# the real SDK types, not just shipped as scaffold text. # Third-party deps (assemblyai, sounddevice) ship no type stubs. ignore_missing_imports = true disallow_untyped_defs = true @@ -213,7 +214,12 @@ disable_error_code = ["annotation-unchecked"] # pyrightconfig.tests.json from scripts/check.sh so pytest fixtures/mocks don't # create thousands of low-value strict diagnostics. include = ["aai_cli"] -exclude = ["aai_cli/init/templates/**"] +# Re-list pyright's built-in excludes explicitly (defining any `exclude` drops the +# defaults, which pyright warns about). NOTE: init templates are deliberately NOT here — +# their api/ code is strict-clean and type-checked in-tree +# (aai_cli.init.templates..api.*), the same bar as the rest of the package; only +# generated/hidden dirs are skipped. +exclude = ["**/node_modules", "**/__pycache__", "**/.*"] pythonVersion = "3.12" typeCheckingMode = "strict" # Third-party deps (assemblyai, sounddevice) ship no type stubs. @@ -224,6 +230,28 @@ reportMissingTypeStubs = false venvPath = "." venv = ".venv" +# Editors (Pylance) read this config and also analyze whatever test file you have +# open. The `include` above scopes the *gate's* `pyright` run to aai_cli (strict), but +# the editor checks open tests too — and applies the strict mode above, surfacing +# thousands of low-value pytest fixture/mock diagnostics (untyped `monkeypatch`, +# unknown member/parameter types, …). 
`executionEnvironments` can't set +# `typeCheckingMode`, so we silence exactly the strict-only "unknown type" family for +# tests/ — matching the standard mode the gate uses for tests (pyrightconfig.tests.json +# in scripts/check.sh). Editor-facing only: the gate's `pyright` run never analyzes +# tests/ (not in `include`), so this changes nothing about what the gate checks. +[[tool.pyright.executionEnvironments]] +root = "tests" +reportUnknownParameterType = "none" +reportMissingParameterType = "none" +reportUnknownMemberType = "none" +reportUnknownVariableType = "none" +reportUnknownArgumentType = "none" +reportUnknownLambdaType = "none" +reportPrivateUsage = "none" +reportUnusedFunction = "none" +reportMissingTypeArgument = "none" +reportUntypedNamedTuple = "none" + [tool.ruff] line-length = 100 target-version = "py312" @@ -336,7 +364,14 @@ max-statements = 40 # Template constants include URL path names such as TOKEN_PATH, not credentials. # TID251: the scaffolds are end-user example apps that read their own config straight # from os.environ — that's correct, idiomatic code to ship, not a CLI-internal env read. -"aai_cli/init/templates/**" = ["S105", "TID251"] +# BLE001: starter apps funnel any leg failure into one user-facing error event/response +# (a broad `except Exception` is the right shape to ship), so the blind-except lint +# doesn't apply to scaffolds. +# TID252: scaffolds ship as a self-contained top-level `api/` package, so their inner +# imports must be relative (`from . import settings`) — that's the one form that resolves +# both in the shipped app (`uvicorn api.index:app`) and when type-checked in-tree as +# aai_cli.init.templates.<name>.api. Absolute `from api import …` can't satisfy both. +"aai_cli/init/templates/**" = ["S105", "TID251", "BLE001", "TID252"] # ENV_CLIENT_TOKEN holds an env-var *name*; the shipped token constant is empty in # source (release builds inject the write-only client token).
"aai_cli/core/telemetry.py" = ["S105"] diff --git a/scripts/check.sh b/scripts/check.sh index cbd8e511..1dffd1f4 100755 --- a/scripts/check.sh +++ b/scripts/check.sh @@ -21,6 +21,13 @@ cleanup_generated_code_dir() { echo "==> uv lock freshness" uv lock --check +echo "==> validate-pyproject (pyproject.toml schema)" +# Validate pyproject's standardized tables ([build-system]/[project]) against the PyPA +# JSON schemas. Run via uvx (like twine/codespell below) so it needs no dev-dep/uv.lock +# entry; --with packaging enables full requirement/license-expression checks. Unknown +# [tool.*] tables (ruff/mypy/pyright/…) are intentionally left to those tools. +uvx --with "packaging>=24.2" validate-pyproject pyproject.toml + echo "==> ruff check (src + tests)" uv run ruff check . diff --git a/scripts/docs_consistency_gate.py b/scripts/docs_consistency_gate.py index 6d495e37..623c0a61 100644 --- a/scripts/docs_consistency_gate.py +++ b/scripts/docs_consistency_gate.py @@ -34,6 +34,7 @@ "ASSEMBLYAI_LLM_GATEWAY_URL", "ASSEMBLYAI_STREAMING_HOST", "ASSEMBLYAI_AGENTS_HOST", + "ASSEMBLYAI_TTS_HOST", } _VAR_RE = re.compile(r"\b((?:AAI|ASSEMBLYAI)_[A-Z0-9_]+)\b") diff --git a/scripts/template_contract_gate.py b/scripts/template_contract_gate.py index d821802a..fe084dc9 100644 --- a/scripts/template_contract_gate.py +++ b/scripts/template_contract_gate.py @@ -44,7 +44,15 @@ def _fail(message: str) -> NoReturn: def _template_dirs() -> dict[str, Path]: - dirs = {path.name: path for path in _ROOT.iterdir() if path.is_dir()} + # On-disk dirs are underscore package names; registry ids are kebab. Map each + # shipped dir back to its kebab id so both sets compare in the id namespace. + # Templates are now importable packages, so importing them creates __pycache__ + # alongside the template dirs — skip dunder dirs (matches the registry tests). 
+ dirs = { + path.name.replace("_", "-"): path + for path in _ROOT.iterdir() + if path.is_dir() and not path.name.startswith("__") + } registered = set(templates.TEMPLATES) shipped = set(dirs) missing = registered - shipped diff --git a/tests/__snapshots__/test_snapshots_help_build.ambr b/tests/__snapshots__/test_snapshots_help_build.ambr index 3437689b..14447ca7 100644 --- a/tests/__snapshots__/test_snapshots_help_build.ambr +++ b/tests/__snapshots__/test_snapshots_help_build.ambr @@ -89,8 +89,8 @@ ╭─ Arguments ──────────────────────────────────────────────────────────────────╮ │ template [TEMPLATE] Template to scaffold: audio-transcription, │ - │ live-captions, voice-agent (omit to pick │ - │ interactively) │ + │ live-captions, voice-agent, agent-framework │ + │ (omit to pick interactively) │ │ directory [DIRECTORY] Target directory (default: