From d837427932eb25179f0c99a278e8a83e30faabad Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 13 Jun 2026 15:24:49 +0000 Subject: [PATCH 1/4] Tighten the gate: add ASYNC/LOG/TID/etc. ruff rules + remaining mypy strict flags The codebase already satisfies several ruff rule categories and the rest of mypy's --strict, so enabling them is forward-looking, zero-churn enforcement rather than a mass autofix: - ruff select += ASYNC (blocking calls in the streaming/agent asyncio code), LOG/G (logging anti-patterns), DTZ (naive datetimes), FLY (static join -> f-string), ICN/SLOT (import/__slots__ hygiene), ISC (implicit string concat, the missing-comma-in-a-list bug; ISC001 left to the formatter), and TID with ban-relative-imports="all" so every intra-package import stays absolute and the import-linter contracts stay unambiguous. - mypy src now runs full --strict except disallow_untyped_calls (jiwer ships no stubs, so wer.py's RemovePunctuation() call would force a net-new # type: ignore the escape-hatch gate rejects). The added flags catch incomplete defs, unchecked untyped bodies, untyped decorators, Any-subclassing, and stale config. Tests relax the untyped-body flags (mock plumbing) but keep the rest. The full scripts/check.sh gate passes. --- AGENTS.md | 2 +- pyproject.toml | 54 +++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 52 insertions(+), 4 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index c74f27b2..78d8f161 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -34,7 +34,7 @@ Individual tools (all via `uv run`): ```sh uv run ruff check . # lint uv run ruff format . 
# format (line-length 100) -uv run mypy # files = ["aai_cli", "tests"] from pyproject; strict (disallow_untyped_defs on src) +uv run mypy # files = ["aai_cli", "tests"] from pyproject; src is full --strict bar disallow_untyped_calls (jiwer ships no stubs); tests relax the untyped-body flags prettier --check "aai_cli/init/templates/**/*.{js,css}" # JS/CSS template formatting uv run pytest -q # default unit suite uv run pytest tests/test_transcribe.py -q # a single file diff --git a/pyproject.toml b/pyproject.toml index 2e5bd6df..31fe04ad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -175,11 +175,37 @@ warn_unreachable = true disallow_any_generics = true no_implicit_reexport = true extra_checks = true +# The remaining flags from mypy --strict, which aai_cli already satisfies. They close +# gaps the above leave open: a function with *some* annotations but an unannotated +# arg/return (disallow_incomplete_defs), the body of any still-untyped function going +# unchecked (check_untyped_defs), an untyped decorator silently erasing a function's +# type (disallow_untyped_decorators), subclassing an Any-typed base (disallow_subclassing_any), +# and a config option that no longer matches any file (warn_unused_configs). The one +# strict flag left off is disallow_untyped_calls: jiwer ships no stubs, so wer.py's +# RemovePunctuation() call is unavoidably untyped, and turning it on would force a +# net-new `# type: ignore` the escape-hatch gate rejects. +disallow_incomplete_defs = true +check_untyped_defs = true +disallow_untyped_decorators = true +disallow_subclassing_any = true +warn_unused_configs = true [[tool.mypy.overrides]] -# Tests are type-checked too, but pytest functions don't need return annotations. 
+# Tests are type-checked too, but pytest functions don't need return annotations +# (disallow_untyped_defs/disallow_incomplete_defs), and the bodies of those untyped +# helpers — full of mock plumbing and ad-hoc fixtures — would drown the signal if +# type-checked (check_untyped_defs) or block subclassing untyped test doubles +# (disallow_subclassing_any) / wrapping them in untyped decorators +# (disallow_untyped_decorators). The strict flags stay on for the shipped package. +# With check_untyped_defs off, mypy emits an `annotation-unchecked` note per annotated +# untyped helper; silence those notes so the test output stays signal. module = "tests.*" disallow_untyped_defs = false +disallow_incomplete_defs = false +check_untyped_defs = false +disallow_subclassing_any = false +disallow_untyped_decorators = false +disable_error_code = ["annotation-unchecked"] [tool.pyright] # Second type checker alongside mypy: pyright catches a different class of @@ -211,9 +237,23 @@ extend-exclude = ["aai_cli/_version.py"] # A/N/FBT/PL/T20/PT/PIE/PERF/TCH add maintainability pressure: naming/shadowing, # boolean traps, pylint-style design issues, centralized raw output, pytest style, # small simplifications, performance footguns, and type-only import hygiene. 
+# ASYNC/LOG/G/DTZ/FLY/ICN/SLOT/ISC/TID add correctness pressure the above miss and the +# codebase already satisfies (so they're forward-looking, zero-churn enforcement): +# ASYNC — blocking calls (time.sleep, open(), sync HTTP) inside the streaming/agent +# asyncio code, which would stall the event loop; +# LOG/G — logging anti-patterns (f-strings/`.format` in log calls, `exception()` +# outside handlers) in debuglog and friends; +# DTZ — naive datetime construction (timezone bugs); +# FLY — static `str.join` that should be an f-string (pairs with UP); +# ISC — implicitly concatenated string literals across lines (the classic +# missing-comma-in-a-list bug); ISC001 is owned by the formatter (ignored); +# ICN/SLOT — import-convention and __slots__ hygiene; +# TID — relative imports (banned outright below) so every import is absolute, +# reinforcing the import-linter architecture contracts. select = ["E", "F", "I", "UP", "B", "BLE", "C4", "SIM", "RET", "PTH", "ARG", "S", "RUF", "PGH", "ERA", "TRY", "TD", "FIX", "A", "N", "FBT", "PL", "C90", "T20", "PT", - "PIE", "PERF", "TCH"] + "PIE", "PERF", "TCH", "ASYNC", "LOG", "G", "DTZ", "FLY", "ICN", "SLOT", "ISC", + "TID"] # E501: line length is owned by the formatter. # B008: Typer uses function calls (typer.Option/Argument) as parameter defaults. # S603/S607: we intentionally shell out to `claude`/`npx` with controlled args. @@ -223,13 +263,21 @@ select = ["E", "F", "I", "UP", "B", "BLE", "C4", "SIM", "RET", "PTH", "ARG", "S" # PLC0415: optional/heavy runtime deps are imported lazily to keep startup fast. # TC001-TC003: the project intentionally keeps readable top-level type imports; TC006 # still enforces quoted runtime casts. +# ISC001: single-line implicit string concatenation is managed by the formatter, which +# would otherwise fight this lint (ruff's own recommendation when both are enabled). 
ignore = ["E501", "B008", "S603", "S607", "TRY003", "N818", "PLC0415", - "TC001", "TC002", "TC003"] + "TC001", "TC002", "TC003", "ISC001"] # Function-size pressure, tuned to keep functions small enough to read and edit in # one screen (the friction a coding agent hits most). These complement xenon's # cyclomatic-complexity gate in check.sh: mccabe (C901) and max-branches bound # branchiness; max-statements bounds raw length; max-args bounds signatures. +[tool.ruff.lint.flake8-tidy-imports] +# Every intra-package import is already absolute (`from aai_cli.x import y`); banning +# relative imports outright keeps it that way, which makes modules movable and the +# import-linter contracts unambiguous. +ban-relative-imports = "all" + [tool.ruff.lint.mccabe] max-complexity = 10 # matches xenon's grade-B ceiling (CC <= 10) so the two agree From c6dec33c2e2eb3e61d20575d7782e6a95b4e5928 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 13 Jun 2026 15:39:11 +0000 Subject: [PATCH 2/4] Port two Deno-style gates: banned-api method fencing + unused-fixture check Modeled on denoland/deno's tools/lint.js, which (a) uses per-crate clippy.toml to fence raw std methods to designated crates and (b) fails if any test .out file is unreferenced. - ruff banned-api (TID251) fences raw `subprocess` and `os.environ`/`os.getenv` to the modules that legitimately own them (allowlisted via per-file-ignores): process spawning goes through procs.py, env reads through the config/env layer. A new module reaching for either now trips the gate, so the spread is a visible, reviewable allowlist edit. os.putenv/os.unsetenv are banned outright (they desync os.environ). Tests and scripts/ are exempt; the AST matcher leaves the os.environ snippets inside the code_gen --show-code string templates alone. - scripts/unused_fixtures_gate.py (wired into check.sh) fails on an orphaned .ambr snapshot (no matching test module) or an API fixture no test references. 
The unit suite runs under xdist, which disables syrupy's own unused-snapshot detection, so this closes that blind spot statically with no extra test run. The full scripts/check.sh gate passes. --- AGENTS.md | 2 +- aai_cli/AGENTS.md | 1 + pyproject.toml | 61 ++++++++++++++++++++++++--- scripts/check.sh | 5 +++ scripts/unused_fixtures_gate.py | 74 +++++++++++++++++++++++++++++++++ 5 files changed, 136 insertions(+), 7 deletions(-) create mode 100644 scripts/unused_fixtures_gate.py diff --git a/AGENTS.md b/AGENTS.md index 78d8f161..e701b9a5 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -25,7 +25,7 @@ uv run assembly --help # run the CLI from the locked environment Dev tooling is a PEP 735 `[dependency-groups]` group with `default-groups = ["dev"]`, not a `[project]` extra — `uv sync --extra dev` errors. -`scripts/check.sh` is the authoritative gate; keep this list in sync with it. It runs, in order: `uv lock --check` → `ruff check` → `ruff format --check` → `mypy` → `pyright` (src strict) → `pyright` (tests) → `vulture` (dead code) → `deptry` (dependency hygiene) → `lint-imports` (import-linter architecture contracts) → max-file-length (500 lines) → `xenon` (cyclomatic complexity, max grade B / project avg A) → `swiftlint` + swift compile (macOS only, skipped elsewhere) → `markdownlint` → `prettier` (init template JS/CSS) → `shellcheck` → `actionlint` + `zizmor` (workflow lint/audit) → `gitleaks` (secret scan) → generated `--show-code` compile gate → init template contract gate → `pytest` (90% branch coverage) → `diff-cover` (100% patch coverage vs `origin/main`) → **mutation gate** (diff-scoped: mutates each changed line and reruns the tests that cover it — a surviving mutant fails the gate, so changed lines need assertions that would *fail* if the line broke, not just coverage; suppress a genuinely unassertable line with `# pragma: no mutate`) → a "no new escape hatches" gate (`# type: ignore` / `# noqa` / `pragma: no cover` / `Any` / `cast(` / test skip/xfail/sleep, 
all **count-gated against the merge-base** so moving an existing hatch in a refactor doesn't false-positive but a net-new one fails) → **CodeQL gate** (`scripts/codeql_gate.py`: the same security + quality suites the CodeQL workflow uploads to GitHub's code-scanning/quality tabs, run locally over python/actions/javascript so alerts fail before push instead of on the PR; needs the CodeQL bundle on PATH — self-skips otherwise, `codeql.yml` covers CI, and the web session-start hook provisions it) → `uv build` + `twine check --strict`. The `vulture`/`deptry`/`lint-imports`/`xenon`, patch-coverage, and mutation stages catch the failures that `ruff`+`mypy` alone won't — don't claim the gate is green until the script prints `All checks passed.` +`scripts/check.sh` is the authoritative gate; keep this list in sync with it. It runs, in order: `uv lock --check` → `ruff check` → `ruff format --check` → `mypy` → `pyright` (src strict) → `pyright` (tests) → `vulture` (dead code) → `deptry` (dependency hygiene) → `lint-imports` (import-linter architecture contracts) → max-file-length (500 lines) → `xenon` (cyclomatic complexity, max grade B / project avg A) → `swiftlint` + swift compile (macOS only, skipped elsewhere) → `markdownlint` → `prettier` (init template JS/CSS) → `shellcheck` → `actionlint` + `zizmor` (workflow lint/audit) → `gitleaks` (secret scan) → generated `--show-code` compile gate → init template contract gate → unused snapshot/fixture gate (`scripts/unused_fixtures_gate.py`: orphaned `.ambr`/API fixtures, since xdist disables syrupy's own unused detection) → `pytest` (90% branch coverage) → `diff-cover` (100% patch coverage vs `origin/main`) → **mutation gate** (diff-scoped: mutates each changed line and reruns the tests that cover it — a surviving mutant fails the gate, so changed lines need assertions that would *fail* if the line broke, not just coverage; suppress a genuinely unassertable line with `# pragma: no mutate`) → a "no new escape hatches" gate (`# 
type: ignore` / `# noqa` / `pragma: no cover` / `Any` / `cast(` / test skip/xfail/sleep, all **count-gated against the merge-base** so moving an existing hatch in a refactor doesn't false-positive but a net-new one fails) → **CodeQL gate** (`scripts/codeql_gate.py`: the same security + quality suites the CodeQL workflow uploads to GitHub's code-scanning/quality tabs, run locally over python/actions/javascript so alerts fail before push instead of on the PR; needs the CodeQL bundle on PATH — self-skips otherwise, `codeql.yml` covers CI, and the web session-start hook provisions it) → `uv build` + `twine check --strict`. The `vulture`/`deptry`/`lint-imports`/`xenon`, patch-coverage, and mutation stages catch the failures that `ruff`+`mypy` alone won't — don't claim the gate is green until the script prints `All checks passed.` **Commits are gated.** On success `check.sh` records a working-tree signature (`scripts/gate_marker.py record` → `.git/aai-gate-pass`), and a PreToolUse hook (`.claude/hooks/require-gate-before-commit.sh`) blocks `git commit` unless that signature still matches — so run the full gate to completion *before* committing (a single-file `pytest` does not satisfy it), and re-run it after any further edit. Iterate with the fast targeted commands above, gate once at the end. For a deliberate work-in-progress commit, prefix `AAI_ALLOW_COMMIT=1 git commit …`. diff --git a/aai_cli/AGENTS.md b/aai_cli/AGENTS.md index 1a7191fc..9e3dccee 100644 --- a/aai_cli/AGENTS.md +++ b/aai_cli/AGENTS.md @@ -97,6 +97,7 @@ heavily-reworked commands with long bodies; small commands keep the inline - **`environments.py`** — a frozen `Environment` (api_base, streaming_host, llm_gateway_base, ams_base, stytch_*). `DEFAULT_ENV` is **`production`**; use `--sandbox` (or `--env sandbox000` / `AAI_ENV`) to target the sandbox. The active environment is a process-global set once at startup; precedence: `--env` → `AAI_ENV` → profile's stored env → default. 
A credential is only valid against the environment that minted it. - **`client.py`** — thin wrappers over the `assemblyai` SDK (`transcribe`, `list_transcripts`, `stream_audio`, etc.). It normalizes SDK exceptions: auth failures become a single clean `auth_failure()` `CLIError`; everything else becomes `APIError`. New SDK calls should follow this try/except shape. - **`errors.py`** — the `CLIError` hierarchy (each with `error_type` + `exit_code`). `output.py` emits errors to **stderr**; stdout stays clean for pipelines. `--json` switches to machine-readable output; it is never auto-enabled — `output.resolve_json()` deliberately keeps human text the default even when piped or agent-run. +- **Raw `subprocess` and `os.environ`/`os.getenv` are fenced by ruff `banned-api` (TID251).** Only the modules allowlisted in `pyproject.toml`'s `per-file-ignores` may call them — process spawning is meant to go through `procs.py`, and environment reads through the config/env-resolution layer. A new module reaching for either trips the gate, so adding one is a deliberate, reviewable allowlist edit (the Deno toolchain's per-crate `clippy.toml` model). Tests and `scripts/` are exempt. - **`debuglog.py`** — the root `-v/--verbose` flag (count: `-v` request-level at INFO, `-vv` wire-level at DEBUG). The CLI normally configures no logging, and the realtime paths *silence* library loggers (`ws.py`, `streaming/diagnostics.py`); verbose mode installs one redacting stderr handler and those silencers stand down. Secrets are registered at their resolution choke points (`config.resolve_api_key`, `AppState.resolve_session`) and masked in every rendered record — websockets logs the raw Authorization header at DEBUG, so masking lives in the formatter, not at call sites. Stdlib-only on purpose: `config` (a Rich-free layer) imports it. 
### Feature subsystems diff --git a/pyproject.toml b/pyproject.toml index 31fe04ad..8562e631 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -278,6 +278,20 @@ ignore = ["E501", "B008", "S603", "S607", "TRY003", "N818", "PLC0415", # import-linter contracts unambiguous. ban-relative-imports = "all" +# Disallowed-methods enforcement, modeled on the Deno toolchain's per-crate clippy.toml +# bans (only designated crates may call the fenced std methods). Process spawning and +# raw environment access stay confined to the modules that legitimately own them +# (allowlisted via per-file-ignores below); any *new* module reaching for them trips +# TID251, so adding one is a visible, reviewable edit rather than a silent spread. +# The matcher is AST-based, so the os.environ snippets inside the code_gen --show-code +# exemplars (string literals) don't trip it. +[tool.ruff.lint.flake8-tidy-imports.banned-api] +"subprocess".msg = "Spawn detached children via aai_cli.procs; if a module genuinely needs raw subprocess, add it to the TID251 allowlist in pyproject.toml." +"os.environ".msg = "Resolve configuration through aai_cli.config / aai_cli.context (which centralize precedence and secret handling); env-owning modules are allowlisted for TID251 in pyproject.toml." +"os.getenv".msg = "Use os.environ.get (the single project idiom) via an env-owning module; see the TID251 allowlist in pyproject.toml." +"os.putenv".msg = "os.putenv/os.unsetenv bypass os.environ and desync the mapping; mutate os.environ instead." +"os.unsetenv".msg = "os.putenv/os.unsetenv bypass os.environ and desync the mapping; mutate os.environ instead." + [tool.ruff.lint.mccabe] max-complexity = 10 # matches xenon's grade-B ceiling (CC <= 10) so the two agree @@ -292,9 +306,12 @@ max-statements = 40 # TRY300: test helpers commonly `return` inside a try while asserting on the except path. 
# Tests also keep literal exit codes, local imports, composite assertions, and fake # call signatures where those make the intent clearer than production-style indirection. +# TID251: tests drive the CLI as a subprocess and monkeypatch os.environ freely; the +# banned-api ban targets the shipped aai_cli package, not the test harness or dev gates. "tests/**" = ["S101", "S105", "S106", "S107", "S108", "ARG001", "ARG002", "ARG005", "PTH123", "SIM117", "TRY300", "FBT", "PLR2004", "PLC0415", "PLR0913", - "PLW1510", "N806", "N818", "PLW0108", "PT018", "TCH"] + "PLW1510", "N806", "N818", "PLW0108", "PT018", "TCH", "TID251"] +"scripts/**" = ["TID251"] # Typer command functions naturally have many boolean options and broad signatures # (PLR0913/FBT). Their *bodies*, though, are held to the same length/branch limits as # the rest of the package: PLR0912/PLR0915/C901 are deliberately NOT ignored here. @@ -305,18 +322,50 @@ max-statements = 40 # command signatures do. "aai_cli/options.py" = ["FBT003"] # Raw stdout/stderr writes are centralized here; command modules call output helpers. -"aai_cli/output.py" = ["T201"] +# TID251: output owns the FORCE_COLOR/NO_COLOR env toggles and TTY/agent detection. +"aai_cli/output.py" = ["T201", "TID251"] # The active environment is process-global startup state by design. -"aai_cli/environments.py" = ["PLW0603"] +# TID251: environments.py owns AAI_ENV resolution (an env-owning module). +"aai_cli/environments.py" = ["PLW0603", "TID251"] # Verbosity is process-global startup state by design (mirrors environments.py). "aai_cli/debuglog.py" = ["PLW0603"] # BaseHTTPRequestHandler.log_message requires a parameter named `format`. "aai_cli/auth/loopback.py" = ["A002"] # Template constants include URL path names such as TOKEN_PATH, not credentials. 
-"aai_cli/init/templates/**" = ["S105"] +# TID251: the scaffolds are end-user example apps that read their own config straight +# from os.environ — that's correct, idiomatic code to ship, not a CLI-internal env read. +"aai_cli/init/templates/**" = ["S105", "TID251"] + +# TID251 banned-api allowlist (see [tool.ruff.lint.flake8-tidy-imports.banned-api]). +# These are the only modules permitted raw `subprocess` (process spawning) or raw +# `os.environ`/`os.getenv` (environment access). Splitting the ignore per file keeps the +# blast radius explicit: a new module needing either must be added here in review. +# Process-spawning modules (shell out to claude/npx/ffmpeg/yt-dlp/tunnels/etc.): +"aai_cli/procs.py" = ["TID251"] +"aai_cli/coding_agent.py" = ["TID251"] +"aai_cli/mediafile.py" = ["TID251"] +"aai_cli/setup_exec.py" = ["TID251"] +"aai_cli/commands/deploy/_exec.py" = ["TID251"] +"aai_cli/commands/update.py" = ["TID251"] +"aai_cli/commands/webhooks/_listen.py" = ["TID251"] +"aai_cli/init/runner.py" = ["TID251"] +"aai_cli/init/tunnel.py" = ["TID251"] +"aai_cli/streaming/macos.py" = ["TID251"] +"aai_cli/streaming/sources.py" = ["TID251"] +# Environment-owning modules (config/auth/env resolution; output & environments are +# allowlisted above alongside their existing ignores): +"aai_cli/config.py" = ["TID251"] +"aai_cli/context.py" = ["TID251"] +"aai_cli/update_check.py" = ["TID251"] +"aai_cli/auth/endpoints.py" = ["TID251"] +"aai_cli/init/keys.py" = ["TID251"] +"aai_cli/commands/dev/_exec.py" = ["TID251"] +"aai_cli/commands/share/_exec.py" = ["TID251"] +"aai_cli/commands/evaluate/_hf_api.py" = ["TID251"] # ENV_CLIENT_TOKEN holds an env-var *name*; the shipped token constant is empty in -# source (release builds inject the write-only client token). -"aai_cli/telemetry.py" = ["S105"] +# source (release builds inject the write-only client token). TID251: telemetry reads +# its opt-out / intake-URL / CI-detection env vars (an env-owning module). 
+"aai_cli/telemetry.py" = ["S105", "TID251"] [tool.vulture] paths = ["aai_cli", "tests"] diff --git a/scripts/check.sh b/scripts/check.sh index a89e1f14..773bc3f8 100755 --- a/scripts/check.sh +++ b/scripts/check.sh @@ -170,6 +170,11 @@ trap - EXIT echo "==> init template contract/import gate" uv run python scripts/template_contract_gate.py +echo "==> unused snapshot/fixture gate" +# xdist disables syrupy's own unused-snapshot detection, so a renamed/deleted test can +# leave an orphaned .ambr or recorded API fixture behind. This static check catches it. +uv run python scripts/unused_fixtures_gate.py + echo "==> pytest (with branch-coverage gate)" # Exclude e2e: they drive the CLI as a subprocess (uncounted by coverage) and need # a live API key. Exclude install (real per-template dep install, slow + network). diff --git a/scripts/unused_fixtures_gate.py b/scripts/unused_fixtures_gate.py new file mode 100644 index 00000000..6961f6f7 --- /dev/null +++ b/scripts/unused_fixtures_gate.py @@ -0,0 +1,74 @@ +from __future__ import annotations + +import sys +from pathlib import Path + +# Orphaned-test-artifact gate, modeled on the Deno toolchain's "every `.out` file must be +# referenced by a test" check (tools/lint.js). The unit suite runs under pytest-xdist +# (`-n auto`), which disables syrupy's own unused-snapshot reporting — each worker only +# sees a slice of the snapshots — so a renamed or deleted test can silently leave its +# whole snapshot file or a recorded API fixture behind to rot. This catches that +# statically and fast, with no extra test run. +# +# Two artifact kinds are checked: +# * tests/__snapshots__/<test_module>.ambr — syrupy names a snapshot file after its test +# module, so each `.ambr` must have a matching tests/<test_module>.py. +# * tests/fixtures/api/<stem>.json — replay fixtures are loaded by stem +# (replay_fixtures.load_object("<stem>")), so each must be referenced by name in +# some test module (the loader module itself doesn't count).
+ +REPO_ROOT = Path(__file__).resolve().parent.parent +TESTS_DIR = REPO_ROOT / "tests" +SNAPSHOT_DIR = TESTS_DIR / "__snapshots__" +API_FIXTURE_DIR = TESTS_DIR / "fixtures" / "api" +# The fixture loader names every stem in its own docstring/paths, so it can't count as a +# real reference — only an actual test that loads the fixture should keep it alive. +LOADER_MODULE = "replay_fixtures.py" + + +def _orphaned_snapshots() -> list[Path]: + """`.ambr` files whose owning test module no longer exists.""" + return [ + ambr.relative_to(REPO_ROOT) + for ambr in sorted(SNAPSHOT_DIR.glob("*.ambr")) + if not (TESTS_DIR / f"{ambr.stem}.py").exists() + ] + + +def _test_sources() -> list[str]: + """Bodies of every test module except the fixture loader.""" + return [ + path.read_text(encoding="utf-8") + for path in sorted(TESTS_DIR.rglob("*.py")) + if path.name != LOADER_MODULE + ] + + +def _unreferenced_fixtures() -> list[Path]: + """API fixtures whose stem is never named by a test module.""" + if not API_FIXTURE_DIR.exists(): + return [] + sources = _test_sources() + return [ + fixture.relative_to(REPO_ROOT) + for fixture in sorted(API_FIXTURE_DIR.glob("*.json")) + if not any(fixture.stem in source for source in sources) + ] + + +def main() -> int: + snapshot_orphans = _orphaned_snapshots() + fixture_orphans = _unreferenced_fixtures() + if not snapshot_orphans and not fixture_orphans: + sys.stdout.write("No orphaned snapshots or unreferenced fixtures.\n") + return 0 + for path in snapshot_orphans: + sys.stdout.write(f"Orphaned snapshot (no matching test module): {path}\n") + for path in fixture_orphans: + sys.stdout.write(f"Unreferenced API fixture (no test loads it): {path}\n") + sys.stdout.write("Delete the dead artifact, or wire it back into a test.\n") + return 1 + + +if __name__ == "__main__": + sys.exit(main()) From de7179bdcfa65c1b34829efd4affbe81291fb974 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 13 Jun 2026 15:51:56 +0000 Subject: [PATCH 3/4] Add five more hard 
checks borrowed from other projects' gates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - ruff T10 (flake8-debugger): a forgotten breakpoint()/pdb left in shipped code, the debugger counterpart to the T20 print ban. Zero current violations. - codespell (Kubernetes' verify-spelling, generalized): spell-checks code, comments, and docs. Run via `uvx` in check.sh and as a pre-commit hook, so it needs no uv.lock entry; config + ignore-words in [tool.codespell]. - check-case-conflict + detect-private-key (pre-commit-hooks): cross-OS filename collisions (we ship a macOS bottle) and a literal-private-key guard (defense-in-depth alongside gitleaks). - scripts/docs_consistency_gate.py (curl's "every option is documented", generalized): fails if REFERENCE.md/README.md drift from the code — every env var and exit code must be documented, every `assembly …` example must name a real command. - scripts/docstring_coverage_gate.py: public-API docstring-coverage ratchet, an interrogate stand-in (interrogate can't parse the codebase's PEP 695 generics). Floored at the current 64% so it ratchets up without forcing a backfill. - brew audit --strict on the shipped Formula/assembly.rb (Homebrew's own CI check), self-skipping where brew isn't installed. Refleak double-run (CPython's regrtest -R) was evaluated and skipped: it needs a debug build's sys.gettotalrefcount; the achievable part (ResourceWarning) is already enforced via filterwarnings=error. The full scripts/check.sh gate passes. 
--- .pre-commit-config.yaml | 14 ++++ AGENTS.md | 2 +- pyproject.toml | 16 +++- scripts/check.sh | 31 ++++++++ scripts/docs_consistency_gate.py | 116 +++++++++++++++++++++++++++++ scripts/docstring_coverage_gate.py | 59 +++++++++++++++ 6 files changed, 235 insertions(+), 3 deletions(-) create mode 100644 scripts/docs_consistency_gate.py create mode 100644 scripts/docstring_coverage_gate.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0b6796a7..346b400b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -13,6 +13,20 @@ repos: - id: check-toml - id: check-merge-conflict - id: check-added-large-files + # Cross-OS filename collisions: this repo ships a macOS Homebrew bottle, so two + # paths differing only in case would break a case-insensitive checkout. + - id: check-case-conflict + # Defense-in-depth alongside gitleaks: never commit a literal private key. + - id: detect-private-key + + # Spell-check code, comments, and docs (Kubernetes' verify-spelling, generalized). + # Config (skips + ignore-words) lives in [tool.codespell] in pyproject.toml; check.sh + # runs the same tool via `uvx codespell`. + - repo: https://github.com/codespell-project/codespell + rev: v2.4.2 + hooks: + - id: codespell + additional_dependencies: [tomli] - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.15.16 diff --git a/AGENTS.md b/AGENTS.md index e701b9a5..5224bffe 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -25,7 +25,7 @@ uv run assembly --help # run the CLI from the locked environment Dev tooling is a PEP 735 `[dependency-groups]` group with `default-groups = ["dev"]`, not a `[project]` extra — `uv sync --extra dev` errors. -`scripts/check.sh` is the authoritative gate; keep this list in sync with it. 
It runs, in order: `uv lock --check` → `ruff check` → `ruff format --check` → `mypy` → `pyright` (src strict) → `pyright` (tests) → `vulture` (dead code) → `deptry` (dependency hygiene) → `lint-imports` (import-linter architecture contracts) → max-file-length (500 lines) → `xenon` (cyclomatic complexity, max grade B / project avg A) → `swiftlint` + swift compile (macOS only, skipped elsewhere) → `markdownlint` → `prettier` (init template JS/CSS) → `shellcheck` → `actionlint` + `zizmor` (workflow lint/audit) → `gitleaks` (secret scan) → generated `--show-code` compile gate → init template contract gate → unused snapshot/fixture gate (`scripts/unused_fixtures_gate.py`: orphaned `.ambr`/API fixtures, since xdist disables syrupy's own unused detection) → `pytest` (90% branch coverage) → `diff-cover` (100% patch coverage vs `origin/main`) → **mutation gate** (diff-scoped: mutates each changed line and reruns the tests that cover it — a surviving mutant fails the gate, so changed lines need assertions that would *fail* if the line broke, not just coverage; suppress a genuinely unassertable line with `# pragma: no mutate`) → a "no new escape hatches" gate (`# type: ignore` / `# noqa` / `pragma: no cover` / `Any` / `cast(` / test skip/xfail/sleep, all **count-gated against the merge-base** so moving an existing hatch in a refactor doesn't false-positive but a net-new one fails) → **CodeQL gate** (`scripts/codeql_gate.py`: the same security + quality suites the CodeQL workflow uploads to GitHub's code-scanning/quality tabs, run locally over python/actions/javascript so alerts fail before push instead of on the PR; needs the CodeQL bundle on PATH — self-skips otherwise, `codeql.yml` covers CI, and the web session-start hook provisions it) → `uv build` + `twine check --strict`. 
The `vulture`/`deptry`/`lint-imports`/`xenon`, patch-coverage, and mutation stages catch the failures that `ruff`+`mypy` alone won't — don't claim the gate is green until the script prints `All checks passed.` +`scripts/check.sh` is the authoritative gate; keep this list in sync with it. It runs, in order: `uv lock --check` → `ruff check` → `ruff format --check` → `mypy` → `pyright` (src strict) → `pyright` (tests) → `vulture` (dead code) → `deptry` (dependency hygiene) → `lint-imports` (import-linter architecture contracts) → max-file-length (500 lines) → `xenon` (cyclomatic complexity, max grade B / project avg A) → `swiftlint` + swift compile (macOS only, skipped elsewhere) → `markdownlint` → `codespell` (spell-check code/comments/docs via `uvx`; config in `[tool.codespell]`) → `prettier` (init template JS/CSS) → `shellcheck` → `actionlint` + `zizmor` (workflow lint/audit) → `gitleaks` (secret scan) → generated `--show-code` compile gate → init template contract gate → unused snapshot/fixture gate (`scripts/unused_fixtures_gate.py`: orphaned `.ambr`/API fixtures, since xdist disables syrupy's own unused detection) → docs consistency gate (`scripts/docs_consistency_gate.py`: REFERENCE.md/README.md env vars, exit codes, and `assembly …` command refs stay in sync with the code) → docstring coverage gate (`scripts/docstring_coverage_gate.py`: public-API docstring ratchet, an `interrogate` stand-in that handles PEP 695 generics) → `brew audit --strict` (the shipped `Formula/assembly.rb`; self-skips without Homebrew) → `pytest` (90% branch coverage) → `diff-cover` (100% patch coverage vs `origin/main`) → **mutation gate** (diff-scoped: mutates each changed line and reruns the tests that cover it — a surviving mutant fails the gate, so changed lines need assertions that would *fail* if the line broke, not just coverage; suppress a genuinely unassertable line with `# pragma: no mutate`) → a "no new escape hatches" gate (`# type: ignore` / `# noqa` / `pragma: no cover` / 
`Any` / `cast(` / test skip/xfail/sleep, all **count-gated against the merge-base** so moving an existing hatch in a refactor doesn't false-positive but a net-new one fails) → **CodeQL gate** (`scripts/codeql_gate.py`: the same security + quality suites the CodeQL workflow uploads to GitHub's code-scanning/quality tabs, run locally over python/actions/javascript so alerts fail before push instead of on the PR; needs the CodeQL bundle on PATH — self-skips otherwise, `codeql.yml` covers CI, and the web session-start hook provisions it) → `uv build` + `twine check --strict`. The `vulture`/`deptry`/`lint-imports`/`xenon`, patch-coverage, and mutation stages catch the failures that `ruff`+`mypy` alone won't — don't claim the gate is green until the script prints `All checks passed.` **Commits are gated.** On success `check.sh` records a working-tree signature (`scripts/gate_marker.py record` → `.git/aai-gate-pass`), and a PreToolUse hook (`.claude/hooks/require-gate-before-commit.sh`) blocks `git commit` unless that signature still matches — so run the full gate to completion *before* committing (a single-file `pytest` does not satisfy it), and re-run it after any further edit. Iterate with the fast targeted commands above, gate once at the end. For a deliberate work-in-progress commit, prefix `AAI_ALLOW_COMMIT=1 git commit …`. diff --git a/pyproject.toml b/pyproject.toml index 8562e631..ba0b7d42 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -249,11 +249,13 @@ extend-exclude = ["aai_cli/_version.py"] # missing-comma-in-a-list bug); ISC001 is owned by the formatter (ignored); # ICN/SLOT — import-convention and __slots__ hygiene; # TID — relative imports (banned outright below) so every import is absolute, -# reinforcing the import-linter architecture contracts. +# reinforcing the import-linter architecture contracts; +# T10 — a forgotten breakpoint()/pdb/ipdb left in the shipped code (the debugger +# counterpart to the T20 print ban already selected). 
select = ["E", "F", "I", "UP", "B", "BLE", "C4", "SIM", "RET", "PTH", "ARG", "S", "RUF", "PGH", "ERA", "TRY", "TD", "FIX", "A", "N", "FBT", "PL", "C90", "T20", "PT", "PIE", "PERF", "TCH", "ASYNC", "LOG", "G", "DTZ", "FLY", "ICN", "SLOT", "ISC", - "TID"] + "TID", "T10"] # E501: line length is owned by the formatter. # B008: Typer uses function calls (typer.Option/Argument) as parameter defaults. # S603/S607: we intentionally shell out to `claude`/`npx` with controlled args. @@ -375,6 +377,16 @@ ignore_decorators = ["@app.command", "@app.callback"] ignore_names = ["app", "capture_output", "download", "healthy", "ist", "lpath", "memory_keyring", "org", "preserve_logging_state", "refresh", "rpath"] +[tool.codespell] +# Spell-check code, comments, and docs (Kubernetes' verify-spelling, generalized). Run via +# `uvx codespell` in check.sh and as a pre-commit hook, so it needs no entry in uv.lock. +# Skip generated/binary/snapshot trees and the lockfile; recorded fixtures and snapshots +# are byte-pinned and must not be "corrected". +skip = "./.venv,./dist,./docs,./node_modules,./.git,uv.lock,*.ambr,./tests/fixtures,./aai_cli/_version.py" +# Domain words codespell misreads: "unparseable" (accepted variant), "ist" (an identifier), +# "expresso" (a deliberate mis-transcription used as an eval/WER example). +ignore-words-list = "unparseable,ist,expresso,notin,ans" + [tool.deptry] exclude = ["docs", "dist", ".venv", "aai_cli/init/templates"] diff --git a/scripts/check.sh b/scripts/check.sh index 773bc3f8..e238a58d 100755 --- a/scripts/check.sh +++ b/scripts/check.sh @@ -103,6 +103,16 @@ fi echo "==> markdownlint (docs/ is generated, so excluded)" markdownlint "**/*.md" --ignore docs --ignore node_modules --ignore .pytest_cache +echo "==> codespell (spell-check code, comments, docs)" +# Kubernetes' verify-spelling, generalized. Config (skips + ignore-words) is in +# [tool.codespell] in pyproject.toml. 
Run via uvx (like twine below) so it needs no +# entry in uv.lock; pre-commit also runs it. Self-skips where uvx isn't installed. +if command -v uvx >/dev/null 2>&1; then + uvx codespell . +else + echo " uvx not found; skipping (pre-commit + CI run codespell)" +fi + echo "==> json validity (all tracked + staged *.json)" # Parse every JSON file so a malformed dashboard / vercel.json / fixture fails here # instead of silently downstream (a bad dashboard just won't import). Validity only — @@ -175,6 +185,27 @@ echo "==> unused snapshot/fixture gate" # leave an orphaned .ambr or recorded API fixture behind. This static check catches it. uv run python scripts/unused_fixtures_gate.py +echo "==> docs consistency gate (env vars / exit codes / command refs)" +# curl's "every option is documented" presubmit, generalized: REFERENCE.md/README.md must +# not drift from the code — every env var and exit code is documented, every `assembly …` +# example names a real command. +uv run python scripts/docs_consistency_gate.py + +echo "==> docstring coverage gate (public API ratchet)" +# interrogate can't parse this codebase's PEP 695 generics, so an ast-based ratchet stands +# in: public-API docstring coverage may not drop below the floor in scripts/. +uv run python scripts/docstring_coverage_gate.py + +echo "==> brew audit (Homebrew formula)" +# Lint the formula we ship (Formula/assembly.rb) the way Homebrew's own CI does, so a +# formula regression fails here instead of on the release PR. brew is macOS/Linuxbrew +# only, so this self-skips where it isn't installed (CI's release path has it). +if command -v brew >/dev/null 2>&1; then + brew audit --strict --formula Formula/assembly.rb +else + echo " brew not found; skipping (Homebrew CI / release runner has it)" +fi + echo "==> pytest (with branch-coverage gate)" # Exclude e2e: they drive the CLI as a subprocess (uncounted by coverage) and need # a live API key. Exclude install (real per-template dep install, slow + network). 
diff --git a/scripts/docs_consistency_gate.py b/scripts/docs_consistency_gate.py new file mode 100644 index 00000000..6d495e37 --- /dev/null +++ b/scripts/docs_consistency_gate.py @@ -0,0 +1,116 @@ +from __future__ import annotations + +import re +import sys +from pathlib import Path + +import typer + +from aai_cli.main import app + +# Docs-stay-in-sync gate, in the spirit of curl's "every option is documented" presubmit +# and numpy's refguide-check: the reference doc and the code must not drift apart. Three +# checks, all static and fast: +# 1. Environment-variable parity — every AAI_*/ASSEMBLYAI_* var the code reads is either +# documented in REFERENCE.md or explicitly listed as internal here, and every such +# documented var is actually read (no stale rows). +# 2. Exit-code parity — every numeric exit code the code returns is in REFERENCE.md's +# exit-code table. +# 3. Command-reference validity — every `assembly <cmd> [<sub>]` example in the docs +# names a real command (catches a doc that outlives a rename). + +REPO_ROOT = Path(__file__).resolve().parent.parent +REFERENCE = REPO_ROOT / "REFERENCE.md" +DOC_SOURCES = (REPO_ROOT / "README.md", REFERENCE) +PACKAGE = REPO_ROOT / "aai_cli" + +# Vars the code reads that are deliberately undocumented: telemetry plumbing overrides and +# the scaffold's product-config vars (written into a generated app's .env, not CLI behavior). 
+INTERNAL_VARS = { + "AAI_TELEMETRY_CLIENT_TOKEN", + "AAI_TELEMETRY_INTAKE_URL", + "AAI_MACOS_AUDIO_DEBUG", + "ASSEMBLYAI_BASE_URL", + "ASSEMBLYAI_LLM_GATEWAY_URL", + "ASSEMBLYAI_STREAMING_HOST", + "ASSEMBLYAI_AGENTS_HOST", +} + +_VAR_RE = re.compile(r"\b((?:AAI|ASSEMBLYAI)_[A-Z0-9_]+)\b") +_DOC_VAR_RE = re.compile(r"`((?:AAI|ASSEMBLYAI)_[A-Z0-9_]+)`") +_EXIT_DOC_RE = re.compile(r"\|\s*`(\d+)`\s*\|") +_EXIT_CODE_RE = re.compile(r"exit_code\s*[=:]\s*(\d+)|Exit\(code=(\d+)\)") +_CMD_RE = re.compile(r"\bassembly\s+([a-z][\w-]*)(?:\s+([a-z][\w-]*))?") + + +def _package_sources() -> str: + return "\n".join( + p.read_text(encoding="utf-8") + for p in PACKAGE.rglob("*.py") + if "templates" not in p.parts and p.name != "_version.py" + ) + + +def _env_var_errors() -> list[str]: + code = _package_sources() + code_vars = set(_VAR_RE.findall(code)) + doc_vars = set(_DOC_VAR_RE.findall(REFERENCE.read_text(encoding="utf-8"))) + return [ + f"env var {var} is read in code but not documented in REFERENCE.md" + for var in sorted(code_vars - doc_vars - INTERNAL_VARS) + ] + [ + f"env var {var} is documented in REFERENCE.md but never read in code" + for var in sorted(doc_vars - code_vars - INTERNAL_VARS) + ] + + +def _exit_code_errors() -> list[str]: + documented = {int(m) for m in _EXIT_DOC_RE.findall(REFERENCE.read_text(encoding="utf-8"))} + errors: list[str] = [] + for path in sorted(PACKAGE.rglob("*.py")): + if "templates" in path.parts: + continue + for groups in _EXIT_CODE_RE.findall(path.read_text(encoding="utf-8")): + code = int(next(g for g in groups if g)) + if code not in documented: + rel = path.relative_to(REPO_ROOT) + errors.append(f"exit code {code} used in {rel} is not in REFERENCE.md's table") + return errors + + +def _command_tree() -> tuple[set[str], dict[str, set[str]]]: + root = typer.main.get_command(app) + commands = getattr(root, "commands", {}) + groups = { + name: set(getattr(obj, "commands", {})) + for name, obj in commands.items() + if hasattr(obj, 
"commands") + } + return set(commands), groups + + +def _command_ref_errors() -> list[str]: + top, groups = _command_tree() + errors: list[str] = [] + for doc in DOC_SOURCES: + for cmd, sub in _CMD_RE.findall(doc.read_text(encoding="utf-8")): + if cmd not in top: + errors.append(f"{doc.name}: `assembly {cmd}` names an unknown command") + elif sub and cmd in groups and sub not in groups[cmd]: + errors.append(f"{doc.name}: `assembly {cmd} {sub}` names an unknown subcommand") + return errors + + +def main() -> int: + errors = _env_var_errors() + _exit_code_errors() + _command_ref_errors() + if not errors: + sys.stdout.write("Docs and code agree (env vars, exit codes, command references).\n") + return 0 + for err in errors: + sys.stdout.write(f"{err}\n") + sys.stdout.write("Update REFERENCE.md/README.md (or the INTERNAL_VARS allowlist) to match.\n") + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/docstring_coverage_gate.py b/scripts/docstring_coverage_gate.py new file mode 100644 index 00000000..c78bfde8 --- /dev/null +++ b/scripts/docstring_coverage_gate.py @@ -0,0 +1,59 @@ +from __future__ import annotations + +import ast +import sys +from pathlib import Path + +# Docstring-coverage ratchet for the shipped package's public API, replacing `interrogate` +# (which can't parse this codebase's PEP 695 generics, e.g. `def emit[T](...)`). Public = +# the module plus every non-underscore class/function/method. The FLOOR is set at the +# current level and only ever ratchets up: a change may not drop public-API documentation +# below it, but nobody is forced to backfill the existing gap in one go. Raising FLOOR as +# coverage climbs is a deliberate, reviewed edit here — the same model as a coverage gate. 
+FLOOR = 64.0 + +PACKAGE = Path(__file__).resolve().parent.parent / "aai_cli" + +_Def = (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef) + + +def _public_nodes(tree: ast.Module) -> list[ast.AST]: + nodes: list[ast.AST] = [tree] + nodes.extend(n for n in ast.walk(tree) if isinstance(n, _Def) and not n.name.startswith("_")) + return nodes + + +def _coverage() -> tuple[int, int, list[str]]: + total = documented = 0 + missing: list[str] = [] + for path in sorted(PACKAGE.rglob("*.py")): + if "templates" in path.parts or path.name == "_version.py": + continue + tree = ast.parse(path.read_text(encoding="utf-8")) + for node in _public_nodes(tree): + total += 1 + if ast.get_docstring(node): + documented += 1 + else: + name = getattr(node, "name", "") + missing.append(f"{path.relative_to(PACKAGE.parent)}:{name}") + return documented, total, missing + + +def main() -> int: + documented, total, missing = _coverage() + pct = 100.0 * documented / total if total else 100.0 + if pct + 1e-9 >= FLOOR: + sys.stdout.write(f"Public docstring coverage {pct:.1f}% (>= floor {FLOOR}%).\n") + return 0 + sys.stdout.write( + f"Public docstring coverage {pct:.1f}% fell below the {FLOOR}% floor " + f"({documented}/{total}). Add docstrings to public APIs you touched:\n" + ) + for item in missing[:20]: + sys.stdout.write(f" {item}\n") + return 1 + + +if __name__ == "__main__": + sys.exit(main()) From 3f64752a409e4a36aea7022499c2551f17947c92 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 13 Jun 2026 16:46:20 +0000 Subject: [PATCH 4/4] Exempt tests from DTZ so the merged naive-datetime fixture passes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Merging origin/main brought in tests/test_jsonshape.py, which builds a naive datetime as a deterministic fixture — legitimate in the test suite (TZ is pinned in conftest and time-machine controls the clock), but it tripped the new DTZ rule. 
Tests are already exempt from the other production-correctness lints (S101, PLR2004, …); add DTZ to that list rather than weaken the test or add an inline noqa the escape-hatch gate would reject. DTZ still guards aai_cli. --- pyproject.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index ba0b7d42..964a98f2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -310,9 +310,11 @@ max-statements = 40 # call signatures where those make the intent clearer than production-style indirection. # TID251: tests drive the CLI as a subprocess and monkeypatch os.environ freely; the # banned-api ban targets the shipped aai_cli package, not the test harness or dev gates. +# DTZ: tests build naive datetimes as deterministic fixtures (the suite pins TZ in +# conftest and uses time-machine), so timezone-aware construction isn't required here. "tests/**" = ["S101", "S105", "S106", "S107", "S108", "ARG001", "ARG002", "ARG005", "PTH123", "SIM117", "TRY300", "FBT", "PLR2004", "PLC0415", "PLR0913", - "PLW1510", "N806", "N818", "PLW0108", "PT018", "TCH", "TID251"] + "PLW1510", "N806", "N818", "PLW0108", "PT018", "TCH", "TID251", "DTZ"] "scripts/**" = ["TID251"] # Typer command functions naturally have many boolean options and broad signatures # (PLR0913/FBT). Their *bodies*, though, are held to the same length/branch limits as