From 8debfb9834f1bfcab25f0b65bbfac6d9a68e654d Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 12 Jun 2026 21:44:23 +0000 Subject: [PATCH] Fix merge-queue CodeQL failures and harden agent-session DX - Split the Swift CodeQL analysis into codeql-swift.yml: path-filtered to Swift changes, no merge_group trigger. The ~25-minute traced macOS build outlived the merge-queue ref on every fast merge, failing the SARIF upload with "ref not found" on 7 consecutive queued PRs, and burned ~25 macOS runner-minutes per PR for a file that rarely changes. The push-to-main run scans the identical merge result; the weekly sweep keeps the baseline fresh. - Promote the debuglog tests' logging snapshot/restore into a shared preserve_logging_state conftest fixture so the next module that touches process-global logging doesn't rediscover the pytest-randomly order dependence that cost PR #125 a red CI round. - AGENTS.md: note the shared fixture, and tell concurrent sessions to check open PRs / recent main commits before starting a fix (PR #89 was closed as an exact duplicate of #90). https://claude.ai/code/session_019CwN6T2ztwvNARvj3z39sP --- .github/workflows/codeql-swift.yml | 78 ++++++++++++++++++++++++++++++ .github/workflows/codeql.yml | 39 ++++----------- AGENTS.md | 9 ++++ pyproject.toml | 2 +- tests/conftest.py | 28 +++++++++++ tests/test_debuglog.py | 25 ++-------- 6 files changed, 129 insertions(+), 52 deletions(-) create mode 100644 .github/workflows/codeql-swift.yml diff --git a/.github/workflows/codeql-swift.yml b/.github/workflows/codeql-swift.yml new file mode 100644 index 00000000..c6cec3c0 --- /dev/null +++ b/.github/workflows/codeql-swift.yml @@ -0,0 +1,78 @@ +name: CodeQL Swift + +# The Swift analysis is split out of codeql.yml because it is two orders of +# magnitude slower than the interpreted-language scans: CodeQL must observe a +# real `swiftc` build (autobuild can't discover a bare helper script with no +# Xcode/SwiftPM project), and the traced build of the macOS system-audio +# helper takes ~25 minutes on a macOS runner. +# +# Triggers are deliberately narrower than codeql.yml: +# - path-filtered to the Swift source (and this workflow), so the ~25 macOS +# runner-minutes are only spent when Swift code can actually change the +# result; +# - no merge_group: the merge-queue ref is deleted the moment the PR merges, +# so any scan slower than the queue fails its SARIF upload with +# "ref not found" (this failed on 7 consecutive queued PRs). The push run +# on main scans the identical merge result instead; +# - the weekly sweep keeps the default-branch baseline fresh when new +# queries ship between Swift changes. +on: + pull_request: + branches: [main] + paths: + - "**/*.swift" + - ".github/workflows/codeql-swift.yml" + push: + branches: [main] + paths: + - "**/*.swift" + - ".github/workflows/codeql-swift.yml" + schedule: + - cron: "29 15 * * 2" + +# Least privilege at the workflow level; the analyze job opts into the extra +# scopes CodeQL needs. Actions are pinned to commit SHAs (a moved tag can't +# silently change what runs); Dependabot keeps them current. +permissions: + contents: read + +# Cancel superseded runs when new commits land on a PR/branch, but never cancel +# a main run (don't drop the scan that updates the default-branch baseline). +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} + +jobs: + analyze: + name: analyze (swift) + runs-on: macos-latest + timeout-minutes: 45 + permissions: + security-events: write # upload SARIF results to code scanning + actions: read # workflow metadata for run context on private repos + contents: read + steps: + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + with: + persist-credentials: false # no job pushes; don't leave the token in .git/config + + - name: Initialize CodeQL + uses: github/codeql-action/init@8aad20d150bbac5944a9f9d289da16a4b0d87c1e # v4.36.2 + with: + languages: swift + build-mode: manual + + # The same swiftc invocation scripts/check.sh uses. + - name: Build Swift audio helper + run: | + swiftc -parse-as-library aai_cli/streaming/macos_system_audio.swift \ + -framework ScreenCaptureKit \ + -framework AVFoundation \ + -framework CoreMedia \ + -framework CoreGraphics \ + -o /tmp/aai-macos-audio-codeql + + - name: Analyze + uses: github/codeql-action/analyze@8aad20d150bbac5944a9f9d289da16a4b0d87c1e # v4.36.2 + with: + category: /language:swift diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 8e12dbf1..2c3294e8 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -27,7 +27,7 @@ concurrency: jobs: analyze: name: analyze (${{ matrix.language }}) - runs-on: ${{ matrix.os }} + runs-on: ubuntu-latest timeout-minutes: 30 permissions: security-events: write # upload SARIF results to code scanning @@ -38,24 +38,13 @@ jobs: matrix: # python: the CLI itself; actions: the workflows in .github/workflows; # javascript-typescript: the committed `assembly init` template JS. - # Those three are interpreted languages, so build-mode none suffices. - # swift: the macOS system-audio helper. Swift is compiled, so CodeQL - # must observe a real build — and autobuild can't discover a bare - # helper script with no Xcode/SwiftPM project, so the build is manual - # (the same swiftc invocation scripts/check.sh uses) on a macOS runner. - include: - - language: python - os: ubuntu-latest - build-mode: none - - language: actions - os: ubuntu-latest - build-mode: none - - language: javascript-typescript - os: ubuntu-latest - build-mode: none - - language: swift - os: macos-latest - build-mode: manual + # All three are interpreted languages, so build-mode none suffices and + # each analysis finishes in about a minute. The Swift helper needs a + # ~25-minute traced build on a macOS runner, so it lives in + # codeql-swift.yml, path-filtered to Swift changes and kept out of the + # merge queue (the queue ref is deleted as soon as the PR merges, which + # makes any scan slower than the queue fail its SARIF upload). + language: [python, actions, javascript-typescript] steps: - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 with: @@ -65,17 +54,7 @@ jobs: uses: github/codeql-action/init@8aad20d150bbac5944a9f9d289da16a4b0d87c1e # v4.36.2 with: languages: ${{ matrix.language }} - build-mode: ${{ matrix.build-mode }} - - - name: Build Swift audio helper - if: matrix.build-mode == 'manual' - run: | - swiftc -parse-as-library aai_cli/streaming/macos_system_audio.swift \ - -framework ScreenCaptureKit \ - -framework AVFoundation \ - -framework CoreMedia \ - -framework CoreGraphics \ - -o /tmp/aai-macos-audio-codeql + build-mode: none - name: Analyze uses: github/codeql-action/analyze@8aad20d150bbac5944a9f9d289da16a4b0d87c1e # v4.36.2 diff --git a/AGENTS.md b/AGENTS.md index 9e69043e..4692f36e 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -95,11 +95,20 @@ Lessons that cost iterations getting the patch-coverage and mutation tail gates with "No such option"; it's `assembly transcribe … --json`. (The root callback still sniffs the whole token list via `argscan.requests_json`, so a callback-level failure like a bad `--env` keeps the JSON error shape — but the flag itself lives on the subcommand.) +- **Tests that touch global logging state must snapshot/restore it** — root handlers/level + and per-logger levels are process-global, so a leak only fails on some pytest-randomly + seeds (green locally, red in CI). Opt in to the shared `preserve_logging_state` conftest + fixture (it also resets the websockets wire loggers a silencer test may have clamped) + instead of hand-rolling the snapshot per module. ### Manual QA / running the CLI in sandboxed sessions Lessons that cost time in agent sessions — read before exercising `uv run assembly` by hand: +- **Check for in-flight duplicates before starting a fix.** Sessions run concurrently: + before implementing a bug fix or small feature, scan open PRs and the last few + `origin/main` commits touching the same files (two sessions once shipped the identical + fix; the slower PR was closed as redundant). Seconds of checking beats a discarded PR. - **Web/remote containers are fully provisioned at session start** (`.claude/hooks/session-start.sh`): system deps, `markdownlint`/`prettier`, and the Go gate binaries (`actionlint`, `gitleaks`) are installed at CI's pinned versions, so diff --git a/pyproject.toml b/pyproject.toml index 1a644852..aa18c2e3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -272,7 +272,7 @@ exclude = ["aai_cli/_version.py"] min_confidence = 90 ignore_decorators = ["@app.command", "@app.callback"] ignore_names = ["app", "capture_output", "download", "healthy", "ist", "memory_keyring", "org", - "refresh"] + "preserve_logging_state", "refresh"] [tool.deptry] exclude = ["docs", "dist", ".venv", "aai_cli/init/templates"] diff --git a/tests/conftest.py b/tests/conftest.py index 948de2e7..85cb47ca 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -97,6 +97,34 @@ def fixed_render_size(monkeypatch): monkeypatch.setenv("LINES", "40") +@pytest.fixture +def preserve_logging_state(): + # Logging is process-global: root handlers and level, plus per-logger levels. + # A test that enables verbose diagnostics (debuglog.enable) or trips the + # realtime silencers would otherwise leak that state into unrelated tests — + # an order dependence pytest-randomly only exposes on some seeds (it cost a + # red CI round on PR #125). Named (not autouse): modules that touch global + # logging opt in. The websockets wire loggers are reset to NOTSET up front + # so a CRITICAL clamp left by an earlier test can't swallow records the + # opting test asserts on. + import logging + + from aai_cli import ws as wsutil + + root = logging.getLogger() + previous_handlers = list(root.handlers) + previous_level = root.level + wire_loggers = [logging.getLogger(name) for name in wsutil.WEBSOCKETS_LOGGERS] + previous_wire_levels = [logger.level for logger in wire_loggers] + for logger in wire_loggers: + logger.setLevel(logging.NOTSET) + yield + root.handlers[:] = previous_handlers + root.setLevel(previous_level) + for logger, level in zip(wire_loggers, previous_wire_levels, strict=True): + logger.setLevel(level) + + @pytest.fixture(autouse=True) def reset_active_environment(): # The active environment is a process-global (set at CLI startup); pin it to diff --git a/tests/test_debuglog.py b/tests/test_debuglog.py index 5d299365..d6a97bad 100644 --- a/tests/test_debuglog.py +++ b/tests/test_debuglog.py @@ -11,7 +11,6 @@ from typer.testing import CliRunner from aai_cli import config, debuglog -from aai_cli import ws as wsutil from aai_cli.context import AppState from aai_cli.main import app @@ -19,28 +18,12 @@ @pytest.fixture(autouse=True) -def reset_debuglog(monkeypatch): - # enable() mutates process-global state (the root logger and the module's - # verbosity/secret registries); snapshot and restore so pytest-randomly - # ordering can't leak a verbose run into unrelated tests. - root = logging.getLogger() - previous_handlers = list(root.handlers) - previous_level = root.level - # Logger levels are process-global too: any earlier test that exercised the - # realtime silencers left the websockets loggers clamped at CRITICAL, which - # would swallow the wire-level records asserted here. Reset them so these - # tests are order-independent under pytest-randomly, then restore. - wire_loggers = [logging.getLogger(name) for name in wsutil.WEBSOCKETS_LOGGERS] - previous_wire_levels = [logger.level for logger in wire_loggers] - for logger in wire_loggers: - logger.setLevel(logging.NOTSET) +def reset_debuglog(preserve_logging_state, monkeypatch): + # The shared conftest fixture snapshots the process-global logging state + # (root handlers/level, websockets wire-logger levels); enable() also + # mutates the module's own verbosity/secret registries, reset here. monkeypatch.setattr(debuglog, "_verbosity", 0) monkeypatch.setattr(debuglog, "_secrets", set()) - yield - root.handlers[:] = previous_handlers - root.setLevel(previous_level) - for logger, level in zip(wire_loggers, previous_wire_levels, strict=True): - logger.setLevel(level) def test_enable_zero_is_the_everyday_no_op():