From 8debfb9834f1bfcab25f0b65bbfac6d9a68e654d Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 12 Jun 2026 21:44:23 +0000
Subject: [PATCH] Fix merge-queue CodeQL failures and harden agent-session DX

- Split the Swift CodeQL analysis into codeql-swift.yml: path-filtered to
  Swift changes, no merge_group trigger. The ~25-minute traced macOS build
  outlived the merge-queue ref on every fast merge, failing the SARIF
  upload with "ref not found" on 7 consecutive queued PRs, and burned
  ~25 macOS runner-minutes per PR for a file that rarely changes. The
  push-to-main run scans the identical merge result; the weekly sweep
  keeps the baseline fresh.
- Promote the debuglog tests' logging snapshot/restore into a shared
  preserve_logging_state conftest fixture so the next module that touches
  process-global logging doesn't rediscover the pytest-randomly order
  dependence that cost PR #125 a red CI round.
- AGENTS.md: note the shared fixture, and tell concurrent sessions to
  check open PRs / recent main commits before starting a fix (PR #89 was
  closed as an exact duplicate of #90).

https://claude.ai/code/session_019CwN6T2ztwvNARvj3z39sP
---
 .github/workflows/codeql-swift.yml | 78 ++++++++++++++++++++++++++++++
 .github/workflows/codeql.yml       | 39 ++++-----------
 AGENTS.md                          |  9 ++++
 pyproject.toml                     |  2 +-
 tests/conftest.py                  | 28 +++++++++++
 tests/test_debuglog.py             | 25 ++--------
 6 files changed, 129 insertions(+), 52 deletions(-)
 create mode 100644 .github/workflows/codeql-swift.yml

diff --git a/.github/workflows/codeql-swift.yml b/.github/workflows/codeql-swift.yml
new file mode 100644
index 00000000..c6cec3c0
--- /dev/null
+++ b/.github/workflows/codeql-swift.yml
@@ -0,0 +1,78 @@
+name: CodeQL Swift
+
+# The Swift analysis is split out of codeql.yml because it is two orders of
+# magnitude slower than the interpreted-language scans: CodeQL must observe a
+# real `swiftc` build (autobuild can't discover a bare helper script with no
+# Xcode/SwiftPM project), and the traced build of the macOS system-audio
+# helper takes ~25 minutes on a macOS runner.
+#
+# Triggers are deliberately narrower than codeql.yml:
+#   - path-filtered to the Swift source (and this workflow), so the ~25 macOS
+#     runner-minutes are only spent when Swift code can actually change the
+#     result;
+#   - no merge_group: the merge-queue ref is deleted the moment the PR merges,
+#     so any scan slower than the queue fails its SARIF upload with
+#     "ref not found" (this failed on 7 consecutive queued PRs). The push run
+#     on main scans the identical merge result instead;
+#   - the weekly sweep keeps the default-branch baseline fresh when new
+#     queries ship between Swift changes.
+on:
+  pull_request:
+    branches: [main]
+    paths:
+      - "**/*.swift"
+      - ".github/workflows/codeql-swift.yml"
+  push:
+    branches: [main]
+    paths:
+      - "**/*.swift"
+      - ".github/workflows/codeql-swift.yml"
+  schedule:
+    - cron: "29 15 * * 2"
+
+# Least privilege at the workflow level; the analyze job opts into the extra
+# scopes CodeQL needs. Actions are pinned to commit SHAs (a moved tag can't
+# silently change what runs); Dependabot keeps them current.
+permissions:
+  contents: read
+
+# Cancel superseded runs when new commits land on a PR/branch, but never cancel
+# a main run (don't drop the scan that updates the default-branch baseline).
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+
+jobs:
+  analyze:
+    name: analyze (swift)
+    runs-on: macos-latest
+    timeout-minutes: 45
+    permissions:
+      security-events: write  # upload SARIF results to code scanning
+      actions: read           # workflow metadata for run context on private repos
+      contents: read
+    steps:
+      - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
+        with:
+          persist-credentials: false  # no job pushes; don't leave the token in .git/config
+
+      - name: Initialize CodeQL
+        uses: github/codeql-action/init@8aad20d150bbac5944a9f9d289da16a4b0d87c1e # v4.36.2
+        with:
+          languages: swift
+          build-mode: manual
+
+      # The same swiftc invocation scripts/check.sh uses.
+      - name: Build Swift audio helper
+        run: |
+          swiftc -parse-as-library aai_cli/streaming/macos_system_audio.swift \
+            -framework ScreenCaptureKit \
+            -framework AVFoundation \
+            -framework CoreMedia \
+            -framework CoreGraphics \
+            -o /tmp/aai-macos-audio-codeql
+
+      - name: Analyze
+        uses: github/codeql-action/analyze@8aad20d150bbac5944a9f9d289da16a4b0d87c1e # v4.36.2
+        with:
+          category: /language:swift
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 8e12dbf1..2c3294e8 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -27,7 +27,7 @@ concurrency:
 jobs:
   analyze:
     name: analyze (${{ matrix.language }})
-    runs-on: ${{ matrix.os }}
+    runs-on: ubuntu-latest
     timeout-minutes: 30
     permissions:
       security-events: write  # upload SARIF results to code scanning
@@ -38,24 +38,13 @@ jobs:
       matrix:
         # python: the CLI itself; actions: the workflows in .github/workflows;
         # javascript-typescript: the committed `assembly init` template JS.
-        # Those three are interpreted languages, so build-mode none suffices.
-        # swift: the macOS system-audio helper. Swift is compiled, so CodeQL
-        # must observe a real build — and autobuild can't discover a bare
-        # helper script with no Xcode/SwiftPM project, so the build is manual
-        # (the same swiftc invocation scripts/check.sh uses) on a macOS runner.
-        include:
-          - language: python
-            os: ubuntu-latest
-            build-mode: none
-          - language: actions
-            os: ubuntu-latest
-            build-mode: none
-          - language: javascript-typescript
-            os: ubuntu-latest
-            build-mode: none
-          - language: swift
-            os: macos-latest
-            build-mode: manual
+        # All three are interpreted languages, so build-mode none suffices and
+        # each analysis finishes in about a minute. The Swift helper needs a
+        # ~25-minute traced build on a macOS runner, so it lives in
+        # codeql-swift.yml, path-filtered to Swift changes and kept out of the
+        # merge queue (the queue ref is deleted as soon as the PR merges, which
+        # makes any scan slower than the queue fail its SARIF upload).
+        language: [python, actions, javascript-typescript]
     steps:
       - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
         with:
@@ -65,17 +54,7 @@ jobs:
         uses: github/codeql-action/init@8aad20d150bbac5944a9f9d289da16a4b0d87c1e # v4.36.2
         with:
           languages: ${{ matrix.language }}
-          build-mode: ${{ matrix.build-mode }}
-
-      - name: Build Swift audio helper
-        if: matrix.build-mode == 'manual'
-        run: |
-          swiftc -parse-as-library aai_cli/streaming/macos_system_audio.swift \
-            -framework ScreenCaptureKit \
-            -framework AVFoundation \
-            -framework CoreMedia \
-            -framework CoreGraphics \
-            -o /tmp/aai-macos-audio-codeql
+          build-mode: none
 
       - name: Analyze
         uses: github/codeql-action/analyze@8aad20d150bbac5944a9f9d289da16a4b0d87c1e # v4.36.2
diff --git a/AGENTS.md b/AGENTS.md
index 9e69043e..4692f36e 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -95,11 +95,20 @@ Lessons that cost iterations getting the patch-coverage and mutation tail gates
   with "No such option"; it's `assembly transcribe … --json`. (The root callback still sniffs the
   whole token list via `argscan.requests_json`, so a callback-level failure like a bad
   `--env` keeps the JSON error shape — but the flag itself lives on the subcommand.)
+- **Tests that touch global logging state must snapshot/restore it** — root handlers/level
+  and per-logger levels are process-global, so a leak only fails on some pytest-randomly
+  seeds (green locally, red in CI). Opt in to the shared `preserve_logging_state` conftest
+  fixture (it also resets the websockets wire loggers a silencer test may have clamped)
+  instead of hand-rolling the snapshot per module.
 
 ### Manual QA / running the CLI in sandboxed sessions
 
 Lessons that cost time in agent sessions — read before exercising `uv run assembly` by hand:
 
+- **Check for in-flight duplicates before starting a fix.** Sessions run concurrently:
+  before implementing a bug fix or small feature, scan open PRs and the last few
+  `origin/main` commits touching the same files (two sessions once shipped the identical
+  fix; the slower PR was closed as redundant). Seconds of checking beats a discarded PR.
 - **Web/remote containers are fully provisioned at session start**
   (`.claude/hooks/session-start.sh`): system deps, `markdownlint`/`prettier`, and the Go
   gate binaries (`actionlint`, `gitleaks`) are installed at CI's pinned versions, so
diff --git a/pyproject.toml b/pyproject.toml
index 1a644852..aa18c2e3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -272,7 +272,7 @@ exclude = ["aai_cli/_version.py"]
 min_confidence = 90
 ignore_decorators = ["@app.command", "@app.callback"]
 ignore_names = ["app", "capture_output", "download", "healthy", "ist", "memory_keyring", "org",
-                "refresh"]
+                "preserve_logging_state", "refresh"]
 
 [tool.deptry]
 exclude = ["docs", "dist", ".venv", "aai_cli/init/templates"]
diff --git a/tests/conftest.py b/tests/conftest.py
index 948de2e7..85cb47ca 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -97,6 +97,34 @@ def fixed_render_size(monkeypatch):
     monkeypatch.setenv("LINES", "40")
 
 
+@pytest.fixture
+def preserve_logging_state():
+    # Logging is process-global: root handlers and level, plus per-logger levels.
+    # A test that enables verbose diagnostics (debuglog.enable) or trips the
+    # realtime silencers would otherwise leak that state into unrelated tests —
+    # an order dependence pytest-randomly only exposes on some seeds (it cost a
+    # red CI round on PR #125). Named (not autouse): modules that touch global
+    # logging opt in. The websockets wire loggers are reset to NOTSET up front
+    # so a CRITICAL clamp left by an earlier test can't swallow records the
+    # opting test asserts on.
+    import logging
+
+    from aai_cli import ws as wsutil
+
+    root = logging.getLogger()
+    previous_handlers = list(root.handlers)
+    previous_level = root.level
+    wire_loggers = [logging.getLogger(name) for name in wsutil.WEBSOCKETS_LOGGERS]
+    previous_wire_levels = [logger.level for logger in wire_loggers]
+    for logger in wire_loggers:
+        logger.setLevel(logging.NOTSET)
+    yield
+    root.handlers[:] = previous_handlers
+    root.setLevel(previous_level)
+    for logger, level in zip(wire_loggers, previous_wire_levels, strict=True):
+        logger.setLevel(level)
+
+
 @pytest.fixture(autouse=True)
 def reset_active_environment():
     # The active environment is a process-global (set at CLI startup); pin it to
diff --git a/tests/test_debuglog.py b/tests/test_debuglog.py
index 5d299365..d6a97bad 100644
--- a/tests/test_debuglog.py
+++ b/tests/test_debuglog.py
@@ -11,7 +11,6 @@
 from typer.testing import CliRunner
 
 from aai_cli import config, debuglog
-from aai_cli import ws as wsutil
 from aai_cli.context import AppState
 from aai_cli.main import app
 
@@ -19,28 +18,12 @@
 
 
 @pytest.fixture(autouse=True)
-def reset_debuglog(monkeypatch):
-    # enable() mutates process-global state (the root logger and the module's
-    # verbosity/secret registries); snapshot and restore so pytest-randomly
-    # ordering can't leak a verbose run into unrelated tests.
-    root = logging.getLogger()
-    previous_handlers = list(root.handlers)
-    previous_level = root.level
-    # Logger levels are process-global too: any earlier test that exercised the
-    # realtime silencers left the websockets loggers clamped at CRITICAL, which
-    # would swallow the wire-level records asserted here. Reset them so these
-    # tests are order-independent under pytest-randomly, then restore.
-    wire_loggers = [logging.getLogger(name) for name in wsutil.WEBSOCKETS_LOGGERS]
-    previous_wire_levels = [logger.level for logger in wire_loggers]
-    for logger in wire_loggers:
-        logger.setLevel(logging.NOTSET)
+def reset_debuglog(preserve_logging_state, monkeypatch):
+    # The shared conftest fixture snapshots the process-global logging state
+    # (root handlers/level, websockets wire-logger levels); enable() also
+    # mutates the module's own verbosity/secret registries, reset here.
     monkeypatch.setattr(debuglog, "_verbosity", 0)
     monkeypatch.setattr(debuglog, "_secrets", set())
-    yield
-    root.handlers[:] = previous_handlers
-    root.setLevel(previous_level)
-    for logger, level in zip(wire_loggers, previous_wire_levels, strict=True):
-        logger.setLevel(level)
 
 
 def test_enable_zero_is_the_everyday_no_op():