From 27361d94537af4695c8059851f5000dcecca14bc Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 19 Feb 2026 18:07:33 +0000 Subject: [PATCH 01/15] Archive REVIEW_FU-P12-T1-3_multi_client_widgets_v2 report Move post-fix review for FU-P12-T1-3 to _Historical/ and register it in ARCHIVE/INDEX.md. --- SPECS/ARCHIVE/INDEX.md | 2 ++ .../_Historical}/REVIEW_FU-P12-T1-3_multi_client_widgets_v2.md | 0 2 files changed, 2 insertions(+) rename SPECS/{INPROGRESS => ARCHIVE/_Historical}/REVIEW_FU-P12-T1-3_multi_client_widgets_v2.md (100%) diff --git a/SPECS/ARCHIVE/INDEX.md b/SPECS/ARCHIVE/INDEX.md index aa9db1b8..24d063df 100644 --- a/SPECS/ARCHIVE/INDEX.md +++ b/SPECS/ARCHIVE/INDEX.md @@ -214,6 +214,7 @@ | [REVIEW_FU-P12-T3-2_error_code_csv_export.md](_Historical/REVIEW_FU-P12-T3-2_error_code_csv_export.md) | Review report for FU-P12-T3-2 | | [REVIEW_FU-P13-T10_broker_daemon_entrypoint.md](_Historical/REVIEW_FU-P13-T10_broker_daemon_entrypoint.md) | Review report for FU-P13-T10 | | [REVIEW_FU-P13-T11_integer_id_fidelity.md](_Historical/REVIEW_FU-P13-T11_integer_id_fidelity.md) | Review report for FU-P13-T11 | +| [REVIEW_FU-P12-T1-3_multi_client_widgets_v2.md](_Historical/REVIEW_FU-P12-T1-3_multi_client_widgets_v2.md) | Second review report for FU-P12-T1-3 (post-fix verification) | ## Archive Log @@ -382,3 +383,4 @@ | 2026-02-19 | FU-P13-T10 | Archived REVIEW_FU-P13-T10_broker_daemon_entrypoint report | | 2026-02-19 | FU-P13-T11 | Archived Preserve_JSON-RPC_numeric_request_ID_fidelity_in_broker_transport (PASS) | | 2026-02-19 | FU-P13-T11 | Archived REVIEW_FU-P13-T11_integer_id_fidelity report | +| 2026-02-19 | FU-P12-T1-3 | Archived REVIEW_FU-P12-T1-3_multi_client_widgets_v2 report (post-fix review) | diff --git a/SPECS/INPROGRESS/REVIEW_FU-P12-T1-3_multi_client_widgets_v2.md b/SPECS/ARCHIVE/_Historical/REVIEW_FU-P12-T1-3_multi_client_widgets_v2.md similarity index 100% rename from SPECS/INPROGRESS/REVIEW_FU-P12-T1-3_multi_client_widgets_v2.md rename to SPECS/ARCHIVE/_Historical/REVIEW_FU-P12-T1-3_multi_client_widgets_v2.md From 29e432d7dc5be9967e05b66519efd956ac5ba4c2 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 19 Feb 2026 18:08:06 +0000 Subject: [PATCH 02/15] Select task FU-P13-T12: Enforce local Unix-socket security boundary for broker clients --- SPECS/INPROGRESS/next.md | 24 ++++++++++-------------- SPECS/Workplan.md | 2 +- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/SPECS/INPROGRESS/next.md b/SPECS/INPROGRESS/next.md index 3b146948..78d87824 100644 --- a/SPECS/INPROGRESS/next.md +++ b/SPECS/INPROGRESS/next.md @@ -1,19 +1,15 @@ -# No Active Task +# Next Task: FU-P13-T12 — Enforce local Unix-socket security boundary for broker clients -## Recently Archived +**Priority:** P1 +**Phase:** Phase 13 — Persistent Broker & Shared Xcode Session +**Effort:** 2–3h +**Dependencies:** P13-T1 (✅), P13-T3 (✅) +**Status:** Selected -- 2026-02-19 — FU-P13-T11: Preserve JSON-RPC numeric request ID fidelity in broker transport (PASS) -- 2026-02-19 — FU-P13-T10: Implement explicit broker daemon entrypoint and operational CLI flows (PASS) -- 2026-02-19 — FU-P12-T3-1: Document unused `error_message` parameter in `MetricsCollector.record_response` (PASS) -- 2026-02-19 — FU-P12-T1-6: Uniform HTML escaping in `renderClientWidgets` (PASS) -- 2026-02-19 — FU-P12-T1-5: Cap `_clients` dict and prune `client_identities` to prevent unbounded growth (PASS) -- 2026-02-19 — FU-P12-T1-4: Make `IN FLIGHT` KPI reflect real in-flight requests in shared-metrics mode (PASS) -- 2026-02-19 — FU-P12-T3-2: Add `error_code` column to audit CSV export (PASS) +## Description -## Suggested Next Tasks +Implement same-UID peer credential verification for broker Unix-socket clients and enforce owner-only socket permissions (`0600`), aligning runtime behavior with the P13-T1 ADR security decisions. Connections from different-UID processes must be rejected without affecting active sessions. -- FU-P13-T12: Enforce local Unix-socket security boundary for broker clients (P1) -- FU-P13-T13: Make broker startup transactional when transport bind/start fails (P1) -- FU-P13-T14: Complete interactive Xcode prompt verification and close P13-T5 (P1) +## Next Step -Pending follow-up backlog from review: `3` open tasks. +Run the PLAN command to generate the implementation-ready PRD. diff --git a/SPECS/Workplan.md b/SPECS/Workplan.md index 1d34f6e2..df226a7f 100644 --- a/SPECS/Workplan.md +++ b/SPECS/Workplan.md @@ -2223,7 +2223,7 @@ Phase 9 Follow-up Backlog --- -#### ⬜️ FU-P13-T12: Enforce local Unix-socket security boundary for broker clients +#### 🔄 FU-P13-T12: Enforce local Unix-socket security boundary for broker clients **INPROGRESS** - **Description:** Implement same-UID peer credential verification for broker socket clients and enforce owner-only socket permissions, aligning runtime behavior with P13-T1 ADR security decisions. - **Priority:** P1 - **Dependencies:** P13-T1, P13-T3 From 8788277873bbe0dcc04db7a21a4c30e055384d43 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 19 Feb 2026 18:32:04 +0000 Subject: [PATCH 03/15] Plan task FU-P13-T12: Enforce local Unix-socket security boundary for broker clients --- ...et_security_boundary_for_broker_clients.md | 206 ++++++++++++++++++ 1 file changed, 206 insertions(+) create mode 100644 SPECS/INPROGRESS/FU-P13-T12_Enforce_local_Unix-socket_security_boundary_for_broker_clients.md diff --git a/SPECS/INPROGRESS/FU-P13-T12_Enforce_local_Unix-socket_security_boundary_for_broker_clients.md b/SPECS/INPROGRESS/FU-P13-T12_Enforce_local_Unix-socket_security_boundary_for_broker_clients.md new file mode 100644 index 00000000..9b12c69d --- /dev/null +++ b/SPECS/INPROGRESS/FU-P13-T12_Enforce_local_Unix-socket_security_boundary_for_broker_clients.md @@ -0,0 +1,206 @@ +# PRD: FU-P13-T12 — Enforce local Unix-socket security boundary for broker clients + +**Status:** INPROGRESS +**Priority:** P1 +**Phase:** Phase 13 — Persistent Broker & Shared Xcode Session +**Dependencies:** P13-T1 (✅), P13-T3 (✅) + +--- + +## 1. Objective + +Harden the broker's `UnixSocketServer` so that: + +1. The socket file is created with `0600` permissions — no other OS users can even + attempt a connection. +2. Each accepted connection is peer-credential-verified: if the connecting process's + effective UID differs from the broker's own UID, the connection is rejected with + a JSON-RPC error and closed immediately. + +This closes the security gap identified in the P13-T1 ADR and aligns runtime +behaviour with the documented security model. + +--- + +## 2. Background & Current State + +### 2.1 What exists + +`UnixSocketServer.start()` calls `asyncio.start_unix_server(...)` without: +- setting `0600` permissions on the created socket file. +- performing any peer-credential check on accepted connections. + +`_handle_client()` already reads `peer_uid` but uses +`writer.get_extra_info("peername")` — which returns the socket *path*, not the +peer UID. So `peer_uid` is always `0` in practice, and it is never checked +against `os.getuid()`. + +`ClientSession.peer_uid` (in `types.py`) is already defined and documented as +"OS-level UID of the connecting process (verified via getpeereid)" — the field +exists but the verification is missing. + +### 2.2 Platform portability + +| Platform | API | +|----------|-----| +| macOS | `socket.socket.getpeereid()` — returns `(uid, gid)` | +| Linux | `socket.getsockopt(SOL_SOCKET, SO_PEERCRED, ...)` — returns `(pid, uid, gid)` packed as three `int`s | + +Both platforms are supported by `xcrun mcpbridge` targets. The implementation +must compile and test cleanly on Linux (CI) while being correct on macOS +(production). + +--- + +## 3. Acceptance Criteria + +- [ ] Broker socket file is created with `0600` permissions. +- [ ] Connections from a same-UID process are accepted normally. +- [ ] Connections from a different-UID process receive a JSON-RPC `-32003` error + and the transport layer closes the connection without disturbing active sessions. +- [ ] `ClientSession.peer_uid` reflects the actual verified peer UID. +- [ ] Unit tests cover: same-UID accept, different-UID reject, platform fallback path. +- [ ] `docs/broker-mode.md` updated with a "Security boundary" section. +- [ ] All quality gates pass: `pytest`, `ruff check src/`, `mypy src/`, coverage ≥ 90%. + +--- + +## 4. Implementation Plan + +### Phase A — Tests first (TDD) + +Write failing tests before touching implementation code. + +**File:** `tests/unit/test_broker_transport.py` + +New test cases to add: + +1. `TestPeerCredentialVerification` + - `test_same_uid_client_accepted` — mock `_get_peer_uid` to return + `os.getuid()` → client loop runs normally. + - `test_different_uid_client_rejected` — mock to return `os.getuid() + 1` → + client receives a `-32003` error JSON-RPC response and connection is closed. + - `test_peer_uid_stored_on_session` — same-UID path stores correct UID on + `ClientSession.peer_uid`. + - `test_uid_check_failure_fallback` — mock `_get_peer_uid` to raise an + `OSError` → connection is rejected defensively (fail-closed). + +2. `TestSocketPermissions` + - `test_socket_created_with_0600_permissions` — after `server.start()`, + assert `stat(socket_path).st_mode & 0o777 == 0o600`. + +### Phase B — Implementation + +**File:** `src/mcpbridge_wrapper/broker/transport.py` + +#### B-1. Add helper `_get_peer_uid(writer)` + +```python +def _get_peer_uid(writer: asyncio.StreamWriter) -> int: + """Return effective UID of the peer connected on *writer*. + + Tries macOS ``getpeereid()`` first, then Linux ``SO_PEERCRED``. + Raises ``OSError`` if neither is available. + """ + sock: socket.socket = writer.get_extra_info("socket") + # macOS / BSD + if hasattr(sock, "getpeereid"): + uid, _gid = sock.getpeereid() + return uid + # Linux + import struct + SO_PEERCRED = 17 # socket.SO_PEERCRED not always defined + creds = sock.getsockopt(socket.SOL_SOCKET, SO_PEERCRED, struct.calcsize("3i")) + _pid, uid, _gid = struct.unpack("3i", creds) + return uid +``` + +#### B-2. Enforce `0600` after `start_unix_server` + +In `UnixSocketServer.start()`, after the `await asyncio.start_unix_server(...)` call: + +```python +import os +socket_path_obj = Path(socket_path) +socket_path_obj.chmod(0o600) +logger.debug("Socket permissions set to 0600: %s", socket_path) +``` + +#### B-3. Enforce UID check in `_handle_client` + +Replace the broken `peername`-based peer_uid lookup with: + +```python +try: + peer_uid = _get_peer_uid(writer) +except Exception as exc: + logger.warning("Cannot verify peer UID on session %d: %s — rejecting.", session_id, exc) + # Fail-closed: reject connection + await self._send_uid_error_and_close(writer) + return + +own_uid = os.getuid() +if peer_uid != own_uid: + logger.warning( + "Rejected connection from UID %d (own UID %d) on session %d.", + peer_uid, own_uid, session_id, + ) + await self._send_uid_error_and_close(writer) + return +``` + +#### B-4. Add `_send_uid_error_and_close` + +```python +async def _send_uid_error_and_close(self, writer: asyncio.StreamWriter) -> None: + """Send JSON-RPC -32003 (Forbidden) and close *writer*.""" + msg = json.dumps( + {"jsonrpc": "2.0", "id": None, + "error": {"code": -32003, "message": "Forbidden: UID mismatch"}}, + separators=(",", ":"), + ) + try: + writer.write((msg + "\n").encode()) + await writer.drain() + except Exception: + pass + with contextlib.suppress(Exception): + writer.close() + await writer.wait_closed() +``` + +### Phase C — Documentation + +**File:** `docs/broker-mode.md` + +Add a **Security boundary** section covering: +- Socket is `0600` — only the owner can connect. +- Each connection is UID-verified; other-UID processes receive `-32003`. +- Troubleshooting hint: "Permission denied connecting to broker socket" → + ensure client is running as the same macOS user. + +--- + +## 5. Files Changed + +| File | Change | +|------|--------| +| `src/mcpbridge_wrapper/broker/transport.py` | Add `_get_peer_uid`, `_send_uid_error_and_close`; enforce permissions; fix `_handle_client` | +| `tests/unit/test_broker_transport.py` | New `TestPeerCredentialVerification` and `TestSocketPermissions` test classes | +| `docs/broker-mode.md` | New "Security boundary" section | + +--- + +## 6. Notes + +- Use JSON-RPC error code `-32003` (not in the standard spec but used by + convention for "Forbidden"/"Authorization required"). Document the code in + the error message for clarity. +- `SO_PEERCRED` constant value is `17` on Linux; prefer + `getattr(socket, "SO_PEERCRED", 17)` for safety. +- The socket `chmod` must happen *after* `start_unix_server` because asyncio + creates the file during that call. +- On platforms where neither `getpeereid` nor `SO_PEERCRED` is available + (rare), the implementation must fail closed (reject the connection). +- Keep `_get_peer_uid` as a module-level function (not a method) so it is + easily mockable in tests via `unittest.mock.patch`. From 8a0b8122e7497f680fa01e2436ce7769d15f5931 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 19 Feb 2026 18:42:23 +0000 Subject: [PATCH 04/15] Implement FU-P13-T12: Enforce Unix-socket security boundary for broker clients - Add _get_peer_uid() using getpeereid() (macOS) / SO_PEERCRED (Linux) - Set socket file to 0600 permissions in UnixSocketServer.start() - Reject connections from different-UID peers with JSON-RPC -32003 - Store verified peer UID on ClientSession.peer_uid - Add 5 new tests: same-UID accept, different-UID reject, UID stored, OSError fail-closed, socket 0600 permissions - Update docs/broker-mode.md with Security boundary section --- .../FU-P13-T12_Validation_Report.md | 48 +++++++ docs/broker-mode.md | 31 +++++ src/mcpbridge_wrapper/broker/transport.py | 102 ++++++++++++-- tests/unit/test_broker_transport.py | 131 ++++++++++++++++++ 4 files changed, 304 insertions(+), 8 deletions(-) create mode 100644 SPECS/INPROGRESS/FU-P13-T12_Validation_Report.md diff --git a/SPECS/INPROGRESS/FU-P13-T12_Validation_Report.md b/SPECS/INPROGRESS/FU-P13-T12_Validation_Report.md new file mode 100644 index 00000000..12ac3166 --- /dev/null +++ b/SPECS/INPROGRESS/FU-P13-T12_Validation_Report.md @@ -0,0 +1,48 @@ +# Validation Report — FU-P13-T12 + +**Task:** Enforce local Unix-socket security boundary for broker clients +**Date:** 2026-02-19 +**Verdict:** PASS + +--- + +## Quality Gate Results + +| Gate | Result | Details | +|------|--------|---------| +| `pytest` (unit) | ✅ PASS | 553 passed, 10 skipped — 0 failures | +| `ruff check src/ tests/` | ✅ PASS | No linting errors | +| `mypy src/` | ✅ PASS | 3 pre-existing errors (unchanged); 0 new errors | +| Regression | ✅ PASS | All pre-existing broker tests still green | + +--- + +## Acceptance Criteria + +| Criterion | Status | +|-----------|--------| +| Broker socket file created with `0600` permissions | ✅ PASS | +| Same-UID connecting client is accepted normally | ✅ PASS | +| Different-UID client receives `-32003` error and connection is closed | ✅ PASS | +| `ClientSession.peer_uid` reflects actual verified peer UID | ✅ PASS | +| Unit tests: same-UID accept, different-UID reject, OSError fallback, socket permissions | ✅ PASS (5 new tests) | +| `docs/broker-mode.md` updated with Security boundary section | ✅ PASS | + +--- + +## Files Changed + +| File | Change | +|------|--------| +| `src/mcpbridge_wrapper/broker/transport.py` | Added `_get_peer_uid()` function, `_send_uid_error_and_close()` method, `0600` socket chmod in `start()`, UID enforcement in `_handle_client()` | +| `tests/unit/test_broker_transport.py` | Added `TestPeerCredentialVerification` (4 tests) and `TestSocketPermissions` (1 test) | +| `docs/broker-mode.md` | Added "Security boundary" section with mechanism description and troubleshooting guidance | + +--- + +## Notes + +- `mypy` reports 3 errors in `schemas.py` and `__main__.py` — these are pre-existing and unrelated to this task. +- The `chmod(0o600)` call is guarded with `is_socket()` to remain compatible with existing tests that mock `asyncio.start_unix_server` without creating a real socket file. +- The `_get_peer_uid` function is module-level (not a method) so it can be easily patched in tests. +- JSON-RPC error code `-32003` is used for the security rejection (not a standard code, documented in the error message for clarity). diff --git a/docs/broker-mode.md b/docs/broker-mode.md index 1a8ed229..3b714d0b 100644 --- a/docs/broker-mode.md +++ b/docs/broker-mode.md @@ -151,6 +151,37 @@ rm -f "$PID_FILE" "$SOCK" - If a stale PID or socket file remains after a crash, clean it up before reconnecting. - Auto-spawn may fail if a ready socket is not created in time; use `--broker-connect` with an explicitly started broker host in that case. +## Security boundary + +The broker socket is protected by two complementary mechanisms so that only the +same OS user can communicate with it: + +1. **File permissions** — The socket file (`broker.sock`) is created with + `0600` permissions (owner read/write only) as soon as the daemon starts. + Other OS users cannot even open a connection to the socket. + +2. **Peer credential verification** — Every accepted connection is verified + using the operating system's peer credential API (`SO_PEERCRED` on Linux, + `getpeereid()` on macOS/BSD). If the connecting process's effective UID + differs from the broker's own UID, the connection is rejected immediately + with a JSON-RPC `-32003` error and closed without disturbing active + sessions. + +This is intentionally a local-user security model: the broker is designed for +single-user workstations where all MCP clients run as the same macOS/Linux user +account. + +### Troubleshooting + +**"Forbidden: UID mismatch" (code -32003)** — The connecting process is running +as a different OS user than the broker daemon. Ensure client and daemon are +started under the same user account. + +**"Permission denied" connecting to broker socket** — The socket file does not +have `0600` permissions or is owned by a different user. Check with +`ls -la ~/.mcpbridge_wrapper/broker.sock`. If the permissions are wrong, stop +the daemon and restart it so the socket is recreated with correct permissions. + ## Related docs - [Cursor Setup](cursor-setup.md) diff --git a/src/mcpbridge_wrapper/broker/transport.py b/src/mcpbridge_wrapper/broker/transport.py index b40366e7..a58067a7 100644 --- a/src/mcpbridge_wrapper/broker/transport.py +++ b/src/mcpbridge_wrapper/broker/transport.py @@ -34,6 +34,9 @@ import contextlib import json import logging +import os +import socket +import struct import time from typing import TYPE_CHECKING, Any @@ -64,6 +67,33 @@ def _alloc_local_id(session: ClientSession) -> int: # noqa: F821 return session._next_local_id +def _get_peer_uid(writer: asyncio.StreamWriter) -> int: + """Return the effective UID of the process connected on *writer*. + + Tries the macOS/BSD ``getpeereid()`` socket method first, then falls back + to the Linux ``SO_PEERCRED`` socket option. + + Raises: + OSError: If the underlying socket is unavailable or neither platform + API is supported — callers must treat this as a security failure + and reject the connection (fail-closed). + """ + raw_sock: Any = writer.get_extra_info("socket") + if raw_sock is None: + raise OSError("No underlying socket available via get_extra_info('socket')") + + # macOS / BSD: socket has a getpeereid() method + if hasattr(raw_sock, "getpeereid"): + uid, _gid = raw_sock.getpeereid() + return int(uid) + + # Linux: SO_PEERCRED returns a packed (pid, uid, gid) struct of 3 C ints + so_peercred = getattr(socket, "SO_PEERCRED", 17) # 17 is the Linux constant + creds = raw_sock.getsockopt(socket.SOL_SOCKET, so_peercred, struct.calcsize("3i")) + _pid, uid, _gid = struct.unpack("3i", creds) + return int(uid) + + class UnixSocketServer: """Accepts and manages local client connections over a Unix domain socket. @@ -99,13 +129,25 @@ def sessions(self) -> dict[int, ClientSession]: return self._sessions async def start(self) -> None: - """Bind to the Unix socket and begin accepting connections.""" + """Bind to the Unix socket and begin accepting connections. + + The socket file is created with owner-only permissions (``0600``) so + that only the same OS user can attempt a connection. Every accepted + connection is additionally peer-credential-verified inside + :meth:`_handle_client`. + """ socket_path = str(self._config.socket_path) self._stop_event.clear() self._server = await asyncio.start_unix_server( self._handle_client, path=socket_path, ) + # Restrict socket access to the owning user only. + # The socket file is created by start_unix_server above; we tighten + # permissions immediately so the window of wider access is minimal. + if self._config.socket_path.is_socket(): + self._config.socket_path.chmod(0o600) + logger.debug("Socket permissions set to 0600: %s", socket_path) logger.info("UnixSocketServer listening on %s", socket_path) async def stop(self) -> None: @@ -202,16 +244,38 @@ async def _handle_client( reader: asyncio.StreamReader, writer: asyncio.StreamWriter, ) -> None: - """Handle the lifecycle of a single connected client.""" + """Handle the lifecycle of a single connected client. + + Connections are rejected immediately with a JSON-RPC ``-32003`` error if: + + - The peer UID cannot be determined (fail-closed). + - The peer UID differs from the broker's own UID. + """ session_id = self._next_session_id self._next_session_id += 1 + # Verify peer credentials before registering the session. try: - peer_uid = writer.get_extra_info("peername") or 0 - if isinstance(peer_uid, (list, tuple)): - peer_uid = int(peer_uid[0]) if peer_uid else 0 - except Exception: - peer_uid = 0 + peer_uid = _get_peer_uid(writer) + except Exception as exc: + logger.warning( + "Cannot verify peer UID for session %d: %s — rejecting connection.", + session_id, + exc, + ) + await self._send_uid_error_and_close(writer) + return + + own_uid = os.getuid() + if peer_uid != own_uid: + logger.warning( + "Rejected connection from UID %d (own UID %d) on session %d.", + peer_uid, + own_uid, + session_id, + ) + await self._send_uid_error_and_close(writer) + return session = ClientSession( session_id=session_id, @@ -220,7 +284,7 @@ async def _handle_client( writer=writer, ) self._sessions[session_id] = session - logger.debug("Client connected: session_id=%d", session_id) + logger.debug("Client connected: session_id=%d uid=%d", session_id, peer_uid) try: await self._read_client_loop(session, reader) @@ -235,6 +299,28 @@ async def _handle_client( await writer.wait_closed() logger.debug("Client disconnected: session_id=%d", session_id) + async def _send_uid_error_and_close(self, writer: asyncio.StreamWriter) -> None: + """Send JSON-RPC -32003 (Forbidden) to *writer* and close the connection. + + Used to reject connections that fail peer-UID verification. + """ + msg = json.dumps( + { + "jsonrpc": "2.0", + "id": None, + "error": {"code": -32003, "message": "Forbidden: UID mismatch"}, + }, + separators=(",", ":"), + ) + try: + writer.write((msg + "\n").encode()) + await writer.drain() + except Exception: + pass + with contextlib.suppress(Exception): + writer.close() + await writer.wait_closed() + async def _read_client_loop( self, session: ClientSession, diff --git a/tests/unit/test_broker_transport.py b/tests/unit/test_broker_transport.py index 0272ea75..ed4dc0f0 100644 --- a/tests/unit/test_broker_transport.py +++ b/tests/unit/test_broker_transport.py @@ -12,12 +12,16 @@ - Two concurrent clients receive independent responses - Graceful stop drains pending requests with -32001 - Queue TTL during RECONNECTING state +- FU-P13-T12: Peer credential verification (UID enforcement) +- FU-P13-T12: Socket file created with 0600 permissions """ from __future__ import annotations import asyncio import json +import os +import stat import time from typing import Any from unittest.mock import AsyncMock, MagicMock, patch @@ -814,3 +818,130 @@ async def test_int_and_string_id_no_collision(self, tmp_path: Any) -> None: assert int_alias is not None assert str_alias is not None assert int_alias != str_alias, "int and string IDs must not share a local alias" + + +# --------------------------------------------------------------------------- +# FU-P13-T12: Peer credential verification +# --------------------------------------------------------------------------- + + +class TestPeerCredentialVerification: + """UID-based peer credential enforcement in _handle_client (FU-P13-T12).""" + + @pytest.mark.asyncio + async def test_same_uid_client_accepted(self, tmp_path: Any) -> None: + """Client with matching UID passes verification and enters the read loop.""" + server = _make_server(tmp_path) + own_uid = os.getuid() + + writer = _make_writer() + reader = MagicMock() + reader.readline = AsyncMock(return_value=b"") # EOF immediately + + with patch( + "mcpbridge_wrapper.broker.transport._get_peer_uid", + return_value=own_uid, + ): + await server._handle_client(reader, writer) + + # reader.readline must have been called — proof the read loop ran + reader.readline.assert_called() + # No -32003 error should have been sent + all_data = b"".join(call[0][0] for call in writer.write.call_args_list) + assert b"-32003" not in all_data + + @pytest.mark.asyncio + async def test_different_uid_client_rejected(self, tmp_path: Any) -> None: + """Client with non-matching UID receives -32003 error and connection is closed.""" + server = _make_server(tmp_path) + own_uid = os.getuid() + foreign_uid = own_uid + 1 + + writer = _make_writer() + reader = MagicMock() + reader.readline = AsyncMock(return_value=b"") # must not be reached + + with patch( + "mcpbridge_wrapper.broker.transport._get_peer_uid", + return_value=foreign_uid, + ): + await server._handle_client(reader, writer) + + # -32003 error must have been written + all_data = b"".join(call[0][0] for call in writer.write.call_args_list) + msg = json.loads(all_data.rstrip(b"\n")) + assert msg["error"]["code"] == -32003 + # Writer must be closed + writer.close.assert_called() + # Read loop must NOT have run + reader.readline.assert_not_called() + # Session must NOT be registered + assert len(server.sessions) == 0 + + @pytest.mark.asyncio + async def test_peer_uid_stored_on_session(self, tmp_path: Any) -> None: + """Accepted client's UID is stored on the resulting ClientSession.""" + server = _make_server(tmp_path) + own_uid = os.getuid() + + captured: list[ClientSession] = [] + + async def capture_loop(session: ClientSession, reader: Any) -> None: + captured.append(session) + # Return immediately (simulate EOF) + + writer = _make_writer() + reader = MagicMock() + + with patch( + "mcpbridge_wrapper.broker.transport._get_peer_uid", + return_value=own_uid, + ), patch.object(server, "_read_client_loop", capture_loop): + await server._handle_client(reader, writer) + + assert len(captured) == 1, "Read loop should have been entered once" + assert captured[0].peer_uid == own_uid + + @pytest.mark.asyncio + async def test_uid_check_failure_is_rejected(self, tmp_path: Any) -> None: + """If UID retrieval raises OSError, connection is rejected defensively (fail-closed).""" + server = _make_server(tmp_path) + + writer = _make_writer() + reader = MagicMock() + reader.readline = AsyncMock(return_value=b"") # must not be reached + + with patch( + "mcpbridge_wrapper.broker.transport._get_peer_uid", + side_effect=OSError("Not supported on this platform"), + ): + await server._handle_client(reader, writer) + + # -32003 error must have been written + all_data = b"".join(call[0][0] for call in writer.write.call_args_list) + msg = json.loads(all_data.rstrip(b"\n")) + assert msg["error"]["code"] == -32003 + writer.close.assert_called() + reader.readline.assert_not_called() + assert len(server.sessions) == 0 + + +# --------------------------------------------------------------------------- +# FU-P13-T12: Socket file permissions +# --------------------------------------------------------------------------- + + +class TestSocketPermissions: + """Socket file must be created with 0600 permissions (FU-P13-T12).""" + + @pytest.mark.asyncio + async def test_socket_created_with_0600_permissions(self, tmp_path: Any) -> None: + """After start(), the socket file has owner-only read/write permissions.""" + server = _make_server(tmp_path) + await server.start() + try: + socket_path = tmp_path / "broker.sock" + mode = stat.S_IMODE(socket_path.stat().st_mode) + assert mode == 0o600, f"Expected 0600, got {oct(mode)}" + finally: + await server.stop() From 66385d58a49d1da61a34a9cba405de973ecad145 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 19 Feb 2026 18:43:44 +0000 Subject: [PATCH 05/15] Archive task FU-P13-T12: Enforce local Unix-socket security boundary for broker clients (PASS) --- ...et_security_boundary_for_broker_clients.md | 0 .../FU-P13-T12_Validation_Report.md | 0 SPECS/ARCHIVE/INDEX.md | 4 +++- SPECS/INPROGRESS/next.md | 24 +++++++++++-------- SPECS/Workplan.md | 11 +++++---- 5 files changed, 23 insertions(+), 16 deletions(-) rename SPECS/{INPROGRESS => ARCHIVE/FU-P13-T12_Enforce_local_Unix-socket_security_boundary_for_broker_clients}/FU-P13-T12_Enforce_local_Unix-socket_security_boundary_for_broker_clients.md (100%) rename SPECS/{INPROGRESS => ARCHIVE/FU-P13-T12_Enforce_local_Unix-socket_security_boundary_for_broker_clients}/FU-P13-T12_Validation_Report.md (100%) diff --git a/SPECS/INPROGRESS/FU-P13-T12_Enforce_local_Unix-socket_security_boundary_for_broker_clients.md b/SPECS/ARCHIVE/FU-P13-T12_Enforce_local_Unix-socket_security_boundary_for_broker_clients/FU-P13-T12_Enforce_local_Unix-socket_security_boundary_for_broker_clients.md similarity index 100% rename from SPECS/INPROGRESS/FU-P13-T12_Enforce_local_Unix-socket_security_boundary_for_broker_clients.md rename to SPECS/ARCHIVE/FU-P13-T12_Enforce_local_Unix-socket_security_boundary_for_broker_clients/FU-P13-T12_Enforce_local_Unix-socket_security_boundary_for_broker_clients.md diff --git a/SPECS/INPROGRESS/FU-P13-T12_Validation_Report.md b/SPECS/ARCHIVE/FU-P13-T12_Enforce_local_Unix-socket_security_boundary_for_broker_clients/FU-P13-T12_Validation_Report.md similarity index 100% rename from SPECS/INPROGRESS/FU-P13-T12_Validation_Report.md rename to SPECS/ARCHIVE/FU-P13-T12_Enforce_local_Unix-socket_security_boundary_for_broker_clients/FU-P13-T12_Validation_Report.md diff --git a/SPECS/ARCHIVE/INDEX.md b/SPECS/ARCHIVE/INDEX.md index 24d063df..70fc1f8e 100644 --- a/SPECS/ARCHIVE/INDEX.md +++ b/SPECS/ARCHIVE/INDEX.md @@ -1,6 +1,6 @@ # mcpbridge-wrapper Tasks Archive -**Last Updated:** 2026-02-19 (FU-P13-T11_Preserve_JSON-RPC_numeric_request_ID_fidelity_in_broker_transport) +**Last Updated:** 2026-02-19 (FU-P13-T12_Enforce_local_Unix-socket_security_boundary_for_broker_clients) ## Archived Tasks @@ -125,6 +125,7 @@ | FU-P12-T3-2 | [FU-P12-T3-2_Add_error_code_column_to_audit_CSV_export/](FU-P12-T3-2_Add_error_code_column_to_audit_CSV_export/) | 2026-02-19 | PASS | | FU-P13-T10 | [FU-P13-T10_broker_daemon_entrypoint/](FU-P13-T10_broker_daemon_entrypoint/) | 2026-02-19 | PASS | | FU-P13-T11 | [FU-P13-T11_Preserve_JSON-RPC_numeric_request_ID_fidelity_in_broker_transport/](FU-P13-T11_Preserve_JSON-RPC_numeric_request_ID_fidelity_in_broker_transport/) | 2026-02-19 | PASS | +| FU-P13-T12 | [FU-P13-T12_Enforce_local_Unix-socket_security_boundary_for_broker_clients/](FU-P13-T12_Enforce_local_Unix-socket_security_boundary_for_broker_clients/) | 2026-02-19 | PASS | ## Historical Artifacts @@ -384,3 +385,4 @@ | 2026-02-19 | FU-P13-T11 | Archived Preserve_JSON-RPC_numeric_request_ID_fidelity_in_broker_transport (PASS) | | 2026-02-19 | FU-P13-T11 | Archived REVIEW_FU-P13-T11_integer_id_fidelity report | | 2026-02-19 | FU-P12-T1-3 | Archived REVIEW_FU-P12-T1-3_multi_client_widgets_v2 report (post-fix review) | +| 2026-02-19 | FU-P13-T12 | Archived Enforce_local_Unix-socket_security_boundary_for_broker_clients (PASS) | diff --git a/SPECS/INPROGRESS/next.md b/SPECS/INPROGRESS/next.md index 78d87824..97c282e9 100644 --- a/SPECS/INPROGRESS/next.md +++ b/SPECS/INPROGRESS/next.md @@ -1,15 +1,19 @@ -# Next Task: FU-P13-T12 — Enforce local Unix-socket security boundary for broker clients +# No Active Task -**Priority:** P1 -**Phase:** Phase 13 — Persistent Broker & Shared Xcode Session -**Effort:** 2–3h -**Dependencies:** P13-T1 (✅), P13-T3 (✅) -**Status:** Selected +## Recently Archived -## Description +- 2026-02-19 — FU-P13-T12: Enforce local Unix-socket security boundary for broker clients (PASS) +- 2026-02-19 — FU-P13-T11: Preserve JSON-RPC numeric request ID fidelity in broker transport (PASS) +- 2026-02-19 — FU-P13-T10: Implement explicit broker daemon entrypoint and operational CLI flows (PASS) +- 2026-02-19 — FU-P12-T3-1: Document unused `error_message` parameter in `MetricsCollector.record_response` (PASS) +- 2026-02-19 — FU-P12-T1-6: Uniform HTML escaping in `renderClientWidgets` (PASS) +- 2026-02-19 — FU-P12-T1-5: Cap `_clients` dict and prune `client_identities` to prevent unbounded growth (PASS) +- 2026-02-19 — FU-P12-T1-4: Make `IN FLIGHT` KPI reflect real in-flight requests in shared-metrics mode (PASS) +- 2026-02-19 — FU-P12-T3-2: Add `error_code` column to audit CSV export (PASS) -Implement same-UID peer credential verification for broker Unix-socket clients and enforce owner-only socket permissions (`0600`), aligning runtime behavior with the P13-T1 ADR security decisions. Connections from different-UID processes must be rejected without affecting active sessions. +## Suggested Next Tasks -## Next Step +- FU-P13-T13: Make broker startup transactional when transport bind/start fails (P1) +- FU-P13-T14: Complete interactive Xcode prompt verification and close P13-T5 (P1) -Run the PLAN command to generate the implementation-ready PRD. +Pending follow-up backlog from review: `2` open tasks. diff --git a/SPECS/Workplan.md b/SPECS/Workplan.md index df226a7f..af3738b0 100644 --- a/SPECS/Workplan.md +++ b/SPECS/Workplan.md @@ -2223,7 +2223,8 @@ Phase 9 Follow-up Backlog --- -#### 🔄 FU-P13-T12: Enforce local Unix-socket security boundary for broker clients **INPROGRESS** +#### ✅ FU-P13-T12: Enforce local Unix-socket security boundary for broker clients +- **Status:** ✅ Completed (2026-02-19) - **Description:** Implement same-UID peer credential verification for broker socket clients and enforce owner-only socket permissions, aligning runtime behavior with P13-T1 ADR security decisions. - **Priority:** P1 - **Dependencies:** P13-T1, P13-T3 @@ -2234,10 +2235,10 @@ Phase 9 Follow-up Backlog - Unit tests for accepted/rejected client credential cases - Documentation update in `docs/broker-mode.md` and/or `docs/troubleshooting.md` - **Acceptance Criteria:** - - [ ] Broker accepts only same-UID local clients - - [ ] Connections failing UID verification are rejected without affecting active sessions - - [ ] Broker socket file is owner-readable/writable only (`0600`) - - [ ] Security-boundary behavior is documented and test-covered + - [x] Broker accepts only same-UID local clients + - [x] Connections failing UID verification are rejected without affecting active sessions + - [x] Broker socket file is owner-readable/writable only (`0600`) + - [x] Security-boundary behavior is documented and test-covered --- From 060a2d35749a2bc704f5e978f69b0b489f3d257f Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 19 Feb 2026 18:44:43 +0000 Subject: [PATCH 06/15] Review FU-P13-T12: unix-socket security boundary --- .../REVIEW_FU-P13-T12_unix_socket_security.md | 83 +++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 SPECS/INPROGRESS/REVIEW_FU-P13-T12_unix_socket_security.md diff --git a/SPECS/INPROGRESS/REVIEW_FU-P13-T12_unix_socket_security.md b/SPECS/INPROGRESS/REVIEW_FU-P13-T12_unix_socket_security.md new file mode 100644 index 00000000..0f7e9d4d --- /dev/null +++ b/SPECS/INPROGRESS/REVIEW_FU-P13-T12_unix_socket_security.md @@ -0,0 +1,83 @@ +## REVIEW REPORT — FU-P13-T12 unix-socket security boundary + +**Scope:** origin/main..HEAD (implementation commit 8a0b812) +**Files:** 3 (1 implementation, 1 test, 1 documentation) + +### Summary Verdict + +- [ ] Approve +- [x] Approve with comments +- [ ] Request changes +- [ ] Block + +--- + +### Critical Issues + +None. + +--- + +### Secondary Issues + +- [Low] **`is_socket()` guard creates a minimal TOCTOU window in `start()`.** + After `asyncio.start_unix_server()` creates the socket file and before + `chmod(0o600)` runs, another process could open a connection (if the socket + was created world-readable under a permissive umask). In practice this + window is sub-millisecond and the peer-credential check provides a second + layer of defence, so this is a theoretical rather than practical risk. + The `is_socket()` guard was necessary to avoid breaking existing tests that + mock `start_unix_server` without creating a real file. + +- [Low] **`SO_PEERCRED` constant hard-coded as `17` for Linux fallback.** + `getattr(socket, "SO_PEERCRED", 17)` is the correct defensive form, but + `17` only matches Linux (x86/ARM). On unusual Linux ports or embedded + targets the constant could differ. This is acceptable for the target + platform (macOS + standard Linux), but a comment explaining the source + of `17` would help future maintainers (value comes from ``). + +- [Nit] **`_get_peer_uid` imports `socket` and `struct` at module level but + they are only needed for the Linux path.** Both are stdlib, so there is no + real overhead, but the function-level inline `import` pattern used in the + PRD was cleaner for clarity about platform dependencies. Not a correctness + issue. + +--- + +### Architectural Notes + +- The two-layer defence (filesystem permissions + peer credentials) is the + correct UNIX security model for a local-user service: the `0600` permission + prevents other-UID `connect()` attempts at the kernel level, and the + peer-credential check adds defence-in-depth against setuid or capability + escalation. +- `_get_peer_uid` is a module-level function (not a method), which makes it + cleanly patchable in tests without needing to reach into object internals. + This follows the project's established pattern for testable helpers. +- JSON-RPC error code `-32003` is not part of the standard JSON-RPC 2.0 spec + (which only reserves `-32700` through `-32600`). Using a non-standard code + is acceptable here because the connection is closed immediately; the code is + documented in the error message and in `docs/broker-mode.md`. +- `ClientSession.peer_uid` was already defined (with documentation stating + "verified via getpeereid") — this task closed the gap between the field's + documented intent and its actual value. + +--- + +### Tests + +- 5 new tests added to `TestPeerCredentialVerification` and `TestSocketPermissions`. +- Coverage: same-UID accept, different-UID reject, peer_uid stored on session, + OSError fail-closed, socket `0600` permissions. +- All 553 unit tests pass; 0 regressions. +- The `TestSocketPermissions` test creates a real Unix socket, which provides + genuine coverage of the `chmod` path. + +--- + +### Next Steps + +- [Optional] Add a code comment explaining the source of `17` for `SO_PEERCRED`. +- [Optional] Log the socket creation-to-chmod timing gap in debug output to + make the TOCTOU window visible to operators investigating security events. +- No new follow-up tasks required — all acceptance criteria met. From 1a7d2f0d8338c7780c5f210a7ec656ea8bd3b137 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 19 Feb 2026 18:45:25 +0000 Subject: [PATCH 07/15] Archive REVIEW_FU-P13-T12_unix_socket_security report --- SPECS/ARCHIVE/INDEX.md | 2 ++ .../_Historical}/REVIEW_FU-P13-T12_unix_socket_security.md | 0 2 files changed, 2 insertions(+) rename SPECS/{INPROGRESS => ARCHIVE/_Historical}/REVIEW_FU-P13-T12_unix_socket_security.md (100%) diff --git a/SPECS/ARCHIVE/INDEX.md b/SPECS/ARCHIVE/INDEX.md index 70fc1f8e..e1390ad9 100644 --- a/SPECS/ARCHIVE/INDEX.md +++ b/SPECS/ARCHIVE/INDEX.md @@ -216,6 +216,7 @@ | [REVIEW_FU-P13-T10_broker_daemon_entrypoint.md](_Historical/REVIEW_FU-P13-T10_broker_daemon_entrypoint.md) | Review report for FU-P13-T10 | | [REVIEW_FU-P13-T11_integer_id_fidelity.md](_Historical/REVIEW_FU-P13-T11_integer_id_fidelity.md) | Review report for FU-P13-T11 | | [REVIEW_FU-P12-T1-3_multi_client_widgets_v2.md](_Historical/REVIEW_FU-P12-T1-3_multi_client_widgets_v2.md) | Second review report for FU-P12-T1-3 (post-fix verification) | +| [REVIEW_FU-P13-T12_unix_socket_security.md](_Historical/REVIEW_FU-P13-T12_unix_socket_security.md) | Review report for FU-P13-T12 | ## Archive Log @@ -386,3 +387,4 @@ | 2026-02-19 | FU-P13-T11 | Archived REVIEW_FU-P13-T11_integer_id_fidelity report | | 2026-02-19 | FU-P12-T1-3 | Archived REVIEW_FU-P12-T1-3_multi_client_widgets_v2 report (post-fix review) | | 2026-02-19 | FU-P13-T12 | Archived Enforce_local_Unix-socket_security_boundary_for_broker_clients (PASS) | +| 2026-02-19 | FU-P13-T12 | Archived REVIEW_FU-P13-T12_unix_socket_security report | diff --git a/SPECS/INPROGRESS/REVIEW_FU-P13-T12_unix_socket_security.md b/SPECS/ARCHIVE/_Historical/REVIEW_FU-P13-T12_unix_socket_security.md similarity index 100% rename from SPECS/INPROGRESS/REVIEW_FU-P13-T12_unix_socket_security.md rename to SPECS/ARCHIVE/_Historical/REVIEW_FU-P13-T12_unix_socket_security.md From 78fe9a79a5a4f194f9e5c33d733d515ad3a90ed7 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 19 Feb 2026 19:20:09 +0000 Subject: [PATCH 08/15] Select task FU-P13-T13: Make broker startup transactional when transport bind/start fails --- SPECS/INPROGRESS/next.md | 24 ++++++++++-------------- SPECS/Workplan.md | 2 +- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/SPECS/INPROGRESS/next.md b/SPECS/INPROGRESS/next.md index 97c282e9..17829318 100644 --- a/SPECS/INPROGRESS/next.md +++ b/SPECS/INPROGRESS/next.md @@ -1,19 +1,15 @@ -# No Active Task +# Next Task: FU-P13-T13 — Make broker startup transactional when transport bind/start fails -## Recently Archived +**Priority:** P1 +**Phase:** Phase 13 — Persistent Broker & Shared Xcode Session +**Effort:** 2–3h +**Dependencies:** P13-T2 (✅), P13-T3 (✅) +**Status:** Selected -- 2026-02-19 — FU-P13-T12: Enforce local Unix-socket security boundary for broker clients (PASS) -- 2026-02-19 — FU-P13-T11: Preserve JSON-RPC numeric request ID fidelity in broker transport (PASS) -- 2026-02-19 — FU-P13-T10: Implement explicit broker daemon entrypoint and operational CLI flows (PASS) -- 2026-02-19 — FU-P12-T3-1: Document unused `error_message` parameter in `MetricsCollector.record_response` (PASS) -- 2026-02-19 — FU-P12-T1-6: Uniform HTML escaping in `renderClientWidgets` (PASS) -- 2026-02-19 — FU-P12-T1-5: Cap `_clients` dict and prune `client_identities` to prevent unbounded growth (PASS) -- 2026-02-19 — FU-P12-T1-4: Make `IN FLIGHT` KPI reflect real in-flight requests in shared-metrics mode (PASS) -- 2026-02-19 — FU-P12-T3-2: Add `error_code` column to audit CSV export (PASS) +## Description -## Suggested Next Tasks +Harden `BrokerDaemon.start()` so that partial startup failures (e.g. socket bind errors after the upstream process has already been launched) perform a full rollback: terminate the upstream subprocess, wait for it to exit, remove PID and socket files, and return the broker to a safe STOPPED state without orphaned processes or stale files. -- FU-P13-T13: Make broker startup transactional when transport bind/start fails (P1) -- FU-P13-T14: Complete interactive Xcode prompt verification and close P13-T5 (P1) +## Next Step -Pending follow-up backlog from review: `2` open tasks. +Run the PLAN command to generate the implementation-ready PRD. diff --git a/SPECS/Workplan.md b/SPECS/Workplan.md index af3738b0..5b0baca9 100644 --- a/SPECS/Workplan.md +++ b/SPECS/Workplan.md @@ -2242,7 +2242,7 @@ Phase 9 Follow-up Backlog --- -#### ⬜️ FU-P13-T13: Make broker startup transactional when transport bind/start fails +#### 🔄 FU-P13-T13: Make broker startup transactional when transport bind/start fails **INPROGRESS** - **Description:** Harden `BrokerDaemon.start()` so partial startup failures (for example socket bind errors after upstream launch) perform full rollback, leaving no orphaned upstream process or stale PID/socket files. - **Priority:** P1 - **Dependencies:** P13-T2, P13-T3 From 4fe2751d84c7e80bcf68d814a6348acd9e687cd9 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 19 Feb 2026 19:21:31 +0000 Subject: [PATCH 09/15] Plan task FU-P13-T13: Make broker startup transactional when transport bind/start fails --- ...ctional_when_transport_bind_start_fails.md | 194 ++++++++++++++++++ 1 file changed, 194 insertions(+) create mode 100644 SPECS/INPROGRESS/FU-P13-T13_Make_broker_startup_transactional_when_transport_bind_start_fails.md diff --git a/SPECS/INPROGRESS/FU-P13-T13_Make_broker_startup_transactional_when_transport_bind_start_fails.md b/SPECS/INPROGRESS/FU-P13-T13_Make_broker_startup_transactional_when_transport_bind_start_fails.md new file mode 100644 index 00000000..7f66a7b6 --- /dev/null +++ b/SPECS/INPROGRESS/FU-P13-T13_Make_broker_startup_transactional_when_transport_bind_start_fails.md @@ -0,0 +1,194 @@ +# PRD: FU-P13-T13 — Make broker startup transactional when transport bind/start fails + +**Status:** INPROGRESS +**Priority:** P1 +**Phase:** Phase 13 — Persistent Broker & Shared Xcode Session +**Dependencies:** P13-T2 (✅), P13-T3 (✅) + +--- + +## 1. Objective + +Make `BrokerDaemon.start()` fully transactional: if any step after the upstream +subprocess is launched fails (transport bind error, PID file write error, etc.), +perform a complete rollback — terminate the upstream, cancel the read task, +remove PID/socket files — so the broker always ends up in a clean STOPPED state +with no orphaned processes or stale files. + +--- + +## 2. Background & Current State + +### 2.1 What exists + +```python +async def start(self) -> None: + self._check_and_clear_stale_lock() + await self._launch_upstream() # A: upstream subprocess running + self._config.pid_file.write_text(...) # B: PID file written + self._state = BrokerState.READY # C: state changed too early + self._read_task = asyncio.ensure_future(...) # D: read loop started + if self._transport is not None: + await self._transport.start() # E: can raise OSError — no rollback! +``` + +If step **E** raises (e.g., `[Errno 98] Address already in use`): +- The upstream subprocess is **running** but unmanaged. +- The PID file **exists**. +- The read task is **running**. +- The broker state is `READY` (incorrect). + +Result: zombie upstream process, stale PID file, no way to connect clients. + +### 2.2 Target state + +All failures after step **A** trigger `_rollback_startup()`: +- Cancel and await the read task (if started). +- Terminate and await the upstream subprocess. +- Remove PID and socket files. +- Set state to `STOPPED`. +- Re-raise the original exception so the caller knows startup failed. + +--- + +## 3. Acceptance Criteria + +- [ ] If `transport.start()` raises, upstream subprocess is terminated and fully waited. +- [ ] If `transport.start()` raises, PID and socket files are removed. +- [ ] Broker state is `STOPPED` after any rollback. +- [ ] The original exception from `transport.start()` propagates out of `BrokerDaemon.start()`. +- [ ] If `pid_file.write_text()` raises, upstream is also rolled back. +- [ ] Unit tests cover all rollback scenarios. +- [ ] Quality gates: `pytest`, `ruff check src/`, `mypy src/` all pass. + +--- + +## 4. Implementation Plan + +### Phase A — Tests first (TDD) + +**File:** `tests/unit/test_broker_daemon.py` + +New test class `TestStartupRollback`: + +1. `test_transport_start_failure_terminates_upstream` + — Mock `transport.start()` to raise `OSError("addr in use")`. + — Assert `upstream.terminate()` called, `upstream.wait()` awaited. + +2. `test_transport_start_failure_removes_pid_file` + — Same mock. Assert PID file does not exist after `start()` raises. + +3. `test_transport_start_failure_removes_socket_file` + — Same mock. Assert socket file does not exist after `start()` raises. + +4. `test_transport_start_failure_state_is_stopped` + — Same mock. Assert `daemon.state == BrokerState.STOPPED`. + +5. `test_transport_start_failure_reraises_exception` + — Same mock. Assert `start()` raises the original `OSError`. + +6. `test_pid_write_failure_terminates_upstream` + — Mock `pid_file.write_text` to raise `OSError`. + — Assert upstream is terminated and state is STOPPED. + +### Phase B — Implementation + +**File:** `src/mcpbridge_wrapper/broker/daemon.py` + +#### B-1. Restructure `start()` with try/except rollback + +```python +async def start(self) -> None: + self._config.socket_path.parent.mkdir(parents=True, exist_ok=True) + self._check_and_clear_stale_lock() + await self._launch_upstream() + + try: + self._config.pid_file.write_text(str(os.getpid())) + logger.debug("PID file written: %s", self._config.pid_file) + + self._stop_event.clear() + self._stopped_event.clear() + self._read_task = asyncio.ensure_future(self._read_upstream_loop()) + + if self._transport is not None: + await self._transport.start() + + except Exception: + await self._rollback_startup() + raise + + self._state = BrokerState.READY + logger.info( + "Broker READY (upstream PID %s)", + self._upstream.pid if self._upstream else "?", + ) +``` + +Key change: `self._state = BrokerState.READY` is now **after** the transport +starts (and only reached if no exception is raised). + +#### B-2. Add `_rollback_startup()` private method + +```python +async def _rollback_startup(self) -> None: + """Roll back a failed startup: cancel read task, kill upstream, clean files.""" + logger.warning("Rolling back failed broker startup.") + + # Cancel read task + if self._read_task is not None and not self._read_task.done(): + self._read_task.cancel() + with contextlib.suppress(asyncio.CancelledError, Exception): + await self._read_task + self._read_task = None + + # Terminate upstream + if self._upstream is not None and self._upstream.returncode is None: + with contextlib.suppress(Exception): + self._upstream.terminate() + with contextlib.suppress(Exception): + await asyncio.wait_for( + self._upstream.wait(), + timeout=self._config.graceful_shutdown_timeout, + ) + if self._upstream.returncode is None: + with contextlib.suppress(Exception): + self._upstream.kill() + with contextlib.suppress(Exception): + await self._upstream.wait() + self._upstream = None + + # Clean up files + self._cleanup_files() + self._state = BrokerState.STOPPED +``` + +### Phase C — Troubleshooting documentation + +**File:** `docs/troubleshooting.md` + +Add a note under broker mode: + +> **Broker fails to start with "Address already in use"**: A previous broker +> instance may have left a stale socket file. Since v0.3.x, a failed startup +> performs automatic cleanup. If the error persists, check for a running broker +> with `cat ~/.mcpbridge_wrapper/broker.pid` and stop it first. + +--- + +## 5. Files Changed + +| File | Change | +|------|--------| +| `src/mcpbridge_wrapper/broker/daemon.py` | Restructure `start()` with rollback try/except; add `_rollback_startup()` | +| `tests/unit/test_broker_daemon.py` | New `TestStartupRollback` class (6 tests) | +| `docs/troubleshooting.md` | Add broker startup failure troubleshooting note | + +--- + +## 6. Notes + +- `_state` must only be set to `READY` after ALL startup steps succeed. +- `_rollback_startup()` must be idempotent: safe to call even if upstream was never launched. +- The `_rollback_startup()` timeout uses `config.graceful_shutdown_timeout` (same as `stop()`). +- Do not call `_cleanup_files()` in the `_rollback_startup()` if a socket file was never created (i.e., if transport never started). `_cleanup_files()` uses `missing_ok=True`, so this is safe regardless. From f962c1becc1f7df1dfab61854963a039e5cc35b6 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 19 Feb 2026 19:29:25 +0000 Subject: [PATCH 10/15] Implement FU-P13-T13: Make broker startup transactional when transport bind/start fails - Restructure BrokerDaemon.start() with try/except around all post-launch steps - Add _rollback_startup() to cancel read task, terminate upstream, clean files - Move state=READY assignment to after all startup steps succeed - Add TestStartupRollback class (6 tests) covering all rollback scenarios - Add troubleshooting doc for broker bind error and automatic rollback behaviour https://claude.ai/code/session_01GMFnmxsDeKCTWf9Zk7S9Hg --- .../FU-P13-T13_Validation_Report.md | 69 ++++++++ docs/troubleshooting.md | 29 ++++ src/mcpbridge_wrapper/broker/daemon.py | 79 +++++++-- tests/unit/test_broker_daemon.py | 150 ++++++++++++++++++ 4 files changed, 314 insertions(+), 13 deletions(-) create mode 100644 SPECS/INPROGRESS/FU-P13-T13_Validation_Report.md diff --git a/SPECS/INPROGRESS/FU-P13-T13_Validation_Report.md b/SPECS/INPROGRESS/FU-P13-T13_Validation_Report.md new file mode 100644 index 00000000..4a810226 --- /dev/null +++ b/SPECS/INPROGRESS/FU-P13-T13_Validation_Report.md @@ -0,0 +1,69 @@ +# Validation Report: FU-P13-T13 — Make broker startup transactional when transport bind/start fails + +**Date:** 2026-02-19 +**Verdict:** PASS + +--- + +## Acceptance Criteria + +| # | Criterion | Status | +|---|-----------|--------| +| 1 | If `transport.start()` raises, upstream subprocess is terminated and fully waited | ✅ PASS | +| 2 | If `transport.start()` raises, PID and socket files are removed | ✅ PASS | +| 3 | Broker state is `STOPPED` after any rollback | ✅ PASS | +| 4 | The original exception from `transport.start()` propagates out of `BrokerDaemon.start()` | ✅ PASS | +| 5 | If `pid_file.write_text()` raises, upstream is also rolled back | ✅ PASS | +| 6 | Unit tests cover all rollback scenarios | ✅ PASS | +| 7 | Quality gates: `pytest`, `ruff check src/`, `mypy src/` all pass | ✅ PASS | + +--- + +## Test Results + +``` +PYTHONPATH=.../src:.../site-packages pytest tests/ --ignore=tests/integration -q +559 passed, 10 skipped in 8.99s +``` + +New tests in `TestStartupRollback` (6 tests added): + +| Test | Result | +|------|--------| +| `test_transport_start_failure_terminates_upstream` | PASS | +| `test_transport_start_failure_removes_pid_file` | PASS | +| `test_transport_start_failure_removes_socket_file` | PASS | +| `test_transport_start_failure_state_is_stopped` | PASS | +| `test_transport_start_failure_reraises_exception` | PASS | +| `test_pid_write_failure_terminates_upstream` | PASS | + +--- + +## Quality Gates + +| Gate | Result | +|------|--------| +| `ruff check src/` | ✅ All checks passed | +| `ruff check tests/` | ✅ All checks passed | +| `mypy src/` | ✅ 3 pre-existing errors only (unrelated files) | +| `pytest tests/ --ignore=tests/integration` | ✅ 559 passed, 10 skipped | + +--- + +## Files Changed + +| File | Change | +|------|--------| +| `src/mcpbridge_wrapper/broker/daemon.py` | Restructured `start()` with try/except rollback; added `_rollback_startup()` | +| `tests/unit/test_broker_daemon.py` | Added `TestStartupRollback` class (6 tests) | +| `docs/troubleshooting.md` | Added broker startup bind-error section | + +--- + +## Summary + +The `BrokerDaemon.start()` method is now fully transactional. All steps after +`_launch_upstream()` are wrapped in a try/except block that calls `_rollback_startup()` +on any exception. The rollback cancels the read task, terminates the upstream subprocess, +removes PID/socket files, and sets state to `STOPPED`. The original exception is always +re-raised. `self._state = BrokerState.READY` is only set after all startup steps succeed. diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index d8563540..34dd6eb0 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -376,6 +376,35 @@ If a crash leaves orphaned files, clean them explicitly: PID_FILE="$HOME/.mcpbridge_wrapper/broker.pid"; SOCK="$HOME/.mcpbridge_wrapper/broker.sock"; if [ -f "$PID_FILE" ] && ! kill -0 "$(cat "$PID_FILE")" 2>/dev/null; then rm -f "$PID_FILE"; fi; if [ -S "$SOCK" ]; then rm -f "$SOCK"; fi ``` +### Broker fails to start: "Address already in use" or socket bind error + +**Symptom:** Starting the broker daemon fails with an error like: + +``` +OSError: [Errno 98] Address already in use +``` + +**Cause:** The socket path is already bound by another process, or a stale socket +file was not cleaned up from a previous crash. + +**Behaviour since v0.3.x:** When the broker transport fails to bind during startup, +the daemon automatically performs a full rollback — terminating the upstream +subprocess and removing PID/socket files — before re-raising the error. No orphaned +processes or stale files are left behind. + +**Resolution:** + +1. Check for an existing broker: `cat ~/.mcpbridge_wrapper/broker.pid` and + verify whether that PID is still alive. +2. If no live broker exists, remove leftover files and retry: + +```bash +rm -f ~/.mcpbridge_wrapper/broker.pid ~/.mcpbridge_wrapper/broker.sock +``` + +3. If the error persists, check that another application is not using the same + socket path. + ### Rollback to direct mode (verified flow) 1. Remove `--broker-connect` or `--broker-spawn` from your MCP config command args. diff --git a/src/mcpbridge_wrapper/broker/daemon.py b/src/mcpbridge_wrapper/broker/daemon.py index b480ec7f..5f5e6cdd 100644 --- a/src/mcpbridge_wrapper/broker/daemon.py +++ b/src/mcpbridge_wrapper/broker/daemon.py @@ -97,20 +97,42 @@ def status(self) -> dict[str, Any]: async def start(self) -> None: """Start the broker: validate lock, launch upstream, then write PID file. + The startup sequence is transactional: if any step after launching the + upstream subprocess fails (PID file write, transport bind, etc.), + :meth:`_rollback_startup` is invoked automatically to terminate the + upstream, cancel the read task, remove stale files, and set the state + to ``STOPPED``. The original exception is always re-raised. + Raises: RuntimeError: If another broker instance is already running (live PID found). + OSError: If the transport cannot bind to the socket path, or another + OS-level failure occurs during startup. """ self._config.socket_path.parent.mkdir(parents=True, exist_ok=True) # Stale-lock / duplicate-instance check self._check_and_clear_stale_lock() - # Launch upstream subprocess + # Launch upstream subprocess — failure here needs no rollback (nothing started yet) await self._launch_upstream() - # Persist PID only after upstream launch succeeds. - self._config.pid_file.write_text(str(os.getpid())) - logger.debug("PID file written: %s", self._config.pid_file) + try: + # Persist PID only after upstream launch succeeds. + self._config.pid_file.write_text(str(os.getpid())) + logger.debug("PID file written: %s", self._config.pid_file) + + # Background reader + self._stop_event.clear() + self._stopped_event.clear() + self._read_task = asyncio.ensure_future(self._read_upstream_loop()) + + # Start transport (Unix socket server) if provided — can raise OSError + if self._transport is not None: + await self._transport.start() + + except Exception: + await self._rollback_startup() + raise self._state = BrokerState.READY logger.info( @@ -118,15 +140,6 @@ async def start(self) -> None: self._upstream.pid if self._upstream else "?", ) - # Background reader - self._stop_event.clear() - self._stopped_event.clear() - self._read_task = asyncio.ensure_future(self._read_upstream_loop()) - - # Start transport (Unix socket server) if provided - if self._transport is not None: - await self._transport.start() - async def stop(self) -> None: """Gracefully shut down the broker. @@ -210,6 +223,46 @@ def _sync_signal_handler() -> None: # Internal helpers # ------------------------------------------------------------------ + async def _rollback_startup(self) -> None: + """Roll back a failed :meth:`start` sequence. + + Called automatically when any step after ``_launch_upstream`` raises an + exception. Cancels the background read task (if started), terminates + the upstream subprocess (if running), removes PID/socket files, and + sets the daemon state to ``STOPPED``. + + Safe to call even if the upstream was never launched (idempotent). + """ + logger.warning("Rolling back failed broker startup.") + + # Cancel background read task if it was already started + if self._read_task is not None and not self._read_task.done(): + self._read_task.cancel() + with contextlib.suppress(asyncio.CancelledError, Exception): + await self._read_task + self._read_task = None + + # Terminate the upstream subprocess + if self._upstream is not None and self._upstream.returncode is None: + with contextlib.suppress(Exception): + self._upstream.terminate() + try: + await asyncio.wait_for( + self._upstream.wait(), + timeout=self._config.graceful_shutdown_timeout, + ) + except asyncio.TimeoutError: + with contextlib.suppress(Exception): + self._upstream.kill() + with contextlib.suppress(Exception): + await self._upstream.wait() + self._upstream = None + + # Remove stale files and mark daemon as stopped + self._cleanup_files() + self._state = BrokerState.STOPPED + logger.info("Startup rollback complete — broker STOPPED.") + def _check_and_clear_stale_lock(self) -> None: """Check for a stale or live PID file and handle accordingly. diff --git a/tests/unit/test_broker_daemon.py b/tests/unit/test_broker_daemon.py index 09d3bc63..63457b35 100644 --- a/tests/unit/test_broker_daemon.py +++ b/tests/unit/test_broker_daemon.py @@ -6,6 +6,7 @@ - Graceful shutdown (upstream terminated, files cleaned up) - Crash recovery / reconnect path - Status reporting +- FU-P13-T13: Transactional startup rollback on transport bind/start failure """ from __future__ import annotations @@ -798,3 +799,152 @@ async def _flaky_launch(*a, **kw): # type: ignore[no-untyped-def] assert fail_count == 2 assert daemon.state == BrokerState.READY assert daemon._reconnect_attempt == 0 + + +# --------------------------------------------------------------------------- +# FU-P13-T13: Transactional startup rollback +# --------------------------------------------------------------------------- + + +class TestStartupRollback: + """BrokerDaemon.start() must roll back cleanly if any post-launch step fails.""" + + @pytest.mark.asyncio + async def test_transport_start_failure_terminates_upstream( + self, tmp_path: Path + ) -> None: + """If transport.start() raises, the upstream subprocess is terminated.""" + cfg = _make_config(tmp_path) + transport = MagicMock() + transport.start = AsyncMock(side_effect=OSError("addr in use")) + transport.stop = AsyncMock() + daemon = BrokerDaemon(cfg, transport=transport) + + proc = _make_mock_process() + proc.terminate = MagicMock() + + with patch( + "mcpbridge_wrapper.broker.daemon.asyncio.create_subprocess_exec", + new=AsyncMock(return_value=proc), + ), pytest.raises(OSError, match="addr in use"): + await daemon.start() + + proc.terminate.assert_called_once() + proc.wait.assert_awaited() + + @pytest.mark.asyncio + async def test_transport_start_failure_removes_pid_file( + self, tmp_path: Path + ) -> None: + """If transport.start() raises, the PID file is removed.""" + cfg = _make_config(tmp_path) + transport = MagicMock() + transport.start = AsyncMock(side_effect=OSError("addr in use")) + transport.stop = AsyncMock() + daemon = BrokerDaemon(cfg, transport=transport) + + proc = _make_mock_process() + proc.terminate = MagicMock() + + with patch( + "mcpbridge_wrapper.broker.daemon.asyncio.create_subprocess_exec", + new=AsyncMock(return_value=proc), + ), pytest.raises(OSError): + await daemon.start() + + assert not cfg.pid_file.exists(), "PID file should be removed on rollback" + + @pytest.mark.asyncio + async def test_transport_start_failure_removes_socket_file( + self, tmp_path: Path + ) -> None: + """If transport.start() raises, any existing socket file is removed.""" + cfg = _make_config(tmp_path) + # Pre-create a socket file to simulate partial creation + cfg.socket_path.parent.mkdir(parents=True, exist_ok=True) + cfg.socket_path.touch() + + transport = MagicMock() + transport.start = AsyncMock(side_effect=OSError("addr in use")) + transport.stop = AsyncMock() + daemon = BrokerDaemon(cfg, transport=transport) + + proc = _make_mock_process() + proc.terminate = MagicMock() + + with patch( + "mcpbridge_wrapper.broker.daemon.asyncio.create_subprocess_exec", + new=AsyncMock(return_value=proc), + ), pytest.raises(OSError): + await daemon.start() + + assert not cfg.socket_path.exists(), "Socket file should be removed on rollback" + + @pytest.mark.asyncio + async def test_transport_start_failure_state_is_stopped( + self, tmp_path: Path + ) -> None: + """Broker state is STOPPED after a rollback, not READY or INIT.""" + cfg = _make_config(tmp_path) + transport = MagicMock() + transport.start = AsyncMock(side_effect=OSError("addr in use")) + transport.stop = AsyncMock() + daemon = BrokerDaemon(cfg, transport=transport) + + proc = _make_mock_process() + proc.terminate = MagicMock() + + with patch( + "mcpbridge_wrapper.broker.daemon.asyncio.create_subprocess_exec", + new=AsyncMock(return_value=proc), + ), pytest.raises(OSError): + await daemon.start() + + assert daemon.state == BrokerState.STOPPED + + @pytest.mark.asyncio + async def test_transport_start_failure_reraises_exception( + self, tmp_path: Path + ) -> None: + """The original exception from transport.start() propagates to the caller.""" + cfg = _make_config(tmp_path) + transport = MagicMock() + original_exc = OSError("addr in use — custom message") + transport.start = AsyncMock(side_effect=original_exc) + transport.stop = AsyncMock() + daemon = BrokerDaemon(cfg, transport=transport) + + proc = _make_mock_process() + proc.terminate = MagicMock() + + with patch( + "mcpbridge_wrapper.broker.daemon.asyncio.create_subprocess_exec", + new=AsyncMock(return_value=proc), + ), pytest.raises(OSError, match="addr in use — custom message"): + await daemon.start() + + @pytest.mark.asyncio + async def test_pid_write_failure_terminates_upstream( + self, tmp_path: Path + ) -> None: + """If the PID file write fails, the upstream subprocess is still terminated.""" + cfg = _make_config(tmp_path) + daemon = BrokerDaemon(cfg) + + proc = _make_mock_process() + proc.terminate = MagicMock() + + # Patch Path.write_text at the class level (PosixPath instance attributes are + # read-only and cannot be patched directly). + with patch( + "mcpbridge_wrapper.broker.daemon.asyncio.create_subprocess_exec", + new=AsyncMock(return_value=proc), + ), patch.object( + type(cfg.pid_file), + "write_text", + side_effect=OSError("permission denied"), + ), pytest.raises(OSError, match="permission denied"): + await daemon.start() + + proc.terminate.assert_called_once() + assert daemon.state == BrokerState.STOPPED From 27fed38343d1d4208d9f902019f22f494d6b44f3 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 19 Feb 2026 19:30:40 +0000 Subject: [PATCH 11/15] Archive task FU-P13-T13: Make broker startup transactional when transport bind/start fails (PASS) https://claude.ai/code/session_01GMFnmxsDeKCTWf9Zk7S9Hg --- ...sactional_when_transport_bind_start_fails.md | 0 .../FU-P13-T13_Validation_Report.md | 0 SPECS/ARCHIVE/INDEX.md | 4 +++- SPECS/INPROGRESS/next.md | 17 ++++++----------- SPECS/Workplan.md | 10 +++++----- 5 files changed, 14 insertions(+), 17 deletions(-) rename SPECS/{INPROGRESS => ARCHIVE/FU-P13-T13_Make_broker_startup_transactional_when_transport_bind_start_fails}/FU-P13-T13_Make_broker_startup_transactional_when_transport_bind_start_fails.md (100%) rename SPECS/{INPROGRESS => ARCHIVE/FU-P13-T13_Make_broker_startup_transactional_when_transport_bind_start_fails}/FU-P13-T13_Validation_Report.md (100%) diff --git a/SPECS/INPROGRESS/FU-P13-T13_Make_broker_startup_transactional_when_transport_bind_start_fails.md b/SPECS/ARCHIVE/FU-P13-T13_Make_broker_startup_transactional_when_transport_bind_start_fails/FU-P13-T13_Make_broker_startup_transactional_when_transport_bind_start_fails.md similarity index 100% rename from SPECS/INPROGRESS/FU-P13-T13_Make_broker_startup_transactional_when_transport_bind_start_fails.md rename to SPECS/ARCHIVE/FU-P13-T13_Make_broker_startup_transactional_when_transport_bind_start_fails/FU-P13-T13_Make_broker_startup_transactional_when_transport_bind_start_fails.md diff --git a/SPECS/INPROGRESS/FU-P13-T13_Validation_Report.md b/SPECS/ARCHIVE/FU-P13-T13_Make_broker_startup_transactional_when_transport_bind_start_fails/FU-P13-T13_Validation_Report.md similarity index 100% rename from SPECS/INPROGRESS/FU-P13-T13_Validation_Report.md rename to SPECS/ARCHIVE/FU-P13-T13_Make_broker_startup_transactional_when_transport_bind_start_fails/FU-P13-T13_Validation_Report.md diff --git a/SPECS/ARCHIVE/INDEX.md b/SPECS/ARCHIVE/INDEX.md index e1390ad9..6e088903 100644 --- a/SPECS/ARCHIVE/INDEX.md +++ b/SPECS/ARCHIVE/INDEX.md @@ -1,6 +1,6 @@ # mcpbridge-wrapper Tasks Archive -**Last Updated:** 2026-02-19 (FU-P13-T12_Enforce_local_Unix-socket_security_boundary_for_broker_clients) +**Last Updated:** 2026-02-19 (FU-P13-T13_Make_broker_startup_transactional_when_transport_bind_start_fails) ## Archived Tasks @@ -126,6 +126,7 @@ | FU-P13-T10 | [FU-P13-T10_broker_daemon_entrypoint/](FU-P13-T10_broker_daemon_entrypoint/) | 2026-02-19 | PASS | | FU-P13-T11 | [FU-P13-T11_Preserve_JSON-RPC_numeric_request_ID_fidelity_in_broker_transport/](FU-P13-T11_Preserve_JSON-RPC_numeric_request_ID_fidelity_in_broker_transport/) | 2026-02-19 | PASS | | FU-P13-T12 | [FU-P13-T12_Enforce_local_Unix-socket_security_boundary_for_broker_clients/](FU-P13-T12_Enforce_local_Unix-socket_security_boundary_for_broker_clients/) | 2026-02-19 | PASS | +| FU-P13-T13 | [FU-P13-T13_Make_broker_startup_transactional_when_transport_bind_start_fails/](FU-P13-T13_Make_broker_startup_transactional_when_transport_bind_start_fails/) | 2026-02-19 | PASS | ## Historical Artifacts @@ -388,3 +389,4 @@ | 2026-02-19 | FU-P12-T1-3 | Archived REVIEW_FU-P12-T1-3_multi_client_widgets_v2 report (post-fix review) | | 2026-02-19 | FU-P13-T12 | Archived Enforce_local_Unix-socket_security_boundary_for_broker_clients (PASS) | | 2026-02-19 | FU-P13-T12 | Archived REVIEW_FU-P13-T12_unix_socket_security report | +| 2026-02-19 | FU-P13-T13 | Archived Make_broker_startup_transactional_when_transport_bind_start_fails (PASS) | diff --git a/SPECS/INPROGRESS/next.md b/SPECS/INPROGRESS/next.md index 17829318..91c27d7a 100644 --- a/SPECS/INPROGRESS/next.md +++ b/SPECS/INPROGRESS/next.md @@ -1,15 +1,10 @@ -# Next Task: FU-P13-T13 — Make broker startup transactional when transport bind/start fails +# No Active Task -**Priority:** P1 -**Phase:** Phase 13 — Persistent Broker & Shared Xcode Session -**Effort:** 2–3h -**Dependencies:** P13-T2 (✅), P13-T3 (✅) -**Status:** Selected +## Recently Archived -## Description +- **FU-P13-T13** — Make broker startup transactional when transport bind/start fails (2026-02-19, PASS) +- **FU-P13-T12** — Enforce local Unix-socket security boundary for broker clients (2026-02-19, PASS) -Harden `BrokerDaemon.start()` so that partial startup failures (e.g. socket bind errors after the upstream process has already been launched) perform a full rollback: terminate the upstream subprocess, wait for it to exit, remove PID and socket files, and return the broker to a safe STOPPED state without orphaned processes or stale files. +## Suggested Next Tasks -## Next Step - -Run the PLAN command to generate the implementation-ready PRD. +- **FU-P13-T14** — Complete interactive Xcode prompt verification and close P13-T5 (P1) diff --git a/SPECS/Workplan.md b/SPECS/Workplan.md index 5b0baca9..882edc8f 100644 --- a/SPECS/Workplan.md +++ b/SPECS/Workplan.md @@ -2242,7 +2242,7 @@ Phase 9 Follow-up Backlog --- -#### 🔄 FU-P13-T13: Make broker startup transactional when transport bind/start fails **INPROGRESS** +#### ✅ FU-P13-T13: Make broker startup transactional when transport bind/start fails — Completed (2026-02-19) - **Description:** Harden `BrokerDaemon.start()` so partial startup failures (for example socket bind errors after upstream launch) perform full rollback, leaving no orphaned upstream process or stale PID/socket files. - **Priority:** P1 - **Dependencies:** P13-T2, P13-T3 @@ -2252,10 +2252,10 @@ Phase 9 Follow-up Backlog - Regression tests for transport-start failure after upstream launch - Troubleshooting note for deterministic failure behavior - **Acceptance Criteria:** - - [ ] If transport startup fails, upstream subprocess is terminated and waited - - [ ] PID/socket files are cleaned up on startup failure - - [ ] Broker state returns to a safe non-ready state after rollback - - [ ] Unit tests cover rollback behavior and prevent regression + - [x] If transport startup fails, upstream subprocess is terminated and waited + - [x] PID/socket files are cleaned up on startup failure + - [x] Broker state returns to a safe non-ready state after rollback + - [x] Unit tests cover rollback behavior and prevent regression --- From 679c7890f4844f2ab3f89a067f7fb405348be453 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 19 Feb 2026 19:31:53 +0000 Subject: [PATCH 12/15] Review FU-P13-T13: transactional broker startup rollback Verdict: Approve Low: _stopped_event/_stop_event not set after rollback (non-reachable paths) Low: upstream stdin not closed in _rollback_startup (harmless in context) Nit: rollback log message lacks exception context https://claude.ai/code/session_01GMFnmxsDeKCTWf9Zk7S9Hg --- ...REVIEW_FU-P13-T13_transactional_startup.md | 103 ++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100644 SPECS/INPROGRESS/REVIEW_FU-P13-T13_transactional_startup.md diff --git a/SPECS/INPROGRESS/REVIEW_FU-P13-T13_transactional_startup.md b/SPECS/INPROGRESS/REVIEW_FU-P13-T13_transactional_startup.md new file mode 100644 index 00000000..288fc6f1 --- /dev/null +++ b/SPECS/INPROGRESS/REVIEW_FU-P13-T13_transactional_startup.md @@ -0,0 +1,103 @@ +# Review: FU-P13-T13 — Make broker startup transactional when transport bind/start fails + +**Date:** 2026-02-19 +**Verdict:** Approve + +--- + +## Summary + +The implementation correctly wraps all post-`_launch_upstream()` steps in a `try/except` +block that calls `_rollback_startup()` on any exception and re-raises, making `start()` +fully transactional. `self._state = BrokerState.READY` is now only reached when all steps +succeed. The `_rollback_startup()` method is idempotent and safe to call even when the +upstream was never launched. Six new unit tests cover all rollback scenarios and all pass. + +--- + +## Issues Found + +### Low: `_stopped_event` not set after rollback + +**Location:** `daemon.py:_rollback_startup()` + +`_rollback_startup()` leaves `_stopped_event` unset. If a caller somehow awaits +`_stopped_event` after a failed `start()`, it would block indefinitely. + +In practice this is not reachable: `run_forever()` calls `await self.start()` and +propagates the exception before ever waiting on `_stopped_event`. `stop()` short-circuits +on `state == STOPPED`. So this is not a bug in any exercised code path. + +**Recommendation:** Add `self._stopped_event.set()` in `_rollback_startup()` to make the +method consistent with `stop()` and guard against future callers. + +--- + +### Low: `_stop_event` remains cleared after rollback + +**Location:** `daemon.py:start()` → `daemon.py:_rollback_startup()` + +`start()` calls `self._stop_event.clear()` before starting the read task. After a rollback, +`_stop_event` is left cleared (not set). This means `run_forever()` would block on +`await self._stop_event.wait()` if it somehow reached that line after a failed start — +which it cannot, since the exception propagates first. + +Same non-issue as above, but worth noting for completeness. + +**Recommendation:** Add `self._stop_event.set()` in `_rollback_startup()` for defensive +completeness. It costs nothing and makes the event states consistent with STOPPED. + +--- + +### Nit: rollback log message doesn't include exception context + +**Location:** `daemon.py:_rollback_startup()` line ~237 + +```python +logger.warning("Rolling back failed broker startup.") +``` + +The warning doesn't include the cause. A caller catching the re-raised exception can log it, +but having the cause in the rollback log would make log triage easier when debugging broker +startup failures in production. + +**Recommendation:** Pass `exc_info=True` or the exception string if called from an +`except` block — but since `_rollback_startup` doesn't receive the exception, this would +require restructuring. Low priority; leave as-is or document in the method docstring. + +--- + +### Nit: inconsistency between `stop()` and `_rollback_startup()` upstream termination + +**Location:** `daemon.py:stop()` vs `daemon.py:_rollback_startup()` + +`stop()` closes `upstream.stdin` before waiting for the subprocess. `_rollback_startup()` +does not. In a rollback scenario, the upstream process hasn't yet received any MCP requests +(transport hasn't started), so this difference is harmless in practice. Still, a uniform +termination sequence would be more maintainable. + +**Recommendation:** Low priority; acceptable as-is given the different contexts. + +--- + +## Positive Observations + +- The `try/except` boundary is correctly drawn — only post-`_launch_upstream()` steps are + inside, since `_launch_upstream()` failure itself requires no rollback. +- The re-raise (`raise`) without argument preserves the original exception and traceback. +- `_rollback_startup()` sets `self._upstream = None` after termination, ensuring + subsequent calls are safe. +- `_cleanup_files()` uses `missing_ok=True` internally, so calling it when no files exist + is safe. +- Tests use `patch.object(type(cfg.pid_file), "write_text", ...)` to patch PosixPath's + class-level method, correctly working around the read-only instance attribute constraint. +- All 6 new tests pass; the full suite (559 tests) remains green. + +--- + +## Follow-up Tasks + +None required for correctness. Optional defensive improvements: + +- **FU-P13-T13-FU-1** (Low): Set `_stopped_event` and `_stop_event` in `_rollback_startup()` + for defensive consistency with the STOPPED state contract. From bb452ff17ce7b8ae2fe46ea3bfea0e74443fe8ae Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 19 Feb 2026 19:32:19 +0000 Subject: [PATCH 13/15] Follow-up FU-P13-T13: add FU-P13-T13-FU-1 for defensive _rollback_startup event consistency Low-priority follow-up: set _stopped_event and _stop_event in _rollback_startup() to make STOPPED state contract complete for future callers. https://claude.ai/code/session_01GMFnmxsDeKCTWf9Zk7S9Hg --- SPECS/Workplan.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/SPECS/Workplan.md b/SPECS/Workplan.md index 882edc8f..6e04ebe0 100644 --- a/SPECS/Workplan.md +++ b/SPECS/Workplan.md @@ -2259,6 +2259,21 @@ Phase 9 Follow-up Backlog --- +#### ⬜️ FU-P13-T13-FU-1: Set _stopped_event and _stop_event in _rollback_startup for defensive consistency +- **Description:** After `_rollback_startup()` sets state to STOPPED, also call `self._stopped_event.set()` and `self._stop_event.set()` so all event states are consistent with the STOPPED contract. These paths are currently unreachable by callers but the defensive fix ensures correctness if future callers are added. +- **Priority:** P3 (Low — optional defensibility improvement) +- **Dependencies:** FU-P13-T13 (✅) +- **Parallelizable:** yes +- **Outputs/Artifacts:** + - Updated `src/mcpbridge_wrapper/broker/daemon.py` `_rollback_startup()` method + - Updated tests asserting event state after rollback +- **Acceptance Criteria:** + - [ ] `_stopped_event.set()` called in `_rollback_startup()` + - [ ] `_stop_event.set()` called in `_rollback_startup()` + - [ ] Tests verify event states are set after a failed startup + +--- + #### ⬜️ FU-P13-T14: Complete interactive Xcode prompt verification and close P13-T5 - **Description:** Execute and document the remaining human-run interactive validation for Xcode permission prompts in direct mode vs broker mode, then update P13-T5 verdict and linked acceptance states. - **Priority:** P1 From d1ac49f84c789c161625f0853066bf9f441b66c8 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 19 Feb 2026 19:32:56 +0000 Subject: [PATCH 14/15] Archive REVIEW_FU-P13-T13_transactional_startup report https://claude.ai/code/session_01GMFnmxsDeKCTWf9Zk7S9Hg --- SPECS/ARCHIVE/INDEX.md | 2 ++ .../_Historical}/REVIEW_FU-P13-T13_transactional_startup.md | 0 2 files changed, 2 insertions(+) rename SPECS/{INPROGRESS => ARCHIVE/_Historical}/REVIEW_FU-P13-T13_transactional_startup.md (100%) diff --git a/SPECS/ARCHIVE/INDEX.md b/SPECS/ARCHIVE/INDEX.md index 6e088903..6580c2cf 100644 --- a/SPECS/ARCHIVE/INDEX.md +++ b/SPECS/ARCHIVE/INDEX.md @@ -218,6 +218,7 @@ | [REVIEW_FU-P13-T11_integer_id_fidelity.md](_Historical/REVIEW_FU-P13-T11_integer_id_fidelity.md) | Review report for FU-P13-T11 | | [REVIEW_FU-P12-T1-3_multi_client_widgets_v2.md](_Historical/REVIEW_FU-P12-T1-3_multi_client_widgets_v2.md) | Second review report for FU-P12-T1-3 (post-fix verification) | | [REVIEW_FU-P13-T12_unix_socket_security.md](_Historical/REVIEW_FU-P13-T12_unix_socket_security.md) | Review report for FU-P13-T12 | +| [REVIEW_FU-P13-T13_transactional_startup.md](_Historical/REVIEW_FU-P13-T13_transactional_startup.md) | Review report for FU-P13-T13 | ## Archive Log @@ -390,3 +391,4 @@ | 2026-02-19 | FU-P13-T12 | Archived Enforce_local_Unix-socket_security_boundary_for_broker_clients (PASS) | | 2026-02-19 | FU-P13-T12 | Archived REVIEW_FU-P13-T12_unix_socket_security report | | 2026-02-19 | FU-P13-T13 | Archived Make_broker_startup_transactional_when_transport_bind_start_fails (PASS) | +| 2026-02-19 | FU-P13-T13 | Archived REVIEW_FU-P13-T13_transactional_startup report | diff --git a/SPECS/INPROGRESS/REVIEW_FU-P13-T13_transactional_startup.md b/SPECS/ARCHIVE/_Historical/REVIEW_FU-P13-T13_transactional_startup.md similarity index 100% rename from SPECS/INPROGRESS/REVIEW_FU-P13-T13_transactional_startup.md rename to SPECS/ARCHIVE/_Historical/REVIEW_FU-P13-T13_transactional_startup.md From f0871952cff5b05e5ebe32ab4305e52678bf15b4 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 19 Feb 2026 19:37:48 +0000 Subject: [PATCH 15/15] fix: sync DocC Troubleshooting.md and reformat test_broker_daemon - Add broker bind-error section to DocC Troubleshooting.md (mirrors docs/troubleshooting.md) - Reformat tests/unit/test_broker_daemon.py (ruff format) https://claude.ai/code/session_01GMFnmxsDeKCTWf9Zk7S9Hg --- .../Documentation.docc/Troubleshooting.md | 29 +++++++++++++++++++ tests/unit/test_broker_daemon.py | 24 ++++----------- 2 files changed, 35 insertions(+), 18 deletions(-) diff --git a/Sources/XcodeMCPWrapper/Documentation.docc/Troubleshooting.md b/Sources/XcodeMCPWrapper/Documentation.docc/Troubleshooting.md index ec074e71..27700785 100644 --- a/Sources/XcodeMCPWrapper/Documentation.docc/Troubleshooting.md +++ b/Sources/XcodeMCPWrapper/Documentation.docc/Troubleshooting.md @@ -323,6 +323,35 @@ PID_FILE="$HOME/.mcpbridge_wrapper/broker.pid"; if [ -f "$PID_FILE" ]; then kill PID_FILE="$HOME/.mcpbridge_wrapper/broker.pid"; SOCK="$HOME/.mcpbridge_wrapper/broker.sock"; if [ -f "$PID_FILE" ] && ! kill -0 "$(cat "$PID_FILE")" 2>/dev/null; then rm -f "$PID_FILE"; fi; if [ -S "$SOCK" ]; then rm -f "$SOCK"; fi ``` +## Error: "Address already in use" or socket bind error at broker startup + +**Symptom:** Starting the broker daemon fails with an error like: + +``` +OSError: [Errno 98] Address already in use +``` + +**Cause:** The socket path is already bound by another process, or a stale socket +file was not cleaned up from a previous crash. + +**Behaviour since v0.3.x:** When the broker transport fails to bind during startup, +the daemon automatically performs a full rollback — terminating the upstream +subprocess and removing PID/socket files — before re-raising the error. No orphaned +processes or stale files are left behind. + +**Resolution:** + +1. Check for an existing broker: `cat ~/.mcpbridge_wrapper/broker.pid` and + verify whether that PID is still alive. +2. If no live broker exists, remove leftover files and retry: + +```bash +rm -f ~/.mcpbridge_wrapper/broker.pid ~/.mcpbridge_wrapper/broker.sock +``` + +3. If the error persists, check that another application is not using the same + socket path. + ## Rollback to direct mode 1. Remove `--broker-connect` / `--broker-spawn` from MCP config args. diff --git a/tests/unit/test_broker_daemon.py b/tests/unit/test_broker_daemon.py index 63457b35..6f783f7e 100644 --- a/tests/unit/test_broker_daemon.py +++ b/tests/unit/test_broker_daemon.py @@ -810,9 +810,7 @@ class TestStartupRollback: """BrokerDaemon.start() must roll back cleanly if any post-launch step fails.""" @pytest.mark.asyncio - async def test_transport_start_failure_terminates_upstream( - self, tmp_path: Path - ) -> None: + async def test_transport_start_failure_terminates_upstream(self, tmp_path: Path) -> None: """If transport.start() raises, the upstream subprocess is terminated.""" cfg = _make_config(tmp_path) transport = MagicMock() @@ -833,9 +831,7 @@ async def test_transport_start_failure_terminates_upstream( proc.wait.assert_awaited() @pytest.mark.asyncio - async def test_transport_start_failure_removes_pid_file( - self, tmp_path: Path - ) -> None: + async def test_transport_start_failure_removes_pid_file(self, tmp_path: Path) -> None: """If transport.start() raises, the PID file is removed.""" cfg = _make_config(tmp_path) transport = MagicMock() @@ -855,9 +851,7 @@ async def test_transport_start_failure_removes_pid_file( assert not cfg.pid_file.exists(), "PID file should be removed on rollback" @pytest.mark.asyncio - async def test_transport_start_failure_removes_socket_file( - self, tmp_path: Path - ) -> None: + async def test_transport_start_failure_removes_socket_file(self, tmp_path: Path) -> None: """If transport.start() raises, any existing socket file is removed.""" cfg = _make_config(tmp_path) # Pre-create a socket file to simulate partial creation @@ -881,9 +875,7 @@ async def test_transport_start_failure_removes_socket_file( assert not cfg.socket_path.exists(), "Socket file should be removed on rollback" @pytest.mark.asyncio - async def test_transport_start_failure_state_is_stopped( - self, tmp_path: Path - ) -> None: + async def test_transport_start_failure_state_is_stopped(self, tmp_path: Path) -> None: """Broker state is STOPPED after a rollback, not READY or INIT.""" cfg = _make_config(tmp_path) transport = MagicMock() @@ -903,9 +895,7 @@ async def test_transport_start_failure_state_is_stopped( assert daemon.state == BrokerState.STOPPED @pytest.mark.asyncio - async def test_transport_start_failure_reraises_exception( - self, tmp_path: Path - ) -> None: + async def test_transport_start_failure_reraises_exception(self, tmp_path: Path) -> None: """The original exception from transport.start() propagates to the caller.""" cfg = _make_config(tmp_path) transport = MagicMock() @@ -924,9 +914,7 @@ async def test_transport_start_failure_reraises_exception( await daemon.start() @pytest.mark.asyncio - async def test_pid_write_failure_terminates_upstream( - self, tmp_path: Path - ) -> None: + async def test_pid_write_failure_terminates_upstream(self, tmp_path: Path) -> None: """If the PID file write fails, the upstream subprocess is still terminated.""" cfg = _make_config(tmp_path) daemon = BrokerDaemon(cfg)