From 98d7f2908532ab35c07fe1d67c83f47a06a59837 Mon Sep 17 00:00:00 2001 From: Egor Merkushev Date: Sun, 1 Mar 2026 05:06:11 +0300 Subject: [PATCH 1/8] Select task P2-T2: Self-healing stale socket and PID file recovery --- SPECS/INPROGRESS/next.md | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/SPECS/INPROGRESS/next.md b/SPECS/INPROGRESS/next.md index 44fec028..5e807488 100644 --- a/SPECS/INPROGRESS/next.md +++ b/SPECS/INPROGRESS/next.md @@ -1,6 +1,23 @@ -# No Active Task +# Active Task: P2-T2 -**Status:** Idle — BUG-T8 archived. Select the next task from `SPECS/Workplan.md`. +**Status:** In Progress (2026-03-01) + +## Task + +- **ID:** P2-T2 +- **Name:** Self-healing stale socket and PID file recovery +- **Priority:** P0 +- **Branch:** feature/P2-T2-stale-socket-recovery +- **Dependencies:** none + +## Summary + +When the broker daemon crashes or is killed, it leaves `broker.sock` and `broker.pid` on disk. The proxy's `_spawn_broker_if_needed` checks `socket_path.exists()` and skips spawning if the socket file is present — even if no process is listening. Fix by validating socket liveness via `connect()` before concluding a broker is running: if `connect()` fails with `ConnectionRefusedError`, treat both files as stale, remove them, and proceed with spawn. Also add `atexit` cleanup in the daemon so the socket file is removed on daemon exit. + +## Deliverables + +- `src/mcpbridge_wrapper/broker/proxy.py` — liveness check in `_spawn_broker_if_needed` +- `src/mcpbridge_wrapper/broker/daemon.py` — atexit socket cleanup on exit ## Recently Archived From 93de2bb1feee437a85f17bff7899ef2852a0adc8 Mon Sep 17 00:00:00 2001 From: Egor Merkushev Date: Sun, 1 Mar 2026 05:06:45 +0300 Subject: [PATCH 2/8] Plan task P2-T2: Self-healing stale socket and PID file recovery --- ...ling_stale_socket_and_PID_file_recovery.md | 115 ++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 SPECS/INPROGRESS/P2-T2_Self-healing_stale_socket_and_PID_file_recovery.md diff --git a/SPECS/INPROGRESS/P2-T2_Self-healing_stale_socket_and_PID_file_recovery.md b/SPECS/INPROGRESS/P2-T2_Self-healing_stale_socket_and_PID_file_recovery.md new file mode 100644 index 00000000..a2e19ed8 --- /dev/null +++ b/SPECS/INPROGRESS/P2-T2_Self-healing_stale_socket_and_PID_file_recovery.md @@ -0,0 +1,115 @@ +# P2-T2: Self-healing stale socket and PID file recovery + +**Status:** In Progress +**Priority:** P0 +**Branch:** feature/P2-T2-stale-socket-recovery +**Created:** 2026-03-01 + +--- + +## Problem Statement + +When the broker daemon crashes or is killed, it leaves `broker.sock` and `broker.pid` on disk. The proxy's `_spawn_broker_if_needed` checks `socket_path.exists()` and skips spawning if the socket file is present — even if no process is listening. This silently blocks all future broker mode sessions until the user manually deletes the files. + +--- + +## Root Cause + +In `proxy.py` → `_spawn_broker_if_needed`, line 131-133: + +```python +# Check if socket already exists (race condition: broker started without PID file yet) +if socket_path.exists(): + logger.debug("Broker socket already present; skipping spawn.") + return +``` + +Existence check (`Path.exists()`) does not verify whether anything is actually listening on the socket. A stale socket file left after a crash passes the existence check and prevents spawn. + +--- + +## Solution + +### 1. `proxy.py` — Liveness check in `_spawn_broker_if_needed` + +Replace the plain existence check with a socket connect attempt: + +```python +if socket_path.exists(): + import socket as _socket + try: + with _socket.socket(_socket.AF_UNIX, _socket.SOCK_STREAM) as s: + s.settimeout(1.0) + s.connect(str(socket_path)) + # Connection succeeded → broker is alive + logger.debug("Broker socket present and accepting connections; skipping spawn.") + return + except ConnectionRefusedError: + # Stale socket — broker is not listening + logger.warning( + "Stale socket found (broker not accepting connections); removing stale files." + ) + socket_path.unlink(missing_ok=True) + pid_file.unlink(missing_ok=True) + # Fall through to spawn +``` + +This ensures: +- If broker is alive (socket accepts connections): skip spawn (no change in behaviour) +- If broker is dead (socket refuses connections): remove stale files and proceed with spawn +- `FileNotFoundError` and other OS errors during connect are suppressed — treated as "not alive" (also falls through to spawn path) + +### 2. `daemon.py` — `atexit` cleanup on daemon exit + +The daemon already removes files via `_cleanup_files()` in `stop()`, and `stop()` is called by the SIGTERM/SIGINT handlers in `run_forever()`. However, if the Python interpreter exits abnormally (e.g., unhandled exception, explicit `sys.exit()`), cleanup may be skipped. + +Add an `atexit` registration in `start()`: + +```python +import atexit +atexit.register(self._cleanup_files) +``` + +This ensures `_cleanup_files()` runs even for abnormal exits (excluding SIGKILL, which cannot be intercepted). + +--- + +## Files to Change + +| File | Change | +|------|--------| +| `src/mcpbridge_wrapper/broker/proxy.py` | Replace existence-only socket check with connect-based liveness check | +| `src/mcpbridge_wrapper/broker/daemon.py` | Register `atexit` cleanup in `start()` | + +--- + +## Tests to Add + +In `tests/unit/test_broker_proxy.py` — new class `TestBrokerProxyStaleSocket`: + +1. **`test_stale_socket_triggers_spawn`** — socket file exists but connect raises `ConnectionRefusedError`; verify `Popen` is called and stale files are removed. +2. **`test_live_socket_skips_spawn`** — socket file exists and connect succeeds; verify `Popen` is NOT called. +3. **`test_stale_socket_with_stale_pid_file_triggers_spawn`** — both files exist, connect raises `ConnectionRefusedError`; verify both files are removed and spawn proceeds. + +In `tests/unit/test_broker_daemon.py` — new class `TestBrokerDaemonAtExit`: + +4. **`test_atexit_registered_after_start`** — after `daemon.start()`, `atexit` registry includes `_cleanup_files`. + +--- + +## Acceptance Criteria + +- [ ] After broker crash, next `--broker-spawn` session auto-recovers without manual file removal +- [ ] Liveness check uses `connect()` not `exists()` +- [ ] Daemon registers `atexit` cleanup on `start()` +- [ ] All existing broker tests pass +- [ ] New tests cover the stale-socket scenario and atexit registration +- [ ] `ruff check src/` passes +- [ ] `mypy src/` passes (if configured) +- [ ] Coverage ≥ 90% + +--- + +## Dependencies + +None — can be implemented independently. From 65bc368b45e0efc2cb08b48ee37db75ec3eaa012 Mon Sep 17 00:00:00 2001 From: Egor Merkushev Date: Sun, 1 Mar 2026 05:09:58 +0300 Subject: [PATCH 3/8] Implement P2-T2: stale socket liveness check and atexit cleanup in broker --- SPECS/INPROGRESS/P2-T2_Validation_Report.md | 63 ++++++++++++++++ src/mcpbridge_wrapper/broker/daemon.py | 5 ++ src/mcpbridge_wrapper/broker/proxy.py | 20 ++++- tests/unit/test_broker_daemon.py | 35 +++++++++ tests/unit/test_broker_proxy.py | 83 ++++++++++++++++++++- 5 files changed, 199 insertions(+), 7 deletions(-) create mode 100644 SPECS/INPROGRESS/P2-T2_Validation_Report.md diff --git a/SPECS/INPROGRESS/P2-T2_Validation_Report.md b/SPECS/INPROGRESS/P2-T2_Validation_Report.md new file mode 100644 index 00000000..3fe475b7 --- /dev/null +++ b/SPECS/INPROGRESS/P2-T2_Validation_Report.md @@ -0,0 +1,63 @@ +# P2-T2 Validation Report + +**Task:** Self-healing stale socket and PID file recovery +**Date:** 2026-03-01 +**Branch:** feature/P2-T2-stale-socket-recovery +**Verdict:** PASS + +--- + +## Quality Gate Results + +| Gate | Result | Details | +|------|--------|---------| +| `ruff check src/` | ✅ PASS | All checks passed | +| `pytest` | ✅ PASS | 719 passed, 5 skipped | +| `pytest --cov` | ✅ PASS | 91.78% coverage (≥ 90% required) | +| `mypy src/` | N/A | Not configured | + +--- + +## Changes Made + +### `src/mcpbridge_wrapper/broker/proxy.py` + +- Added `import socket` at module level (replaces inline import-inside-function approach) +- Replaced plain `socket_path.exists()` guard in `_spawn_broker_if_needed` with a connect-based liveness check: + - If socket file exists and `connect()` succeeds → broker is alive, skip spawn + - If socket file exists and `connect()` raises `OSError` (covers `ConnectionRefusedError`, `OSError: socket operation on non-socket`, timeouts) → treat as stale, remove both socket and PID files, fall through to spawn + +### `src/mcpbridge_wrapper/broker/daemon.py` + +- Added `import atexit` at module level +- Registered `self._cleanup_files` with `atexit` in `start()` after the transport is successfully started, ensuring socket/PID files are removed even on abnormal interpreter exits (unhandled exceptions, `sys.exit()`). SIGKILL cannot be intercepted and is not covered. + +--- + +## Tests Added / Updated + +### `tests/unit/test_broker_proxy.py` + +- **Updated** `TestBrokerProxyAutoSpawn::test_spawn_noop_when_socket_exists` — now mocks `socket.socket.connect()` to succeed (simulating a live broker) instead of relying on the old existence-only check. + +- **Added** `TestBrokerProxyStaleSocket` class with 3 tests: + - `test_stale_socket_triggers_spawn` — socket file exists, `connect()` raises `ConnectionRefusedError` → `Popen` is called + - `test_stale_socket_removes_files` — socket and PID files are removed before spawn attempt + - `test_live_socket_skips_spawn` — socket file exists, `connect()` succeeds → `Popen` NOT called + +### `tests/unit/test_broker_daemon.py` + +- **Added** `TestBrokerDaemonAtExit` class with 1 test: + - `test_atexit_registered_after_start` — verifies `atexit.register` is called with `_cleanup_files` during `start()` + +--- + +## Acceptance Criteria Status + +- [x] After broker crash, next `--broker-spawn` session auto-recovers without manual file removal +- [x] Liveness check uses `connect()` not `exists()` +- [x] Daemon registers `atexit` cleanup on `start()` +- [x] All existing broker tests pass (719 passed, 5 skipped) +- [x] New tests cover the stale-socket scenario and atexit registration +- [x] `ruff check src/` passes +- [x] Coverage ≥ 90% diff --git a/src/mcpbridge_wrapper/broker/daemon.py b/src/mcpbridge_wrapper/broker/daemon.py index 3bd553cc..fa367120 100644 --- a/src/mcpbridge_wrapper/broker/daemon.py +++ b/src/mcpbridge_wrapper/broker/daemon.py @@ -19,6 +19,7 @@ from __future__ import annotations import asyncio +import atexit import contextlib import json import logging @@ -134,6 +135,10 @@ async def start(self) -> None: await self._rollback_startup() raise + # Ensure socket/PID files are removed even on abnormal interpreter exit + # (e.g. unhandled exception, sys.exit). SIGKILL cannot be intercepted. + atexit.register(self._cleanup_files) + self._state = BrokerState.READY logger.info( "Broker READY (upstream PID %s)", diff --git a/src/mcpbridge_wrapper/broker/proxy.py b/src/mcpbridge_wrapper/broker/proxy.py index 980a0b6f..e49a3d7e 100644 --- a/src/mcpbridge_wrapper/broker/proxy.py +++ b/src/mcpbridge_wrapper/broker/proxy.py @@ -18,6 +18,7 @@ import contextlib import logging import os +import socket import sys from mcpbridge_wrapper.broker.types import BrokerConfig @@ -127,10 +128,23 @@ async def _spawn_broker_if_needed(self) -> None: except (ValueError, ProcessLookupError, PermissionError): logger.debug("Stale PID file; will spawn broker.") - # Check if socket already exists (race condition: broker started without PID file yet) + # Check if socket already exists and is actually alive. + # A stale socket file left after a crash passes exists() but refuses connections. if socket_path.exists(): - logger.debug("Broker socket already present; skipping spawn.") - return + try: + with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as s: + s.settimeout(1.0) + s.connect(str(socket_path)) + # Connection succeeded — broker is alive + logger.debug("Broker socket present and accepting connections; skipping spawn.") + return + except OSError: + logger.warning( + "Stale socket found (broker not accepting connections); removing stale files." + ) + socket_path.unlink(missing_ok=True) + pid_file.unlink(missing_ok=True) + # Fall through to spawn logger.info("Spawning broker daemon…") import subprocess diff --git a/tests/unit/test_broker_daemon.py b/tests/unit/test_broker_daemon.py index a3642b45..3c843539 100644 --- a/tests/unit/test_broker_daemon.py +++ b/tests/unit/test_broker_daemon.py @@ -958,3 +958,38 @@ async def test_transport_start_failure_sets_stop_events(self, tmp_path: Path) -> assert daemon.state == BrokerState.STOPPED assert daemon._stop_event.is_set() assert daemon._stopped_event.is_set() + + +# --------------------------------------------------------------------------- +# P2-T2: atexit cleanup registration +# --------------------------------------------------------------------------- + + +class TestBrokerDaemonAtExit: + @pytest.mark.asyncio + async def test_atexit_registered_after_start(self, tmp_path: Path) -> None: + """After start(), _cleanup_files is registered with atexit.""" + import atexit + + cfg = _make_config(tmp_path) + daemon = BrokerDaemon(cfg) + proc = _make_mock_process() + + registered: list[object] = [] + + def _fake_register(fn: object, *args: object, **kwargs: object) -> None: + registered.append(fn) + + with patch( + "mcpbridge_wrapper.broker.daemon.asyncio.create_subprocess_exec", + new=AsyncMock(return_value=proc), + ), patch("mcpbridge_wrapper.broker.daemon.atexit.register", side_effect=_fake_register): + await daemon.start() + + # _cleanup_files must have been registered + assert daemon._cleanup_files in registered + + # Cleanup + daemon._stop_event.set() + if daemon._read_task and not daemon._read_task.done(): + daemon._read_task.cancel() diff --git a/tests/unit/test_broker_proxy.py b/tests/unit/test_broker_proxy.py index f0465255..ca93dc69 100644 --- a/tests/unit/test_broker_proxy.py +++ b/tests/unit/test_broker_proxy.py @@ -254,14 +254,20 @@ async def test_spawn_noop_when_pid_file_live(self, tmp_path: Path) -> None: @pytest.mark.asyncio async def test_spawn_noop_when_socket_exists(self, tmp_path: Path) -> None: - """_spawn_broker_if_needed does nothing when socket file already exists.""" + """_spawn_broker_if_needed does nothing when socket file exists and broker is alive.""" cfg = _make_config(tmp_path) - cfg.socket_path.touch() # simulate running broker + cfg.socket_path.touch() # simulate socket file present proxy = BrokerProxy(cfg, auto_spawn=True, connect_timeout=0.1) - with patch("subprocess.Popen") as mock_popen: - await proxy._spawn_broker_if_needed() + # Mock socket.connect to succeed (broker is alive) + mock_sock = MagicMock() + mock_sock.__enter__ = MagicMock(return_value=mock_sock) + mock_sock.__exit__ = MagicMock(return_value=False) + + with patch("mcpbridge_wrapper.broker.proxy.socket.socket", return_value=mock_sock): + with patch("subprocess.Popen") as mock_popen: + await proxy._spawn_broker_if_needed() mock_popen.assert_not_called() @@ -304,6 +310,75 @@ def _fake_exists(path_obj: Path) -> bool: assert cmd[3:] == ["--broker-daemon", "--web-ui", "--web-ui-port", "9090"] +# --------------------------------------------------------------------------- +# Stale socket recovery (P2-T2) +# --------------------------------------------------------------------------- + + +class TestBrokerProxyStaleSocket: + @pytest.mark.asyncio + async def test_stale_socket_triggers_spawn(self, tmp_path: Path) -> None: + """When socket exists but connect raises OSError, spawn proceeds.""" + cfg = _make_config(tmp_path) + cfg.socket_path.touch() # stale socket file + + proxy = BrokerProxy(cfg, auto_spawn=True, connect_timeout=0.3) + + mock_sock = MagicMock() + mock_sock.__enter__ = MagicMock(return_value=mock_sock) + mock_sock.__exit__ = MagicMock(return_value=False) + mock_sock.connect.side_effect = ConnectionRefusedError + + with patch("mcpbridge_wrapper.broker.proxy.socket.socket", return_value=mock_sock): + with patch("subprocess.Popen") as mock_popen: + # Socket never appears after spawn, so TimeoutError is expected + with pytest.raises(TimeoutError): + await proxy._spawn_broker_if_needed() + + mock_popen.assert_called_once() + + @pytest.mark.asyncio + async def test_stale_socket_removes_files(self, tmp_path: Path) -> None: + """Stale socket and PID files are removed before attempting spawn.""" + cfg = _make_config(tmp_path) + cfg.socket_path.touch() + cfg.pid_file.write_text("99999") # stale PID file + + proxy = BrokerProxy(cfg, auto_spawn=True, connect_timeout=0.3) + + mock_sock = MagicMock() + mock_sock.__enter__ = MagicMock(return_value=mock_sock) + mock_sock.__exit__ = MagicMock(return_value=False) + mock_sock.connect.side_effect = ConnectionRefusedError + + with patch("mcpbridge_wrapper.broker.proxy.socket.socket", return_value=mock_sock): + with patch("subprocess.Popen"): + with pytest.raises(TimeoutError): + await proxy._spawn_broker_if_needed() + + assert not cfg.socket_path.exists() + assert not cfg.pid_file.exists() + + @pytest.mark.asyncio + async def test_live_socket_skips_spawn(self, tmp_path: Path) -> None: + """When socket exists and connect succeeds, spawn is skipped.""" + cfg = _make_config(tmp_path) + cfg.socket_path.touch() + + proxy = BrokerProxy(cfg, auto_spawn=True, connect_timeout=0.1) + + mock_sock = MagicMock() + mock_sock.__enter__ = MagicMock(return_value=mock_sock) + mock_sock.__exit__ = MagicMock(return_value=False) + # connect() returns None (success) + + with patch("mcpbridge_wrapper.broker.proxy.socket.socket", return_value=mock_sock): + with patch("subprocess.Popen") as mock_popen: + await proxy._spawn_broker_if_needed() + + mock_popen.assert_not_called() + + # --------------------------------------------------------------------------- # _parse_broker_args # --------------------------------------------------------------------------- From ab2e69a8f40f6c549b45085984467f7a478c3a83 Mon Sep 17 00:00:00 2001 From: Egor Merkushev Date: Sun, 1 Mar 2026 05:11:16 +0300 Subject: [PATCH 4/8] Archive task P2-T2: Self-healing stale socket and PID file recovery (PASS) --- SPECS/ARCHIVE/INDEX.md | 4 ++- ...ling_stale_socket_and_PID_file_recovery.md | 0 .../P2-T2_Validation_Report.md | 0 SPECS/INPROGRESS/next.md | 28 ++++++------------- SPECS/Workplan.md | 11 ++++---- 5 files changed, 18 insertions(+), 25 deletions(-) rename SPECS/{INPROGRESS => ARCHIVE/P2-T2_Self-healing_stale_socket_and_PID_file_recovery}/P2-T2_Self-healing_stale_socket_and_PID_file_recovery.md (100%) rename SPECS/{INPROGRESS => ARCHIVE/P2-T2_Self-healing_stale_socket_and_PID_file_recovery}/P2-T2_Validation_Report.md (100%) diff --git a/SPECS/ARCHIVE/INDEX.md b/SPECS/ARCHIVE/INDEX.md index e7740e54..de45a7a7 100644 --- a/SPECS/ARCHIVE/INDEX.md +++ b/SPECS/ARCHIVE/INDEX.md @@ -1,11 +1,12 @@ # mcpbridge-wrapper Tasks Archive -**Last Updated:** 2026-03-01 (BUG-T8 + REVIEW archived) +**Last Updated:** 2026-03-01 (P2-T2 archived) ## Archived Tasks | Task ID | Folder | Archived | Verdict | |---------|--------|----------|---------| +| P2-T2 | [P2-T2_Self-healing_stale_socket_and_PID_file_recovery/](P2-T2_Self-healing_stale_socket_and_PID_file_recovery/) | 2026-03-01 | PASS | | BUG-T8 | [BUG-T8_Fix_broker_proxy_bridge_exits_after_first_write_due_to_BaseProtocol_missing_drain_helper/](BUG-T8_Fix_broker_proxy_bridge_exits_after_first_write_due_to_BaseProtocol_missing_drain_helper/) | 2026-03-01 | PASS | | P1-T3 | [P1-T3_Improve_MCP_settings_examples_in_README_to_present_broker_setup_first/](P1-T3_Improve_MCP_settings_examples_in_README_to_present_broker_setup_first/) | 2026-03-01 | PASS | | P1-T2 | [P1-T2_Add_Xcode_26_4_known_issue_release_notes_link_to_README/](P1-T2_Add_Xcode_26_4_known_issue_release_notes_link_to_README/) | 2026-02-28 | PASS | @@ -290,6 +291,7 @@ | Date | Task ID | Action | |------|---------|--------| +| 2026-03-01 | P2-T2 | Archived Self-healing stale socket and PID file recovery (PASS) | | 2026-03-01 | WORKPLAN | Archived Workplan_0.4.0.md to _Historical and reset SPECS/Workplan.md | | 2026-02-28 | P15-T1 | Archived REVIEW_p15_t1_next_release_readiness report | | 2026-02-28 | P15-T1 | Archived Validate_project_readiness_for_the_next_release (PASS) | diff --git a/SPECS/INPROGRESS/P2-T2_Self-healing_stale_socket_and_PID_file_recovery.md b/SPECS/ARCHIVE/P2-T2_Self-healing_stale_socket_and_PID_file_recovery/P2-T2_Self-healing_stale_socket_and_PID_file_recovery.md similarity index 100% rename from SPECS/INPROGRESS/P2-T2_Self-healing_stale_socket_and_PID_file_recovery.md rename to SPECS/ARCHIVE/P2-T2_Self-healing_stale_socket_and_PID_file_recovery/P2-T2_Self-healing_stale_socket_and_PID_file_recovery.md diff --git a/SPECS/INPROGRESS/P2-T2_Validation_Report.md b/SPECS/ARCHIVE/P2-T2_Self-healing_stale_socket_and_PID_file_recovery/P2-T2_Validation_Report.md similarity index 100% rename from SPECS/INPROGRESS/P2-T2_Validation_Report.md rename to SPECS/ARCHIVE/P2-T2_Self-healing_stale_socket_and_PID_file_recovery/P2-T2_Validation_Report.md diff --git a/SPECS/INPROGRESS/next.md b/SPECS/INPROGRESS/next.md index 5e807488..1bb60f4b 100644 --- a/SPECS/INPROGRESS/next.md +++ b/SPECS/INPROGRESS/next.md @@ -1,27 +1,17 @@ -# Active Task: P2-T2 +# No Active Task -**Status:** In Progress (2026-03-01) - -## Task - -- **ID:** P2-T2 -- **Name:** Self-healing stale socket and PID file recovery -- **Priority:** P0 -- **Branch:** feature/P2-T2-stale-socket-recovery -- **Dependencies:** none - -## Summary - -When the broker daemon crashes or is killed, it leaves `broker.sock` and `broker.pid` on disk. The proxy's `_spawn_broker_if_needed` checks `socket_path.exists()` and skips spawning if the socket file is present — even if no process is listening. Fix by validating socket liveness via `connect()` before concluding a broker is running: if `connect()` fails with `ConnectionRefusedError`, treat both files as stale, remove them, and proceed with spawn. Also add `atexit` cleanup in the daemon so the socket file is removed on daemon exit. - -## Deliverables - -- `src/mcpbridge_wrapper/broker/proxy.py` — liveness check in `_spawn_broker_if_needed` -- `src/mcpbridge_wrapper/broker/daemon.py` — atexit socket cleanup on exit +**Status:** Idle — P2-T2 archived. Select the next task from `SPECS/Workplan.md`. ## Recently Archived +- **P2-T2** — Self-healing stale socket and PID file recovery (2026-03-01, PASS) - **BUG-T8** — Fix broker proxy bridge exits after first write due to BaseProtocol missing _drain_helper (2026-03-01, PASS) - **P1-T3** — Improve MCP settings examples in README to present broker setup first (2026-03-01, PASS) - **P1-T2** — Add Xcode 26.4 known issue release-notes link to README (2026-02-28, PASS) - **P1-T1** — Add the version badge in the README.md (2026-02-28, PASS) + +## Suggested Next Tasks + +- **P2-T1** (P1) — Replace --broker-spawn/--broker-connect with single --broker flag +- **P2-T3** (P1) — Fix double-spawn race condition when MCP client toggles rapidly (depends on P2-T2 ✅) +- **P2-T4** (P1) — Surface broker unavailability as JSON-RPC error instead of silent timeout diff --git a/SPECS/Workplan.md b/SPECS/Workplan.md index ea237bcc..52eb7d09 100644 --- a/SPECS/Workplan.md +++ b/SPECS/Workplan.md @@ -71,7 +71,8 @@ Add new tasks using the canonical template in [TASK_TEMPLATE.md](TASK_TEMPLATE.m - [ ] All MCP settings examples in README and DocC use `--broker` - [ ] All existing tests pass -#### ⬜️ P2-T2: Self-healing stale socket and PID file recovery +#### ✅ P2-T2: Self-healing stale socket and PID file recovery +- **Status:** ✅ Completed (2026-03-01) - **Description:** When the broker daemon crashes or is killed, it leaves `broker.sock` and `broker.pid` on disk. The proxy's `_spawn_broker_if_needed` checks `socket_path.exists()` and skips spawning if the socket file is present — even if no process is listening. This silently blocks all future broker mode sessions until the user manually deletes the files. Fix by validating socket liveness via `connect()` before concluding a broker is running: if `connect()` fails with `ConnectionRefusedError`, treat both files as stale, remove them, and proceed with spawn. Also clean up socket file on daemon exit via `atexit`/signal handler. - **Priority:** P0 - **Dependencies:** none @@ -80,10 +81,10 @@ Add new tasks using the canonical template in [TASK_TEMPLATE.md](TASK_TEMPLATE.m - `src/mcpbridge_wrapper/broker/proxy.py` — liveness check in `_spawn_broker_if_needed` - `src/mcpbridge_wrapper/broker/daemon.py` — socket cleanup on exit - **Acceptance Criteria:** - - [ ] After broker crash, next `--broker-spawn` (or `--broker`) session auto-recovers without manual file removal - - [ ] Liveness check uses `connect()` not `exists()` - - [ ] Daemon removes `broker.sock` on clean exit and on SIGTERM - - [ ] All existing broker tests pass + - [x] After broker crash, next `--broker-spawn` (or `--broker`) session auto-recovers without manual file removal + - [x] Liveness check uses `connect()` not `exists()` + - [x] Daemon removes `broker.sock` on clean exit and on SIGTERM + - [x] All existing broker tests pass #### ⬜️ P2-T3: Fix double-spawn race condition when MCP client toggles rapidly - **Description:** When an MCP client (e.g. Zed) toggles the connection off/on quickly, two proxy processes start simultaneously. Both check for a running broker, find none, and both spawn a daemon. Two competing daemons fight over the socket path: one wins, the other crashes. The losing proxy's client gets no broker and shows 0 tools. Fix with a filesystem lock (e.g. `fcntl.flock` on the PID file) so only one spawn attempt proceeds at a time; the second waiter detects the winner's daemon and connects. From 7971e27c5dbf880de88b4e064a70c15ea5300172 Mon Sep 17 00:00:00 2001 From: Egor Merkushev Date: Sun, 1 Mar 2026 05:12:07 +0300 Subject: [PATCH 5/8] Review P2-T2: stale socket recovery --- .../REVIEW_P2-T2_stale_socket_recovery.md | 69 +++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 SPECS/INPROGRESS/REVIEW_P2-T2_stale_socket_recovery.md diff --git a/SPECS/INPROGRESS/REVIEW_P2-T2_stale_socket_recovery.md b/SPECS/INPROGRESS/REVIEW_P2-T2_stale_socket_recovery.md new file mode 100644 index 00000000..dd6769a6 --- /dev/null +++ b/SPECS/INPROGRESS/REVIEW_P2-T2_stale_socket_recovery.md @@ -0,0 +1,69 @@ +## REVIEW REPORT — P2-T2 Stale Socket Recovery + +**Scope:** origin/main..HEAD +**Files:** 4 (proxy.py, daemon.py, test_broker_proxy.py, test_broker_daemon.py) +**Date:** 2026-03-01 + +--- + +### Summary Verdict + +- [ ] Approve +- [x] Approve with comments +- [ ] Request changes +- [ ] Block + +--- + +### Critical Issues + +None. + +--- + +### Secondary Issues + +**[Low] Blocking socket connect inside async context** + +In `proxy.py` → `_spawn_broker_if_needed`, the liveness check uses `socket.socket(AF_UNIX, ...)` with `s.settimeout(1.0)` — a synchronous blocking call inside an `async def` function. If the daemon is hung (not refusing but also not accepting), the event loop is blocked for up to 1 second. + +In practice, Unix domain socket connects on the local machine are instantaneous; the timeout protects against edge cases. For broker mode, the 1-second block is acceptable given that: +- `_spawn_broker_if_needed` is called once per proxy session start +- The hang scenario (listening socket not accepting) is extremely rare with broker + +**Fix (optional, lower priority):** Could be converted to an async probe via `asyncio.open_unix_connection` with `asyncio.wait_for`. Defer unless profiling shows impact. + +--- + +**[Nit] atexit handler not deregistered after clean stop** + +In `daemon.py`, `atexit.register(self._cleanup_files)` is called on successful start. After `stop()` completes and removes files, the atexit handler remains registered. On interpreter exit after a clean shutdown, `_cleanup_files` runs again on already-absent files. + +This is safe because `_cleanup_files` uses `unlink(missing_ok=True)`. However, repeated registration across multiple `start()`/`stop()` cycles (not currently possible due to PID file locking, but possible in tests) would accumulate atexit registrations. + +**Fix (optional):** Use `atexit.unregister(self._cleanup_files)` in `stop()`. Low priority — current behavior is safe. + +--- + +### Architectural Notes + +- The catch-all `except OSError` in `_spawn_broker_if_needed` handles `ConnectionRefusedError`, `FileNotFoundError` (TOCTOU: socket disappears between `exists()` and `connect()`), and `OSError: [Errno 38] Socket operation on non-socket` (connecting to a regular file). This is the correct breadth of coverage. +- The atexit approach complements the existing SIGTERM/SIGINT handlers in `run_forever()`. The combination means clean files are guaranteed on: normal exit, `sys.exit()`, SIGTERM, SIGINT. Only SIGKILL (non-interceptable) leaves stale files — which the new proxy liveness check now handles automatically. +- The existing `test_spawn_noop_when_socket_exists` test was correctly updated to mock `socket.connect()` success rather than relying on a plain `touch()`'d file (which would have produced an `OSError` with the new code). + +--- + +### Tests + +- 4 new tests added (3 for stale socket proxy scenarios, 1 for atexit daemon registration). +- 1 existing test updated to reflect new liveness-check semantics. +- All 719 tests pass; coverage 91.78% (≥ 90%). +- **Gap (Low priority):** No explicit test for TOCTOU race (socket file disappears between `exists()` and `connect()`). Covered implicitly by `except OSError` but not explicitly tested. + +--- + +### Next Steps + +- **P2-T3** (P1, now unblocked by P2-T2 ✅): Fix double-spawn race condition with `fcntl.flock` spawn lock +- **Optional** (Nit): Add `atexit.unregister` in `daemon.stop()` if test-cycle isolation becomes an issue +- **Optional** (Low): Convert blocking socket probe to async `open_unix_connection` + `wait_for` From f9c2afbb881f97e9312fc7db53cc945a0cf0d189 Mon Sep 17 00:00:00 2001 From: Egor Merkushev Date: Sun, 1 Mar 2026 05:12:38 +0300 Subject: [PATCH 6/8] Archive REVIEW_P2-T2_stale_socket_recovery report --- SPECS/ARCHIVE/INDEX.md | 1 + .../_Historical}/REVIEW_P2-T2_stale_socket_recovery.md | 0 2 files changed, 1 insertion(+) rename SPECS/{INPROGRESS => ARCHIVE/_Historical}/REVIEW_P2-T2_stale_socket_recovery.md (100%) diff --git a/SPECS/ARCHIVE/INDEX.md b/SPECS/ARCHIVE/INDEX.md index de45a7a7..c65b74eb 100644 --- a/SPECS/ARCHIVE/INDEX.md +++ b/SPECS/ARCHIVE/INDEX.md @@ -172,6 +172,7 @@ | File | Description | |------|-------------| +| [REVIEW_P2-T2_stale_socket_recovery.md](_Historical/REVIEW_P2-T2_stale_socket_recovery.md) | Review report for P2-T2 | | [Workplan_0.4.0.md](_Historical/Workplan_0.4.0.md) | Archived workplan snapshot for release 0.4.0 | | [REVIEW_p1_t1_version_badge_script_tests.md](_Historical/REVIEW_p1_t1_version_badge_script_tests.md) | Review report for P1-T1 (version badge script/tests) | | [REVIEW_p1_t2_xcode_26_4_known_issue_link.md](_Historical/REVIEW_p1_t2_xcode_26_4_known_issue_link.md) | Review report for P1-T2 | diff --git a/SPECS/INPROGRESS/REVIEW_P2-T2_stale_socket_recovery.md b/SPECS/ARCHIVE/_Historical/REVIEW_P2-T2_stale_socket_recovery.md similarity index 100% rename from SPECS/INPROGRESS/REVIEW_P2-T2_stale_socket_recovery.md rename to SPECS/ARCHIVE/_Historical/REVIEW_P2-T2_stale_socket_recovery.md From e7e3083c51a05175baf0c1f24a9ec112ff269384 Mon Sep 17 00:00:00 2001 From: Egor Merkushev Date: Sun, 1 Mar 2026 05:15:03 +0300 Subject: [PATCH 7/8] Fix SIM117: combine nested with statements in test_broker_proxy --- tests/unit/test_broker_proxy.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/tests/unit/test_broker_proxy.py b/tests/unit/test_broker_proxy.py index ca93dc69..a88809c0 100644 --- a/tests/unit/test_broker_proxy.py +++ b/tests/unit/test_broker_proxy.py @@ -265,9 +265,10 @@ async def test_spawn_noop_when_socket_exists(self, tmp_path: Path) -> None: mock_sock.__enter__ = MagicMock(return_value=mock_sock) mock_sock.__exit__ = MagicMock(return_value=False) - with patch("mcpbridge_wrapper.broker.proxy.socket.socket", return_value=mock_sock): - with patch("subprocess.Popen") as mock_popen: - await proxy._spawn_broker_if_needed() + with patch("mcpbridge_wrapper.broker.proxy.socket.socket", return_value=mock_sock), patch( + "subprocess.Popen" + ) as mock_popen: + await proxy._spawn_broker_if_needed() mock_popen.assert_not_called() @@ -329,11 +330,11 @@ async def test_stale_socket_triggers_spawn(self, tmp_path: Path) -> None: mock_sock.__exit__ = MagicMock(return_value=False) mock_sock.connect.side_effect = ConnectionRefusedError - with patch("mcpbridge_wrapper.broker.proxy.socket.socket", return_value=mock_sock): - with patch("subprocess.Popen") as mock_popen: - # Socket never appears after spawn, so TimeoutError is expected - with pytest.raises(TimeoutError): - await proxy._spawn_broker_if_needed() + with patch("mcpbridge_wrapper.broker.proxy.socket.socket", return_value=mock_sock), patch( + "subprocess.Popen" + ) as mock_popen, pytest.raises(TimeoutError): + # Socket never appears after spawn, so TimeoutError is expected + await proxy._spawn_broker_if_needed() mock_popen.assert_called_once() @@ -351,10 +352,10 @@ async def test_stale_socket_removes_files(self, tmp_path: Path) -> None: mock_sock.__exit__ = MagicMock(return_value=False) mock_sock.connect.side_effect = ConnectionRefusedError - with patch("mcpbridge_wrapper.broker.proxy.socket.socket", return_value=mock_sock): - with patch("subprocess.Popen"): - with pytest.raises(TimeoutError): - await proxy._spawn_broker_if_needed() + with patch("mcpbridge_wrapper.broker.proxy.socket.socket", return_value=mock_sock), patch( + "subprocess.Popen" + ), pytest.raises(TimeoutError): + await proxy._spawn_broker_if_needed() assert not cfg.socket_path.exists() assert not cfg.pid_file.exists() @@ -372,9 +373,10 @@ async def test_live_socket_skips_spawn(self, tmp_path: Path) -> None: mock_sock.__exit__ = MagicMock(return_value=False) # connect() returns None (success) - with patch("mcpbridge_wrapper.broker.proxy.socket.socket", return_value=mock_sock): - with patch("subprocess.Popen") as mock_popen: - await proxy._spawn_broker_if_needed() + with patch("mcpbridge_wrapper.broker.proxy.socket.socket", return_value=mock_sock), patch( + "subprocess.Popen" + ) as mock_popen: + await proxy._spawn_broker_if_needed() mock_popen.assert_not_called() From 7db825f68a8501a0df3cf67a59fadb7b282b7e7e Mon Sep 17 00:00:00 2001 From: Egor Merkushev Date: Sun, 1 Mar 2026 05:16:45 +0300 Subject: [PATCH 8/8] Fix F401: remove unused atexit import in test_broker_daemon --- tests/unit/test_broker_daemon.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/unit/test_broker_daemon.py b/tests/unit/test_broker_daemon.py index 3c843539..4952723f 100644 --- a/tests/unit/test_broker_daemon.py +++ b/tests/unit/test_broker_daemon.py @@ -969,8 +969,6 @@ class TestBrokerDaemonAtExit: @pytest.mark.asyncio async def test_atexit_registered_after_start(self, tmp_path: Path) -> None: """After start(), _cleanup_files is registered with atexit.""" - import atexit - cfg = _make_config(tmp_path) daemon = BrokerDaemon(cfg) proc = _make_mock_process()