Skip to content

Commit 10b4976

Browse files
committed
Implement P2-T8: back off broker tools probe retries
1 parent 15bb9ea commit 10b4976

2 files changed

Lines changed: 61 additions & 8 deletions

File tree

src/mcpbridge_wrapper/broker/daemon.py

Lines changed: 38 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,8 @@
4343
# broker_id is 1_048_576. These negative/zero values can never collide.
4444
_BROKER_INIT_ID = 0
4545
_BROKER_TOOLS_ID = -1
46-
_TOOLS_PROBE_RETRY_DELAY_SECONDS = 0.25
46+
_TOOLS_PROBE_RETRY_BASE_DELAY_SECONDS = 0.25
47+
_TOOLS_PROBE_RETRY_MAX_DELAY_SECONDS = 2.0
4748

4849

4950
class BrokerDaemon:
@@ -95,6 +96,7 @@ def __init__(
9596
self._tools_catalog_ready: asyncio.Event = asyncio.Event()
9697
# Background retry task for broker-internal tools/list warm-up probes.
9798
self._tools_probe_retry_task: asyncio.Task[None] | None = None
99+
self._tools_probe_retry_attempt: int = 0
98100

99101
# ------------------------------------------------------------------
100102
# Public API
@@ -398,12 +400,26 @@ def _schedule_tools_list_probe(self, *, delay: float = 0.0) -> None:
398400
self._retry_tools_list_probe_after_delay(delay)
399401
)
400402

403+
def _reset_tools_probe_retry_backoff(self) -> None:
404+
"""Reset retry state for broker-internal tools/list warm-up probing."""
405+
self._tools_probe_retry_attempt = 0
406+
407+
def _next_tools_probe_retry_delay(self) -> float:
408+
"""Return the next bounded backoff delay for broker tools/list retries."""
409+
delay = min(
410+
_TOOLS_PROBE_RETRY_BASE_DELAY_SECONDS * (2**self._tools_probe_retry_attempt),
411+
_TOOLS_PROBE_RETRY_MAX_DELAY_SECONDS,
412+
)
413+
self._tools_probe_retry_attempt += 1
414+
return float(delay)
415+
401416
def _cancel_tools_probe_retry(self) -> None:
402417
"""Cancel any pending retry for the broker-internal tools/list probe."""
403418
task = self._tools_probe_retry_task
404419
if task is not None and not task.done():
405420
task.cancel()
406421
self._tools_probe_retry_task = None
422+
self._reset_tools_probe_retry_backoff()
407423

408424
async def _rollback_startup(self) -> None:
409425
"""Roll back a failed :meth:`start` sequence.
@@ -565,6 +581,7 @@ async def _read_upstream_loop(self) -> None:
565581
# Now fetch tools/list for the cache.
566582
upstream = self._upstream
567583
if upstream is not None and upstream.stdin is not None:
584+
self._reset_tools_probe_retry_backoff()
568585
self._schedule_tools_list_probe()
569586
continue
570587

@@ -585,18 +602,32 @@ async def _read_upstream_loop(self) -> None:
585602
else:
586603
self._tools_list_cache = None
587604
self._tools_catalog_ready.clear()
588-
logger.warning(
605+
delay = self._next_tools_probe_retry_delay()
606+
log_fn = (
607+
logger.warning
608+
if self._tools_probe_retry_attempt == 1
609+
else logger.debug
610+
)
611+
log_fn(
589612
"Broker tools/list probe returned an empty or invalid "
590-
"tool catalog; retrying warm-up probe."
613+
"tool catalog; retry %d in %.2fs.",
614+
self._tools_probe_retry_attempt,
615+
delay,
591616
)
592-
self._schedule_tools_list_probe(delay=_TOOLS_PROBE_RETRY_DELAY_SECONDS)
617+
self._schedule_tools_list_probe(delay=delay)
593618
else:
594619
self._tools_list_cache = None
595620
self._tools_catalog_ready.clear()
596-
logger.warning(
597-
"Broker tools/list probe returned no result; retrying warm-up probe."
621+
delay = self._next_tools_probe_retry_delay()
622+
log_fn = (
623+
logger.warning if self._tools_probe_retry_attempt == 1 else logger.debug
624+
)
625+
log_fn(
626+
"Broker tools/list probe returned no result; retry %d in %.2fs.",
627+
self._tools_probe_retry_attempt,
628+
delay,
598629
)
599-
self._schedule_tools_list_probe(delay=_TOOLS_PROBE_RETRY_DELAY_SECONDS)
630+
self._schedule_tools_list_probe(delay=delay)
600631
continue
601632

602633
if self._transport is not None:

tests/unit/test_broker_daemon.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1335,7 +1335,10 @@ def _write(data: bytes) -> None:
13351335
proc.stdin.write = MagicMock(side_effect=_write)
13361336

13371337
with patch(
1338-
"mcpbridge_wrapper.broker.daemon._TOOLS_PROBE_RETRY_DELAY_SECONDS",
1338+
"mcpbridge_wrapper.broker.daemon._TOOLS_PROBE_RETRY_BASE_DELAY_SECONDS",
1339+
0.0,
1340+
), patch(
1341+
"mcpbridge_wrapper.broker.daemon._TOOLS_PROBE_RETRY_MAX_DELAY_SECONDS",
13391342
0.0,
13401343
), patch(
13411344
"mcpbridge_wrapper.broker.daemon.asyncio.create_subprocess_exec",
@@ -1350,6 +1353,25 @@ def _write(data: bytes) -> None:
13501353
assert daemon.tools_catalog_ready.is_set()
13511354
assert sum('"method":"tools/list"' in msg for msg in sent_messages) == 2
13521355

1356+
def test_tools_list_probe_retry_backoff_is_bounded_and_resets(self, tmp_path: Path) -> None:
1357+
"""Retry delays should back off and reset after cancellation/success transitions."""
1358+
cfg = _make_config(tmp_path)
1359+
daemon = BrokerDaemon(cfg)
1360+
1361+
with patch(
1362+
"mcpbridge_wrapper.broker.daemon._TOOLS_PROBE_RETRY_BASE_DELAY_SECONDS",
1363+
0.25,
1364+
), patch(
1365+
"mcpbridge_wrapper.broker.daemon._TOOLS_PROBE_RETRY_MAX_DELAY_SECONDS",
1366+
1.0,
1367+
):
1368+
assert daemon._next_tools_probe_retry_delay() == 0.25
1369+
assert daemon._next_tools_probe_retry_delay() == 0.5
1370+
assert daemon._next_tools_probe_retry_delay() == 1.0
1371+
assert daemon._next_tools_probe_retry_delay() == 1.0
1372+
daemon._cancel_tools_probe_retry()
1373+
assert daemon._next_tools_probe_retry_delay() == 0.25
1374+
13531375
@pytest.mark.asyncio
13541376
async def test_upstream_initialized_cleared_on_reconnect(self, tmp_path: Path) -> None:
13551377
"""_upstream_initialized is cleared at the start of _reconnect()."""

0 commit comments

Comments
 (0)