From 736cf8bc3e821a7559a4d7d1650f9757c4af423f Mon Sep 17 00:00:00 2001 From: ProtocolWarden <32967198+ProtocolWarden@users.noreply.github.com> Date: Thu, 21 May 2026 07:06:13 -0400 Subject: [PATCH] =?UTF-8?q?add=20loop=20controller=20=E2=80=94=20replace?= =?UTF-8?q?=20/loop=20+=20ScheduleWakeup=20with=20external=20spawn?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit tools/loop/controller.py spawns a fresh claude -p session for each watchdog cycle. Context never accumulates; each session reconstructs from the CLP checkpoint. Session writes .context/loop_schedule.json at STEP 10 (state + delay_s + reason); controller reads it for adaptive timing before spawning the next session. Updates: - watchdog_loop_prompt.md STEP 10: write loop_schedule.json, exit cleanly - watchdog_loop.md: intro, Starting/Stopping sections, embedded STEP 10 - LOOP_START.md: Step 3 replaced with controller start commands Usage: nohup python tools/loop/controller.py & Stop: python tools/loop/controller.py --stop Log: logs/local/loop_controller.log Co-Authored-By: Claude Sonnet 4.6 --- .console/log.md | 8 ++ .console/watchdog_loop_prompt.md | 21 ++- LOOP_START.md | 18 ++- docs/operator/watchdog_loop.md | 38 +++-- tools/loop/controller.py | 236 +++++++++++++++++++++++++++++++ 5 files changed, 300 insertions(+), 21 deletions(-) create mode 100644 tools/loop/controller.py diff --git a/.console/log.md b/.console/log.md index 0fce255b..22fd1be1 100644 --- a/.console/log.md +++ b/.console/log.md @@ -1,5 +1,13 @@ # Log +## 2026-05-21 — Add loop controller (replace /loop + ScheduleWakeup) + +tools/loop/controller.py spawns a fresh claude -p session per watchdog cycle. +Context never accumulates across cycles. Session writes .context/loop_schedule.json +at STEP 10 with {delay_s, state, reason}; controller reads it for adaptive timing. +Updated watchdog_loop_prompt.md STEP 10, watchdog_loop.md, and LOOP_START.md. 
+Enables overnight unattended runs without session context exhaustion. + ## 2026-05-21 — Update ADR-0003 to reference CI design Added "Related" section to ADR-0003 documenting the relationship between diff --git a/.console/watchdog_loop_prompt.md b/.console/watchdog_loop_prompt.md index b5b0ddad..5190c4f0 100644 --- a/.console/watchdog_loop_prompt.md +++ b/.console/watchdog_loop_prompt.md @@ -135,7 +135,7 @@ Update .console/backlog.md if any tasks were completed or newly blocked. Commit to branch oc-watchdog/-. One commit per repo per cycle. Run `git diff --staged` before committing. -### STEP 10 — ADAPTIVE SCHEDULEWAKEUP +### STEP 10 — WRITE SCHEDULE AND EXIT | State | Delay | Trigger | |-------|-------|---------| @@ -150,11 +150,20 @@ Run `git diff --staged` before committing. Use WORST state observed across all signals. Log chosen cadence and driving signal. -Call ScheduleWakeup with: -- prompt: contents of this file path prefixed with `/loop `: - `/loop Run the OC watchdog loop. Read /home/dev/Documents/GitHub/OperationsCenter/.console/watchdog_loop_prompt.md and follow it exactly.` -- delaySeconds: per table above -- reason: one sentence naming the driving signal +Write the schedule file — the controller reads this to determine how long to sleep before spawning the next session: + +```python +import json +from pathlib import Path +schedule = { + "delay_s": <integer delay in seconds from the table above>, + "state": "<STATE name from the table above>", + "reason": "<one sentence naming the driving signal>", +} +Path(".context/loop_schedule.json").write_text(json.dumps(schedule)) +``` + +Do NOT call ScheduleWakeup. Exit cleanly after writing the schedule file. 
--- diff --git a/LOOP_START.md b/LOOP_START.md index 67c4fa26..6addadb6 100644 --- a/LOOP_START.md +++ b/LOOP_START.md @@ -42,10 +42,24 @@ scripts/reset-training-branches.sh --- -## Step 3 — Paste this into Claude Code +## Step 3 — Start the controller + +```bash +cd /home/dev/Documents/GitHub/OperationsCenter +nohup python tools/loop/controller.py > /dev/null 2>&1 & +python tools/loop/controller.py --status # confirm running +# To stop: python tools/loop/controller.py --stop +# Log: logs/local/loop_controller.log +``` + +Each iteration is a fresh `claude -p` session — context never accumulates across cycles. +The session writes `.context/loop_schedule.json` at STEP 10; the controller reads it for +adaptive delay before spawning the next session. + +### What the controller passes to each session ``` -/loop Run the OC/Platform stabilization and audit cycle from /home/dev/Documents/GitHub/OperationsCenter. Source .env.operations-center.local first. Use .venv/bin/ for all CLIs. This loop is session-bound and uses ScheduleWakeup, not cron/systemd. +Run the OC/Platform stabilization and audit cycle from /home/dev/Documents/GitHub/OperationsCenter. Source .env.operations-center.local first. Use .venv/bin/ for all CLIs. This loop is controller-driven — do NOT call ScheduleWakeup. STEP 0 — OWNERSHIP + PREFLIGHT: Acquire/verify logs/local/watchdog_loop.lock via: diff --git a/docs/operator/watchdog_loop.md b/docs/operator/watchdog_loop.md index abb081da..df608718 100644 --- a/docs/operator/watchdog_loop.md +++ b/docs/operator/watchdog_loop.md @@ -12,9 +12,10 @@ The loop is **not merely an hourly audit runner**. When the platform is unhealthy it shortens its cadence and actively works to restore forward progress. When healthy it backs off to maintenance frequency. -The loop is session-bound: it runs as long as the Claude Code session is open. -It uses `ScheduleWakeup`, not cron/systemd/daemon behavior. Do not replace it -with a system scheduler. 
+The loop is controller-driven: `tools/loop/controller.py` spawns a fresh +`claude -p` session for each iteration so context never accumulates. Each session +exits cleanly after writing `.context/loop_schedule.json`; the controller reads +that file for adaptive timing before launching the next session. **Related docs:** - [`self_healing_model.md`](self_healing_model.md) — convergence phases 1–7, architecture, ownership model @@ -225,10 +226,17 @@ must only run when this loop owns the lock. ## Starting the Loop -Invoke in the Claude Code session: +```bash +cd /home/dev/Documents/GitHub/OperationsCenter +nohup python tools/loop/controller.py > /dev/null 2>&1 & +python tools/loop/controller.py --status +# Log: logs/local/loop_controller.log +``` + +Each session receives this prompt: ``` -/loop Run the OC/Platform stabilization and audit cycle from /home/dev/Documents/GitHub/OperationsCenter. Source .env.operations-center.local first. Use .venv/bin/ for all CLIs. This loop is session-bound and uses ScheduleWakeup, not cron/systemd. +Run the OC/Platform stabilization and audit cycle from /home/dev/Documents/GitHub/OperationsCenter. Source .env.operations-center.local first. Use .venv/bin/ for all CLIs. This loop is controller-driven — do NOT call ScheduleWakeup. STEP 0 — OWNERSHIP + PREFLIGHT: Acquire/verify logs/local/watchdog_loop.lock via: @@ -580,8 +588,8 @@ explicitly allowed it for the current task/session. If not allowed, create a bra One logical commit per repo per cycle. Commit message must name: root cause, affected repo, gate/check fixed. Never force-push, amend old loop commits, or commit generated noise. 
-STEP 10 — ADAPTIVE SCHEDULEWAKEUP: -Assess platform health state and choose ScheduleWakeup delay accordingly: +STEP 10 — WRITE SCHEDULE AND EXIT: +Assess platform health state and choose the appropriate delay: CRITICAL — crash loops / graph broken / autonomy failing repeatedly: 180s DEGRADED — watcher crashes (non-143) / blocked queue unchanged / flow gaps: 300s @@ -638,7 +646,8 @@ Automation self-deception: DEGRADED minimum cadence + create Plane escalation ta Use the WORST health state observed across all steps. Starvation/stagnation/convergence signals force STALLED minimum immediately — single cycle evidence is sufficient. Log the chosen cadence and the driving signal in the cycle summary. -Pass this full /loop prompt verbatim as the ScheduleWakeup prompt. +Write .context/loop_schedule.json with {"delay_s": <integer seconds>, "state": "<STATE>", "reason": "<one sentence>"} +then exit cleanly. Do NOT call ScheduleWakeup — the controller reads this file. ``` --- @@ -953,13 +962,16 @@ incidental remediation. ## Stopping the Loop -The loop stops when you close the Claude Code session, or tell Claude to stop. -To stop explicitly, tell Claude: "stop the loop" — it will omit the next -`ScheduleWakeup` call and the loop ends naturally. +```bash +python tools/loop/controller.py --stop +``` -Before stopping, release the lock: +This writes a stop flag; the current session finishes its iteration normally, +then the controller exits. The controller lock is released on exit. + +Sending SIGTERM to the controller is equivalent (its handler sets the stop flag and still waits for the running session): ```bash -scripts/operations-center.sh watchdog-loop-release +kill $(python -c "import json; print(json.load(open('logs/local/loop_controller.lock'))['pid'])") ``` --- diff --git a/tools/loop/controller.py b/tools/loop/controller.py new file mode 100644 index 00000000..1caa32d2 --- /dev/null +++ b/tools/loop/controller.py @@ -0,0 +1,236 @@ +#!/usr/bin/env python3 +"""OC platform watchdog loop controller. + +Replaces /loop + ScheduleWakeup. 
+Spawns a fresh bounded claude -p session
+for each watchdog cycle. Context never accumulates; each session reconstructs
+from .context/checkpoints/. The session writes .context/loop_schedule.json
+at STEP 10 (instead of calling ScheduleWakeup) to communicate the adaptive delay.
+
+Usage:
+    python tools/loop/controller.py            # start (foreground; nohup & for overnight)
+    python tools/loop/controller.py --stop     # write stop flag; current session finishes
+    python tools/loop/controller.py --status   # show lock state
+
+SIGTERM / SIGINT sent to the controller behave like --stop: the stop flag is
+set and the controller exits once the in-flight session has returned. The
+signal handler does not terminate the session itself.
+"""
+
+import argparse
+import json
+import os
+import signal
+import socket
+import subprocess
+import sys
+import time
+from datetime import datetime, timezone
+from pathlib import Path
+
+REPO_ROOT = Path("/home/dev/Documents/GitHub/OperationsCenter")
+LOCK_PATH = REPO_ROOT / "logs/local/loop_controller.lock"
+STOP_FLAG = REPO_ROOT / "logs/local/loop_stop.flag"
+SCHEDULE_FILE = REPO_ROOT / ".context/loop_schedule.json"
+LOG_FILE = REPO_ROOT / "logs/local/loop_controller.log"
+
+# Cadence table (seconds) keyed by health state. Used as the fallback when the
+# session reports a known state but no usable delay_s, and as documentation of
+# the full table.
+STATE_DELAYS: dict[str, int] = {
+    "CRITICAL": 180,
+    "DEGRADED": 300,
+    "STALLED": 600,
+    "ACTIVE": 900,
+    "PARKED_OPERATOR_BLOCKED": 1800,
+    "HEALTHY": 3600,
+}
+DEFAULT_DELAY = 600  # conservative fallback (STALLED equivalent)
+
+# Exit code reported when the session binary cannot be launched at all
+# (mirrors the shell's "command not found" convention).
+SPAWN_FAILED_RC = 127
+
+SESSION_PROMPT = (
+    "You are running one iteration of the OC platform watchdog cycle, "
+    "managed by an external controller. "
+    "Full instructions are in docs/operator/watchdog_loop.md. "
+    "Read .console/.context first. "
+    "Do NOT call ScheduleWakeup — the controller manages timing. "
+    "At STEP 10, assess platform health state, choose the appropriate delay "
+    "from the cadence table, then write .context/loop_schedule.json with: "
+    # Explicit placeholders: an empty template ('"delay_s": ,') is not valid
+    # JSON and leaves the expected value types ambiguous to the session.
+    '{"delay_s": <integer seconds>, "state": "<STATE>", "reason": "<one sentence>"} '
+    "and exit cleanly."
+)
+
+# Set by the signal handler; checked by the main loop and the sleep loop.
+_stop = False
+
+
+def _ts() -> str:
+    """Return the current UTC time formatted as YYYY-MM-DDTHH:MM:SSZ."""
+    return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+
+
+def _log(msg: str) -> None:
+    """Print a timestamped line and append it to LOG_FILE.
+
+    Logging is best effort: a closed/undecodable stdout or an unwritable log
+    file must never take the controller down.
+    """
+    line = f"[{_ts()}] {msg}"
+    try:
+        print(line, flush=True)
+    except (OSError, ValueError):
+        # ValueError covers UnicodeEncodeError and writes to a closed stream.
+        pass
+    try:
+        LOG_FILE.parent.mkdir(parents=True, exist_ok=True)
+        # Messages contain non-ASCII punctuation; do not depend on the locale.
+        with LOG_FILE.open("a", encoding="utf-8") as f:
+            f.write(line + "\n")
+    except OSError:
+        pass
+
+
+def _touch_stop_flag() -> None:
+    """Create STOP_FLAG, creating its parent directory if necessary."""
+    STOP_FLAG.parent.mkdir(parents=True, exist_ok=True)
+    STOP_FLAG.touch()
+
+
+def _pid_alive(pid: object) -> bool:
+    """Return True if *pid* is a positive int naming an existing process.
+
+    Non-positive values are rejected up front because os.kill() would treat
+    them as process-group targets rather than a single process.
+    """
+    if isinstance(pid, bool) or not isinstance(pid, int) or pid <= 0:
+        return False
+    try:
+        os.kill(pid, 0)  # signal 0: existence/permission probe only
+    except ProcessLookupError:
+        return False
+    except PermissionError:
+        # The process exists but is owned by another user.
+        return True
+    return True
+
+
+def _read_json_object(path: Path) -> dict:
+    """Return the JSON object stored at *path*, or {} if missing/unreadable/not an object."""
+    try:
+        data = json.loads(path.read_text())
+    except (OSError, ValueError):  # JSONDecodeError is a ValueError
+        return {}
+    return data if isinstance(data, dict) else {}
+
+
+def handle_signal(signum, frame) -> None:
+    """SIGTERM/SIGINT handler: request a stop after the current session."""
+    global _stop
+    _stop = True
+    try:
+        _touch_stop_flag()
+    except OSError:
+        # _stop alone is sufficient for this process; the flag file is only
+        # needed for cross-process signalling.
+        pass
+    _log(f"Signal {signum} received — stop flag set; waiting for current session to finish.")
+
+
+def write_lock() -> None:
+    """Write the controller lock file describing this process."""
+    LOCK_PATH.parent.mkdir(parents=True, exist_ok=True)
+    lock = {
+        "pid": os.getpid(),
+        "started": _ts(),
+        "hostname": socket.gethostname(),
+        "purpose": "oc_watchdog_loop_controller",
+        "repo_root": str(REPO_ROOT),
+    }
+    LOCK_PATH.write_text(json.dumps(lock, indent=2))
+
+
+def check_and_acquire_lock() -> bool:
+    """Return True if we acquired the lock (no live owner).
+
+    A lock whose recorded pid is still alive blocks acquisition; a lock with
+    a dead pid, or a malformed lock file, is reclaimed.
+    """
+    if LOCK_PATH.exists():
+        owner = _read_json_object(LOCK_PATH)
+        pid = owner.get("pid")
+        if _pid_alive(pid):
+            _log(f"Lock held by live pid={pid} (started {owner.get('started')}) — aborting.")
+            return False
+        if owner and pid:
+            _log(f"Stale lock (pid={pid} dead) — reclaiming.")
+        else:
+            _log("Malformed lock file — reclaiming.")
+    write_lock()
+    return True
+
+
+def release_lock() -> None:
+    """Remove the controller lock file if present."""
+    LOCK_PATH.unlink(missing_ok=True)
+
+
+def get_delay() -> int:
+    """Return the delay (seconds) before the next session.
+
+    Preference order:
+      1. a positive integer ``delay_s`` from the schedule file,
+      2. the STATE_DELAYS entry for a recognised ``state``,
+      3. DEFAULT_DELAY.
+    """
+    schedule = None
+    try:
+        if SCHEDULE_FILE.exists():
+            schedule = json.loads(SCHEDULE_FILE.read_text())
+    except (OSError, ValueError) as e:
+        _log(f"Failed to read schedule file: {e}")
+
+    if isinstance(schedule, dict):
+        delay = schedule.get("delay_s")
+        state = schedule.get("state", "?")
+        reason = schedule.get("reason", "")
+        # JSON may carry 300.0 for 300; accept it, but never bools
+        # (True is an int in Python and would mean a 1-second delay).
+        if isinstance(delay, float) and delay.is_integer():
+            delay = int(delay)
+        if isinstance(delay, int) and not isinstance(delay, bool) and delay > 0:
+            _log(f"Schedule: state={state}, delay={delay}s — {reason}")
+            return delay
+        if isinstance(state, str) and state in STATE_DELAYS:
+            delay = STATE_DELAYS[state]
+            _log(f"Schedule: state={state}, no valid delay_s — using table delay {delay}s — {reason}")
+            return delay
+
+    _log(f"No valid schedule file — using default {DEFAULT_DELAY}s (STALLED)")
+    return DEFAULT_DELAY
+
+
+def _load_env_file(path: Path, env: dict) -> None:
+    """Merge KEY=VALUE lines from *path* into *env* without overriding existing keys.
+
+    Accepts an optional leading ``export`` and strips one pair of matching
+    surrounding quotes from the value. Blank lines and ``#`` comments are skipped.
+    """
+    try:
+        text = path.read_text()
+    except FileNotFoundError:
+        return
+    except OSError as e:
+        _log(f"Could not read {path}: {e}")
+        return
+    for raw in text.splitlines():
+        line = raw.strip()
+        if not line or line.startswith("#") or "=" not in line:
+            continue
+        if line.startswith("export "):
+            line = line[len("export "):].lstrip()
+        key, _, value = line.partition("=")
+        key, value = key.strip(), value.strip()
+        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
+            value = value[1:-1]
+        if key:
+            env.setdefault(key, value)
+
+
+def run_session() -> int:
+    """Spawn one bounded claude -p session. Returns exit code.
+
+    Returns SPAWN_FAILED_RC if the process could not be started (for example
+    when ``claude`` is not on PATH) so the controller keeps running and
+    retries after the fallback delay instead of crashing.
+    """
+    cmd = ["claude", "-p", SESSION_PROMPT, "--output-format", "text"]
+    _log(f"Spawning session: {' '.join(cmd[:2])} ...")
+    # Source env before spawning so the session inherits OC credentials
+    env = os.environ.copy()
+    _load_env_file(REPO_ROOT / ".env.operations-center.local", env)
+    try:
+        proc = subprocess.run(cmd, cwd=REPO_ROOT, env=env)
+    except OSError as e:
+        _log(f"Failed to launch session: {e}")
+        return SPAWN_FAILED_RC
+    return proc.returncode
+
+
+def interruptible_sleep(seconds: int) -> None:
+    """Sleep in small increments so SIGTERM is handled promptly."""
+    deadline = time.monotonic() + seconds
+    while not _stop and not STOP_FLAG.exists():
+        # Compute the remainder once: re-reading the clock inside min() could
+        # yield a negative value, which time.sleep() rejects with ValueError.
+        remaining = deadline - time.monotonic()
+        if remaining <= 0:
+            break
+        time.sleep(min(5.0, remaining))
+
+
+def cmd_status() -> None:
+    """Print the lock owner's liveness and the last schedule, if any."""
+    if not LOCK_PATH.exists():
+        print("No lock file — controller is not running.")
+        return
+    try:
+        d = json.loads(LOCK_PATH.read_text())
+        if not isinstance(d, dict):
+            raise ValueError("lock file is not a JSON object")
+        pid = d.get("pid")
+        if _pid_alive(pid):
+            print(f"ACTIVE: pid={pid}, started={d.get('started')}, host={d.get('hostname')}")
+        else:
+            print(f"STALE: pid={pid} is dead (lock from {d.get('started')})")
+    except (OSError, ValueError) as e:
+        print(f"ERROR reading lock: {e}")
+    # The schedule summary is informational only; stay silent if it is unusable.
+    s = _read_json_object(SCHEDULE_FILE)
+    if s:
+        print(f"Last schedule: state={s.get('state')}, delay={s.get('delay_s')}s — {s.get('reason')}")
+
+
+def cmd_stop() -> None:
+    """Write the stop flag that the running controller polls."""
+    _touch_stop_flag()
+    print(f"Stop flag written to {STOP_FLAG}. Current session will finish, then controller exits.")
+
+
+def main() -> None:
+    """Parse arguments and run the controller loop (or --stop / --status)."""
+    parser = argparse.ArgumentParser(description="OC watchdog loop controller")
+    parser.add_argument("--stop", action="store_true", help="Signal the controller to stop after the current session")
+    parser.add_argument("--status", action="store_true", help="Show controller lock state and last schedule")
+    args = parser.parse_args()
+
+    if args.status:
+        cmd_status()
+        return
+
+    if args.stop:
+        cmd_stop()
+        return
+
+    signal.signal(signal.SIGTERM, handle_signal)
+    signal.signal(signal.SIGINT, handle_signal)
+
+    if not check_and_acquire_lock():
+        sys.exit(1)
+
+    # A flag left over from a previous run must not stop this one immediately.
+    STOP_FLAG.unlink(missing_ok=True)
+    _log(f"OC watchdog loop controller started. pid={os.getpid()}")
+    _log("Stop with: python tools/loop/controller.py --stop")
+    _log("Status:    python tools/loop/controller.py --status")
+    _log(f"Log:       {LOG_FILE}")
+
+    iteration = 0
+    try:
+        while not _stop and not STOP_FLAG.exists():
+            iteration += 1
+            _log(f"--- Iteration {iteration} ---")
+
+            # Clear stale schedule from previous session before spawning
+            SCHEDULE_FILE.unlink(missing_ok=True)
+
+            rc = run_session()
+            _log(f"Session exited rc={rc}")
+
+            if _stop or STOP_FLAG.exists():
+                break
+
+            delay = get_delay()
+            _log(f"Sleeping {delay}s ...")
+            interruptible_sleep(delay)
+    finally:
+        release_lock()
+        _log(f"OC watchdog loop controller stopped after {iteration} iteration(s).")
+        print(f"\nStopped. Log: {LOG_FILE}")
+
+
+if __name__ == "__main__":
+    main()