diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 0dcb75d..f39f5c4 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -18,6 +18,14 @@ "source": "./qa", "homepage": "https://github.com/browser-use/plugins/tree/main/qa", "keywords": ["qa", "testing", "browser-use", "cloud-browser", "score", "evaluation", "ngrok"] + }, + { + "name": "watch", + "description": "Monitor your Chrome in real time — network, console logs, and your actions (clicks/typing/navigations) plus a dense screen recording — then answer what happened. Follows the active tab across all tabs. Run as /watch. Requires the browser-harness plugin + CLI.", + "category": "automation", + "source": "./watch", + "homepage": "https://github.com/browser-use/plugins/tree/main/watch", + "keywords": ["watch", "monitor", "browser-use", "recording", "timeline", "network", "console", "observability"] } ] } diff --git a/README.md b/README.md index 6b68ce6..c9090b9 100644 --- a/README.md +++ b/README.md @@ -10,8 +10,9 @@ This repo is a catalog of Browser Use plugins for Claude Code. Most entries remo |---|---|---| | **browser-harness** | Direct CDP browser control — coordinate clicks, screenshots, persistent Python session, local Chrome or Browser Use cloud. | [browser-use/browser-harness](https://github.com/browser-use/browser-harness) | | **qa** | QA-test a website or app and return a 1–5 quality score with evidence. Drives a Browser Use cloud browser and tunnels localhost automatically. Run as `/qa `. | [`qa/`](./qa) (colocated; requires browser-harness) | +| **watch** | Monitor your Chrome in real time — network, console, and your actions (clicks/typing/navigations) plus a dense screen recording — then answer what happened. Follows the active tab across all tabs. Run as `/watch`. | [`watch/`](./watch) (colocated; requires browser-harness) | -Both ship **skills only**. 
The `browser-harness` CLI is a one-time install prerequisite documented inside the plugin; `qa` runs through browser-harness, so install that first. +All ship **skills only**. The `browser-harness` CLI is a one-time install prerequisite documented inside the plugin; `qa` and `watch` run through browser-harness, so install that first. ## Install @@ -19,6 +20,7 @@ Both ship **skills only**. The `browser-harness` CLI is a one-time install prere claude plugin marketplace add browser-use/plugins claude plugin install browser-harness@browser-use claude plugin install qa@browser-use # adds /qa +claude plugin install watch@browser-use # adds /watch ``` ## Layout diff --git a/watch/.claude-plugin/plugin.json b/watch/.claude-plugin/plugin.json new file mode 100644 index 0000000..6d18985 --- /dev/null +++ b/watch/.claude-plugin/plugin.json @@ -0,0 +1,13 @@ +{ + "name": "watch", + "version": "0.1.0", + "description": "Monitor the user's Chrome in real time — network, console logs, and user actions (clicks/typing/navigations) — plus a dense screen recording — then answer what happened. Follows the active tab across all tabs. 
Requires the browser-harness plugin + CLI.", + "author": { + "name": "Browser Use", + "url": "https://browser-use.com" + }, + "homepage": "https://github.com/browser-use/plugins", + "repository": "https://github.com/browser-use/plugins", + "license": "MIT", + "keywords": ["watch", "monitor", "browser", "browser-use", "recording", "timeline", "network", "console", "observability"] +} diff --git a/watch/.gitignore b/watch/.gitignore new file mode 100644 index 0000000..7a60b85 --- /dev/null +++ b/watch/.gitignore @@ -0,0 +1,2 @@ +__pycache__/ +*.pyc diff --git a/watch/skills/watch/SKILL.md b/watch/skills/watch/SKILL.md new file mode 100644 index 0000000..f6ce711 --- /dev/null +++ b/watch/skills/watch/SKILL.md @@ -0,0 +1,67 @@ +--- +name: watch +description: Monitor the user's Chrome in real time — network requests, console logs/errors, and user actions (clicks, form changes, navigations) — plus a dense screen recording. Use when the user wants you to "watch", "monitor", or "keep an eye on" their browser so you can later answer what happened ("what was that error?", "did the upload finish?", "why did checkout fail?"). You keep watching across turns and decide when to stop — when the user asks about what happened, stop, review, and answer. +--- + +# watch + +Watch the user's Chrome so you can later tell them exactly what happened. You run a background monitor that records **two streams**: + +- **Timeline** (`timeline.jsonl`) — your primary signal: network requests/responses, console errors/warnings + uncaught exceptions, and user actions (clicks, form `change`s with field identity + value *length* only, submits, navigations). This is *what* happened, timestamped. +- **Frames** (`frames/<epoch_ms>.jpg`) — a dense recording of the active tab at up to ~2.5 fps. This is *how it looked*. You consult it on demand by reading the frames around a timestamp — don't load the whole thing. 
+ +Captures **only Chrome** (the page's own CDP stream), needs **no screen-recording permission**, and is **not a daemon** — you start it and you stop it, within this session. Requires the `browser-harness` CLI on PATH (verify with `command -v browser-harness`; if missing, tell the user to install it — see https://github.com/browser-use/browser-harness). + +## Start watching + +When asked to watch/monitor, spawn the monitor **in the background** and confirm, then go quiet — it accumulates on its own while you're idle between turns. + +```bash +MON="<absolute path to monitor.py>" # e.g. ${CLAUDE_PLUGIN_ROOT}/skills/watch/monitor.py +WATCH_DIR="/tmp/watch-$(date +%s)"; mkdir -p "$WATCH_DIR" +echo "$WATCH_DIR" > /tmp/watch-current # so you can find it next turn +WATCH_DIR="$WATCH_DIR" browser-harness < "$MON" # run this with run_in_background +``` + +Use the real absolute path to `monitor.py` in this skill's directory (you know where this SKILL.md lives; under a plugin it's `${CLAUDE_PLUGIN_ROOT}/skills/watch/monitor.py`). Run that last line as a **background** command. Tell the user: "Watching your Chrome — go do your thing, then ask me what happened." Do **not** poll or screenshot yourself between turns; the monitor handles it. + +## When the user asks what happened + +You decide this is the cue to stop (a question about what occurred = stop, review, answer). Then: + +1. **Stop the monitor** cleanly: `touch "$(cat /tmp/watch-current)/STOP"` and give it ~1s to flush. +2. **Read the FULL structured timeline first — this is the authoritative record, the frames are not.** Always reconstruct from the complete event stream before looking at a single image: + - **Every action** (`action` kind): clicks, typing, keys, scrolls, submits — the whole list, chronological, not a filtered subset. + - **Every navigation** (`watch.tab` for a newly attached tab, `nav` for a full page load, and `action.k=="nav"` for an in-page URL change): the tab/URL trail. 
**Searches and direct URL visits live here, not in actions** — a Google/address-bar search shows up only as a navigation whose URL holds the `q=` query (page listeners can't see omnibox typing). If you skip navigations you will miss searches and page-to-page movement. + - **Network** (`net` / `net.fail`): statuses, failures, the API calls behind each step. + ```bash + D="$(cat /tmp/watch-current)" + # full action + navigation trail, chronological — read ALL of it, don't pre-filter to a keyword + python3 -c " + import json,datetime + for l in open('$D/timeline.jsonl'): + e=json.loads(l); k=e['kind']; d=e['data'] + if k not in ('action','watch.tab','nav'): continue + ts=datetime.datetime.fromtimestamp(e['t']).strftime('%H:%M:%S') + if k in ('watch.tab','nav'): print(f'[{ts}] {\"TAB\" if k==\"watch.tab\" else \"NAV\"} {d.get(\"url\")}') + else: print(f'[{ts}] {d.get(\"k\"):7} {d.get(\"target\",\"\")[:40]} {d.get(\"text\") or d.get(\"to\") or d.get(\"value\") or d.get(\"key\") or d.get(\"y\",\"\")}') + " + # network errors / notable statuses + grep -E '"net.fail"|"status": [45][0-9][0-9]' "$D/timeline.jsonl" + ``` + Build the answer from this complete picture. Each line has `t` (epoch seconds) → your index into the frames. +3. **Then use the images as extra help** — only for the few moments where a line needs visual detail (what a page/error/post actually looked like). List frames whose `<epoch_ms>` filename falls in `[t-3s, t+3s]` and `Read` that slice. Frames are supplementary confirmation, never the primary source, and never the whole `frames/` dir. + ```bash + T=<event t, whole epoch seconds>; lo=$(( (T-3)*1000 )); hi=$(( (T+3)*1000 )) + for f in "$D"/frames/*.jpg; do n=$(basename "$f" .jpg); [ "$n" -ge "$lo" ] && [ "$n" -le "$hi" ] && echo "$f"; done + ``` +4. **Answer from the full timeline, backed by frames where useful** — include the navigation/search trail, e.g. "…then you searched Google for **stripe webhook retries**, opened the first result, and `POST /api/x` returned **500** (frame `…188.jpg` shows the red toast)." 
Don't report only the in-page clicks; the searches and tab moves are part of "what I did." + +Optional — if the user wants to watch it themselves, assemble an mp4 from the frame slice with ffmpeg (`ffmpeg -pattern_type glob -i '…/frames/*.jpg' clip.mp4`). + +## Notes + +- **Granular by design:** the recording is dense so playback shows motion (spinner → toast → redirect), but you only ever *read* the slice the question needs — the timeline tells you where to look. +- **Privacy:** raw keystrokes and input values are **not** captured in the timeline — only field identity + value length, and password fields are redacted. The visible text of clicked elements (first 60 chars), page URLs (including query strings), console error/warn messages, and screenshots of the active tab **are** recorded. Say so if the user asks. +- **Cleanup:** the recorder keeps disk bounded on its own — it trims frames older than `WATCH_MAX_AGE` (30 min) within a session, and on every start it purges old `/tmp/watch-*` dirs, keeping only the newest `WATCH_KEEP_DIRS` (3). After you've fully answered and stopped the monitor, you can also `rm -rf "$(cat /tmp/watch-current)"` to drop the current session's frames immediately. +- If the user says "keep watching" or asks something mid-stream, you may answer from the timeline without stopping — only stop when the episode is clearly over. diff --git a/watch/skills/watch/monitor.py b/watch/skills/watch/monitor.py new file mode 100644 index 0000000..1c8f364 --- /dev/null +++ b/watch/skills/watch/monitor.py @@ -0,0 +1,163 @@ +# Background Chrome monitor for the /watch skill — v3. +# Follows the ACTIVE tab across all tabs; actions pushed via CDP binding (nav-proof, immediate). +# Run via: WATCH_DIR=/tmp/watch-XXXX browser-harness < monitor.py (backgrounded) +# Writes: $WATCH_DIR/frames/<epoch_ms>.jpg (dense recording of whichever tab is in front) +# $WATCH_DIR/timeline.jsonl (actions + console + network, tagged with tab url) +# Stop: touch $WATCH_DIR/STOP +# Captures ONLY Chrome, no OS permission. Privacy: input values are length-only; passwords redacted; click text (first 60 chars), URLs and console error/warn args ARE recorded. 
import base64
import glob
import json
import os
import shutil
import time

# Session configuration, all taken from the environment of the launching shell.
WATCH_DIR = os.environ["WATCH_DIR"]  # required: directory for this watch session
FRAMES = os.path.join(WATCH_DIR, "frames")
os.makedirs(FRAMES, exist_ok=True)
# Line-buffered so every record reaches disk as soon as it is written.
TL = open(os.path.join(WATCH_DIR, "timeline.jsonl"), "a", buffering=1, encoding="utf-8")
STOP = os.path.join(WATCH_DIR, "STOP")  # the monitor exits once this file exists
INTERVAL = float(os.environ.get("WATCH_INTERVAL", "0.4"))  # sleep between loop iterations (sec)
MAX_AGE = float(os.environ.get("WATCH_MAX_AGE", "1800"))  # trim frames older than this (sec)
KEEP_DIRS = int(os.environ.get("WATCH_KEEP_DIRS", "3"))  # how many session dirs to keep around


def purge_old_sessions():
    """Delete stale sibling session dirs so /watch can't leak GB of frames over time.

    Scans the parent directory of WATCH_DIR for directories named
    ``watch-<digits>``, keeps the newest KEEP_DIRS of them (by mtime) and
    removes the rest. The current session directory is never removed.

    Cleanup is best-effort: an unreadable parent directory or a directory that
    vanishes mid-scan is skipped instead of preventing the monitor from starting.
    """
    # Normalise first: a bare relative WATCH_DIR has an empty dirname (which
    # os.listdir rejects) and a trailing slash would make dirname return the
    # session dir itself, so siblings would never be found.
    current = os.path.abspath(WATCH_DIR)
    base = os.path.dirname(current)
    try:
        names = os.listdir(base)
    except OSError:
        return

    dirs = []
    for name in names:
        path = os.path.join(base, name)
        if name.startswith("watch-") and name[6:].isdigit() and os.path.isdir(path):
            try:
                dirs.append((os.path.getmtime(path), path))
            except OSError:
                pass  # removed between listdir() and stat()

    keep = max(KEEP_DIRS, 0)  # a negative setting must not slice from the wrong end
    for _, path in sorted(dirs, reverse=True)[keep:]:
        if path != current:
            shutil.rmtree(path, ignore_errors=True)


purge_old_sessions()

# Listeners push each action through the __watchEmit CDP binding the instant it happens —
# so a click that triggers a navigation is captured before the page unloads.
# Page-side instrumentation, evaluated in the current document and registered for
# every future document. Events go through the __watchEmit binding when it
# exists; otherwise they are buffered in window.__watchLog (capped at 1000) and
# polled by the main loop below.
BOOT = r"""
(function(){
  if (window.__watch__) return; window.__watch__ = 1;
  var emit=function(o){ try{ o.t=Date.now()/1000; o.url=location.href;
    if (window.__watchEmit) { window.__watchEmit(JSON.stringify(o)); }   // fresh pages: instant, nav-proof
    else { (window.__watchLog=window.__watchLog||[]).push(o);            // pre-existing pages: buffer, polled
           if(window.__watchLog.length>1000) window.__watchLog.shift(); } }catch(e){} };
  var d=function(el){ if(!el||!el.tagName) return ''; var s=el.tagName;
    if(el.id)s+='#'+el.id; if(el.name)s+='[name='+el.name+']';
    if(typeof el.className==='string'&&el.className)s+='.'+el.className.split(' ')[0]; return s; };
  var val=function(t){ return (t.type==='password')?'[redacted]':((t.value||'').length+' chars'); };
  document.addEventListener('click',function(e){emit({k:'click',target:d(e.target),text:(e.target.innerText||'').slice(0,60)});},true);
  document.addEventListener('submit',function(e){emit({k:'submit',target:d(e.target),action:e.target.action||''});},true);
  document.addEventListener('change',function(e){emit({k:'change',target:d(e.target),value:val(e.target)});},true);
  document.addEventListener('input',function(e){var t=e.target; clearTimeout(t.__wt);
    t.__wt=setTimeout(function(){emit({k:'type',target:d(t),value:val(t)});},600);},true);
  document.addEventListener('keydown',function(e){ if(['Enter','Escape','Tab'].indexOf(e.key)>=0)
    emit({k:'key',key:e.key,target:d(e.target)});},true);
  var st; window.addEventListener('scroll',function(){ if(st)return;
    st=setTimeout(function(){emit({k:'scroll',y:Math.round(window.scrollY||0)}); st=null;},800);},true);
  ['error','warn'].forEach(function(lvl){ var o=console[lvl];
    console[lvl]=function(){ try{emit({k:'console.'+lvl,args:[].slice.call(arguments).map(String).slice(0,5)});}catch(e){}
      return o.apply(console,arguments);};});
  var last=location.href; setInterval(function(){ if(location.href!==last){emit({k:'nav',from:last,to:location.href}); last=location.href;}
  },300);
})();
"""

# NOTE(review): `cdp` and `drain_events` are not defined in this file — presumably
# they are injected by the browser-harness runtime that executes this script; confirm.
cdp("Target.setAutoAttach", autoAttach=True, flatten=True, waitForDebuggerOnStart=False)
sessions = {}  # targetId -> CDP sessionId for every tab attached so far


def page_targets():
    """Return the target-info dicts of all page targets except devtools:// and chrome:// pages."""
    return [t for t in cdp("Target.getTargets")["targetInfos"]
            if t["type"] == "page" and not t["url"].startswith(("devtools://", "chrome://"))]


def ev(sid, expr):
    """Evaluate the JS expression `expr` in session `sid`; return its value, or None if absent."""
    reply = cdp("Runtime.evaluate", session_id=sid, expression=expr, returnByValue=True)
    return reply.get("result", {}).get("value")


def rec(kind, data):
    """Append one JSON line ({"t", "kind", "data"}) to the timeline, stamped with the current time."""
    TL.write(json.dumps({"t": time.time(), "kind": kind, "data": data}) + "\n")


def attach(t):
    """Attach to target `t`, arm it with BOOT, remember its session and return the session id.

    Runtime and the __watchEmit binding are mandatory (errors propagate to the
    caller); the Network and Page domains and arming the current document are
    best-effort.
    """
    sid = cdp("Target.attachToTarget", targetId=t["targetId"], flatten=True)["sessionId"]
    cdp("Runtime.enable", session_id=sid)
    cdp("Runtime.addBinding", session_id=sid, name="__watchEmit")  # nav-proof action channel
    try:
        cdp("Network.enable", session_id=sid)
    except Exception:
        pass
    try:
        cdp("Page.enable", session_id=sid)  # so every full navigation is logged
    except Exception:
        pass
    cdp("Page.addScriptToEvaluateOnNewDocument", session_id=sid, source=BOOT)  # arm future docs
    try:
        ev(sid, BOOT)  # arm current doc
    except Exception:
        pass
    sessions[t["targetId"]] = sid
    rec("watch.tab", {"url": t["url"]})
    return sid


def _handle_event(e):
    """Turn one drained CDP event into a timeline record, or remember its request URL."""
    m = e.get("method", "")
    p = e.get("params") or {}
    if m == "Runtime.bindingCalled" and p.get("name") == "__watchEmit":
        rec("action", json.loads(p["payload"]))
    elif m == "Page.frameNavigated":
        fr = p.get("frame", {})
        if not fr.get("parentId"):  # main frame only (full nav / search)
            rec("nav", {"url": fr.get("url")})
    elif m == "Runtime.exceptionThrown":
        rec("page.error", {"text": p.get("exceptionDetails", {}).get("text")})
    elif m == "Network.requestWillBeSent":
        req_urls[p.get("requestId")] = p.get("request", {}).get("url")
        if len(req_urls) > 3000:  # bound memory: drop the 1000 oldest entries
            for k in list(req_urls)[:1000]:
                req_urls.pop(k, None)
    elif m == "Network.responseReceived":
        r = p.get("response", {})
        rec("net", {"status": r.get("status"), "url": r.get("url")})
    elif m == "Network.loadingFailed":
        rec("net.fail", {"error": p.get("errorText"), "type": p.get("type"),
                         "url": req_urls.get(p.get("requestId"), "")})


def _save_frame(sid):
    """Screenshot session `sid` as JPEG into FRAMES, named by capture time in epoch milliseconds."""
    shot = cdp("Page.captureScreenshot", session_id=sid, format="jpeg", quality=60)
    if shot.get("data"):
        # Stamp at capture time (not loop start) so frames line up with timeline `t`
        # even when an iteration is slow because many tabs are open.
        name = "%d.jpg" % int(time.time() * 1000)
        with open(os.path.join(FRAMES, name), "wb") as fh:
            fh.write(base64.b64decode(shot["data"]))


def _trim_frames(now):
    """Delete frames whose millisecond timestamp is older than MAX_AGE seconds before `now`."""
    cutoff = (now - MAX_AGE) * 1000
    for f in glob.glob(os.path.join(FRAMES, "*.jpg")):
        try:
            if int(os.path.basename(f)[:-4]) < cutoff:
                os.remove(f)
        except Exception:
            pass  # non-numeric name or already removed


rec("watch.started", {"dir": WATCH_DIR})
req_urls = {}      # requestId -> url, so a failed request can name its domain
last_sweep = 0.0   # wall-clock of the last frame-retention sweep
try:
    while not os.path.exists(STOP):
        now = time.time()
        tgts = page_targets()
        live = {t["targetId"] for t in tgts}
        for tid in [k for k in sessions if k not in live]:
            sessions.pop(tid, None)  # forget tabs that have closed

        active = None
        for t in tgts:
            try:
                sid = sessions.get(t["targetId"]) or attach(t)
            except Exception:
                # The tab can close between getTargets and attach; skip it for this
                # tick instead of killing the whole monitor. It is retried next tick.
                continue
            try:
                # Prefer the focused visible tab; fall back to the first visible one.
                if ev(sid, "document.visibilityState") == "visible":
                    foc = ev(sid, "document.hasFocus()")
                    if active is None or foc:
                        active = sid
            except Exception:
                pass
            # poll JS-side buffer too — covers tabs already open before watching started
            try:
                buf = ev(sid, "JSON.stringify((window.__watchLog||[]).splice(0))")
                if buf and buf != "[]":
                    for o in json.loads(buf):
                        rec("action", o)
            except Exception:
                pass

        # one drain covers ALL attached sessions: actions (bindingCalled) + network
        try:
            events = drain_events()
        except Exception:
            events = []
        for e in events:
            try:
                _handle_event(e)
            except Exception:
                pass  # one malformed event must not discard the rest of the batch

        if active:
            try:
                _save_frame(active)
            except Exception:
                pass

        if now - last_sweep > 12:  # time-based — runs even when iterations are slow (many tabs)
            last_sweep = now
            _trim_frames(now)
        time.sleep(INTERVAL)
finally:
    # Always close out the timeline, even if the loop died (e.g. Chrome quit),
    # so a reader can tell the recording ended.
    rec("watch.stopped", {})
    TL.close()
    print("watch monitor stopped:", WATCH_DIR)