From 244cd064f3b647b4f344960e5ceb5f29448b23e4 Mon Sep 17 00:00:00 2001
From: Charan Kamarapu <kamarapucharan@gmail.com>
Date: Wed, 20 May 2026 16:30:49 +0530
Subject: [PATCH 01/11] docs(k8s-proxy): add developer + llm workflow playbook
 page

Sibling to the existing k8s-proxy-developer-workflow page. Documents
an autonomous Keploy workflow driven from an MCP-aware editor (Claude
Code, Cursor, Windsurf, Claude Desktop, VS Code Copilot, Trae). The
developer types one of two prompts; the agent does everything else.

The two prompts:
  1. "my keploy cloud replay is failing, please analyse and fix it."
     (or "the keploy cloud replay pipeline is failing..." for CI)
  2. "Add new keploy tests for my changes."

The page ships a single pasteable playbook that installs as a Claude
Code skill or any other editor's rules / memory file. Inside the
playbook the agent:

  - Resolves app_id from `basename $(pwd)` + listApps.
  - Resolves branch_id from `git rev-parse --abbrev-ref HEAD` +
    create_branch (find-or-create, idempotent, sticky for the session).
  - Diagnoses failing runs via two cases: Case 1 (app regression, agent
    fixes handler code and announces file:line before applying);
    Case 2 (test data stale, with sub-actions 2a noise / 2a response
    edit / 2b mock edit / 2b delete_recording + re-record).
  - For new tests: git diff to find changed handlers, pre-flight the
    dev's local run command, then `keploy record -c "<cmd>" --sync` +
    `keploy upload test-set` to land the bundle on the branch.

Sidebar updated to surface the page under K8s Proxy.

Signed-off-by: Charan Kamarapu <kamarapucharan@gmail.com>
---
 .../config/vocabularies/Base/accept.txt       | 209 ++++-----
 .../quickstart/k8s-proxy-llm-workflow.md      | 402 ++++++++++++++++++
 .../version-4.0.0-sidebars.json               |   1 +
 3 files changed, 510 insertions(+), 102 deletions(-)
 create mode 100644 versioned_docs/version-4.0.0/quickstart/k8s-proxy-llm-workflow.md
diff --git a/vale_styles/config/vocabularies/Base/accept.txt b/vale_styles/config/vocabularies/Base/accept.txt
index fdf0d75b6..e99ef0ca2 100644
--- a/vale_styles/config/vocabularies/Base/accept.txt
+++ b/vale_styles/config/vocabularies/Base/accept.txt
@@ -1,29 +1,80 @@
+[Aa]ir-?gap(?:ped|ping)?
+[Aa]uditable
+[Cc]group[s]?
+[Cc]leartext
+[Cc]onfigMap[s]?
+[Cc]ooldown
+[Cc]RD[s]?
+[Cc]ron
+[Cc]Rs?
+[Dd]aemon[Ss]et[s]?
+[Dd]edup
+[Dd]edups
+[Dd]ev
+[Ee]nv
+[Hh]ostname
+[Kk]0s
+[Kk]3s
+[Kk]8s
+[Kk]araf
+[Kk]eploy
+[Ll]inux
+[Ll]iveness
+[Mm]utatingAdmissionWebhook
+[Nn]amespace[s]?
+[Pp]assthrough
+[Pp]refill[s]?
+[Rr]eachability
+[Rr]efcount[s]?
+[Rr]ehydrate[ds]?
+[Rr]eplayer
+[Rr]epo
+[Rr]ollout[s]?
+[Rr]untime[s]?
+[Ss]ubresource[s]?
+[Tt]eardown
+[Tt]oolchain
+[Uu]serspace
+[Ww]alkthrough
 api
 API
 APIs
 Appium
 Applitools
-Arkade
 ArgoCD
+Arkade
 Asana
 Auditability
+backoff
+behaviour
 borderRadius
+classpath
+cluster-mode
 Cmd
 Cobertura
 config
+containerd
 containerName
-classpath
+crd
+created_at
+CreateReplayJobRequest
 custom_functions
 DBs
 declaratively
 Deduplication
+deploymentType
 distros
 dockerfile
 Docusaurus
 Dropwizard
+eBPF
+enum
+envFrom
 expected_string
+frontmatter
 Functionize
 GitHub
+GitOps
 gjson
 Gradle
 graphql
@@ -34,13 +85,17 @@ header_contains
 header_equal
 header_exists
 header_matches
-[Hh]ostname
 Hoppscotch
+HostPath
 html
 HTTPProxy
 Idempotency
-JaCoCo
+imagePullSecret[s]?
+ingressUrl
+initialised
+IPs
 Jacoco
+JaCoCo
 JBehave
 Jersey
 JMeter
@@ -48,148 +103,98 @@ json_contains
 json_equal
 json_path
 JUnit
-[Kk]araf
+keploy-agent
+keploy-daemonset
+keploy-replay-runner
+keployContext
+kindNet
 kubectl
+kubelet
 kubernetes
-test-gen
-[Kk]eploy
+launchd
 LLMs
 mabl
+Makefile
+matchLabels
 middleware
 mock
 Mockaroo
-[Nn]amespace[s]?
+MongoIDs
+mTLS
+NetPolic(y|ies)
+NetworkPolic(y|ies)
+nextCursor
 Nhost
+normalisation
 npm
+nullable
 NUnit
 Onboarding
+orderId
 params
-[Pp]assthrough
+pm2
+podSelector
+PodTemplate[Ss]pec
+polyglot
 Postgres
+PostStart
+Procfile
+protobuf
 Pytest
+randAlphaNum
 realtime
+RecordingSession[s]?
+recordingsessions
 Redis
-[Rr]epo
+ReplayJob[s]?
+ReplaySession[s]?
+replaysessions
 Reqnroll
+runner
+runner-mode
 SDK
+secretKeyRef
+ServiceAccount[s]?
 servlet
+sharedToken
+shipping_address_id
+shippingAddress
+sidecar
 signin
+SPDY
 Spotify
 status_code
 status_code_class
 status_code_in
 subcommand
 substring
+systemd
 templatize
+test-gen
 Testcase
 Testcases
 Testim
 testmode
 Testrun
 testsets
-toolchain
+TGID[s]?
 timeFreezing
+toolchain
+total_amount
+Trae
 Traefik
 Twilio
 Unittest
+updated_at
 url
 UTGen
 UUIDs
+valueFrom
 VM
-VMs
 VM's
+VMs
 wiremessages
 Woohoo
 wsl
 WSL
 YAMLs
-nullable
-enum
-nextCursor
-orderId
-shippingAddress
-total_amount
-created_at
-updated_at
-shipping_address_id
-[Ll]inux
-[Ee]nv
-[Kk]8s
-IPs
-[Dd]edup
-[Dd]edups
-[Rr]ollout[s]?
-[Pp]refill[s]?
-[Aa]uditable
-[Cc]ooldown
-[Ll]iveness
-[Cc]ron
-[Tt]oolchain
-[Rr]untime[s]?
-MongoIDs
-initialised
-normalisation
-behaviour
-polyglot
-[Dd]aemon[Ss]et[s]?
-[Cc]RD[s]?
-eBPF
-[Mm]utatingAdmissionWebhook
-RecordingSession[s]?
-ReplaySession[s]?
-keploy-daemonset
-keploy-agent
-recordingsessions
-replaysessions
-TGID[s]?
-[Rr]efcount[s]?
-GitOps
-envFrom
-valueFrom
-[Cc]onfigMap[s]?
-ServiceAccount[s]?
-imagePullSecret[s]?
-NetPolic(y|ies)
-NetworkPolic(y|ies)
-containerd
-launchd
-systemd
-pm2
-SPDY
-mTLS
-PodTemplate[Ss]pec
-podSelector
-matchLabels
-backoff
-[Aa]ir-?gap(?:ped|ping)?
-kubelet
-keployContext
-keploy-replay-runner
-ReplayJob[s]?
-CreateReplayJobRequest
-runner-mode
-cluster-mode
-crd
-runner
-sidecar
-[Kk]3s
-[Kk]0s
-kindNet
-randAlphaNum
-secretKeyRef
-HostPath
-PostStart
-[Cc]group[s]?
-[Uu]serspace
-[Tt]eardown
-[Rr]eplayer
-[Rr]ehydrate[ds]?
-[Rr]eachability
-[Ww]alkthrough
-[Dd]ev
-[Cc]Rs?
-[Ss]ubresource[s]?
-sharedToken
-deploymentType
-ingressUrl
-[Cc]leartext
diff --git a/versioned_docs/version-4.0.0/quickstart/k8s-proxy-llm-workflow.md b/versioned_docs/version-4.0.0/quickstart/k8s-proxy-llm-workflow.md
new file mode 100644
index 000000000..5b2c2e1d5
--- /dev/null
+++ b/versioned_docs/version-4.0.0/quickstart/k8s-proxy-llm-workflow.md
@@ -0,0 +1,402 @@
+---
+id: k8s-proxy-llm-workflow
+title: Developer + LLM Workflow with Keploy Proxy
+sidebar_label: LLM Workflow
+description: Wire up Keploy's MCP server, paste a single autonomous playbook into your editor, and run the whole Keploy workflow with exactly two developer prompts—the LLM does everything else.
+tags:
+  - K8s
+  - Developer Workflow
+  - LLM
+  - MCP
+  - Claude Code
+  - Cursor
+  - AI Agent
+  - Branches
+keywords:
+  - Keploy MCP
+  - Claude Code
+  - Cursor MCP
+  - Windsurf MCP
+  - LLM workflow
+  - Keploy branches
+  - cloud replay
+  - branch-merge
+  - autonomous agent
+---
+
+# Developer + LLM Workflow with Keploy Proxy
+
+import ProductTier from '@site/src/components/ProductTier';
+import useBaseUrl from '@docusaurus/useBaseUrl';
+
+<ProductTier tiers="Enterprise" offerings="Self-Hosted, Dedicated" />
+
+The [Developer Workflow](/docs/quickstart/k8s-proxy-developer-workflow) page walks through the manual flow end-to-end—things like creating a branch, editing mocks and test cases, replaying changes, opening a PR, merging, and so on. Every step has an MCP tool behind it. This page goes one step further: install the playbook below as a **Claude Code skill** (or the equivalent rules / memory entry in any other MCP-aware editor—Cursor, Windsurf, Claude Desktop, VS Code Copilot, Trae) and you only ever say **one of two things** to the agent. It handles the rest.
+
+The two prompts are:
+
+1. **"my keploy cloud replay is failing, please analyse and fix it."**—for a local replay that came back red (agent fetches the latest report from the api-server). Say **"the keploy cloud replay pipeline is failing, please analyse and fix it."** when the failure was in CI—agent extracts the `test_run_id` from your CI log instead. Same diagnose-and-fix routine either way.
+2. **"Add new keploy tests for my changes."**
+
+The agent discovers the app, resolves the Keploy branch, finds the failing run, reads the diff, decides whether the tests need updating or the app has regressed, applies the fix (a code change to the handler, or a test update on the Keploy branch), re-runs replay, and reports back—without follow-up questions. CI still owns the merge.
+
+This page has three parts:
+
+1. **Wire up the Keploy MCP server in your editor**—one-time config; same JSON shape across every supported editor.
+2. **Install the playbook**—a single block that goes into a Claude Code skill, a Cursor rules file, a Windsurf memory file, or any equivalent. It loads automatically whenever the agent sees a Keploy-related prompt.
+3. **Use the two prompts**—what to type, what the agent does.
+
+This page picks up after two one-time setups are already done: the application is [recording in your cluster](/docs/quickstart/k8s-proxy), and the [CI pipeline](/docs/quickstart/k8s-proxy-developer-workflow#wiring-up-your-ci-pipeline) (replay on PR open, branch-merge on PR merge) is wired into your repo as instructed on the Developer Workflow page. The agent only drives the dev-side loop—it never touches CI.
+
+---
+
+## Before you start
+
+- A **Keploy PAT**—Dashboard → Settings → API Keys. Copy the `kep_...` value (shown only once).
+- An **MCP-aware editor**: Claude Code, Cursor, Windsurf, Claude Desktop, VS Code, or Trae.
+
+---
+
+## Step 1—Wire up the Keploy MCP server
+
+All MCP-aware editors accept the exact same JSON config; only the config file path differs. The Claude Code snippet is shown below as the example; for the equivalent config paths on Cursor, Windsurf / Antigravity, GitHub Copilot, and other clients, see [MCP Client Configuration](/docs/running-keploy/agent-test-generation#mcp-client-configuration) on the Agent Test Generation page—the same JSON shape works there too.
+
+**Claude Code** uses `~/.claude.json`:
+
+```json
+{
+  "mcpServers": {
+    "keploy": {
+      "type": "http",
+      "url": "https://api.keploy.io/client/v1/mcp",
+      "headers": {"Authorization": "Bearer kep_..."}
+    }
+  }
+}
+```
+
+Fully quit and reopen your editor after editing the config. MCP clients only re-read config on startup.
+
+---
+
+## Step 2—Install the playbook
+
+The playbook below teaches your agent to run the whole workflow autonomously from the two prompts. Without it, the agent has to rediscover the workflow on every call by reading each tool's individual description—slower and prone to skipping the branch-resolution step.
+
+The exact same block works on every MCP-aware editor; only the file path changes. The walkthrough below uses Claude Code's native Skills system as the example.
+
+### Where the playbook goes
+
+| Editor             | Install path                                                                                                                                                                 |
+| ------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| **Claude Code**    | `~/.claude/skills/keploy/SKILL.md` (global) **or** `<repo>/.claude/skills/keploy/SKILL.md` (committed). Auto-loaded when the dev's prompt matches the skill's `description`. |
+| **Cursor**         | `.cursor/rules/keploy.md` in the repo (committed) **or** Cursor Settings → Rules for AI (global).                                                                            |
+| **Windsurf**       | `.windsurfrules` at the repo root, **or** `~/.codeium/windsurf/memory/global_rules.md` (global).                                                                             |
+| **Claude Desktop** | A Project's "Project knowledge" section, or paste as the first message of every conversation.                                                                                |
+| **VS Code**        | `.github/copilot-instructions.md` (Copilot Chat reads this) or paste in chat per session.                                                                                    |
+
+All five locations accept exactly the same block below. Only Claude Code reads the YAML frontmatter at the top (the `---`-delimited block) to decide when to load the skill; other editors ignore it harmlessly—either keep it for portability or strip it; both work.
+
+Commit the file when you want every teammate's agent to follow the same playbook. Whichever location you pick, **fully restart the editor afterward**—every editor reads skills / rules / memory only at startup.
+
+### The playbook
+
+Use the copy button on the block below and paste it into the file at the path you picked above.
+
+````markdown
+---
+name: keploy
+description: Use this skill whenever the dev mentions keploy—a failing "cloud replay" (local or CI pipeline), a request to "add new keploy tests" or similar, or any Keploy MCP tool. Drives the autonomous Keploy branch workflow end-to-end from two fixed dev prompts—agent resolves app + branch, diagnoses failing runs (local or CI), fixes mocks/tests on a branch, captures new traffic, and validates without follow-up questions.
+---
+
+# Keploy MCP playbook—autonomous developer workflow
+
+The developer will only ever say one of two things to you:
+
+**Prompt A:** "my keploy cloud replay is failing, please analyse and fix it." OR "the keploy cloud replay pipeline is failing, please analyse and fix it."—both forms route to the same routine; the first means the dev's last local replay run failed (find the latest test_run on the branch via api-server), the second means a CI pipeline run failed (the dev should paste the CI log or dashboard URL; extract `test_run_id` from it).
+**Prompt B:** "Add new keploy tests for my changes."
+
+You handle EVERYTHING else autonomously. Discover the app, the branch, the failing run, the code changes—from the filesystem, from git, and from the Keploy api-server. Make decisions. Execute fixes. Report what you did. Do NOT ask the developer follow-up questions unless you are truly blocked (see "When you may ask" at the bottom).
+
+## Hard rules
+
+1. **Branch-first.** Every write to mocks / tests / recordings is branch-scoped. Resolve `branch_id` before any write. If a tool returns "branch_id is required", you skipped this—fix and retry, don't ask the dev.
+2. **Keploy branch name = git branch name.** Detect via `git rev-parse --abbrev-ref HEAD`. Pass that string to `create_branch` (find-or-create, idempotent). Reuse the returned `branch_id` for every subsequent write in this session.
+3. **App resolution from cwd.** `basename $(pwd)` → `listApps({q: <basename>})`. Exactly one match → use it. Multiple → pick the one whose name most specifically matches the dev's compose service. Zero matches → ask the dev once.
+4. **Fix the root cause—app code or test data.** When a test fails because the contract changed intentionally, fix the test on the Keploy branch (`update_mock` / `update_test_suite`). When a test fails because the app regressed, edit the handler code yourself to restore the correct behavior. Announce the file:line change in clear terms before applying the edit so the dev can interrupt if they object; otherwise proceed. Re-run replay to verify in both cases.
+5. **Don't ask what you can find out.** Use `git log`, `git diff`, file reads, and api-server calls. Never ask "what did you change", "which app", or "which branch"—discover them.
+6. **Always end with two dashboard URLs.** The branch diff page and the test-run report page. Format:
+   - `Branch diff: https://app.keploy.io/api-testing/branch-diff?appId=<app_id>&branchId=<branch_id>`
+   - `Run report: https://app.keploy.io/tr?appId=<app_id>&branch=<branch_name>`
+     Swap the base for self-hosted.
+
+## Discovery (run at the start of every conversation, before either routine)
+
+1. **App.** `basename $(pwd)` → `listApps({q: <basename>})` → pick the unambiguous match. Cache `app_id` for the session.
+2. **Branch.** `git rev-parse --abbrev-ref HEAD` → `create_branch({app_id, name: <git branch>})` → cache `branch_id`. If `git rev-parse` returns `HEAD` or exits non-zero, ask the dev for a branch name ONCE.
+
+Both values are sticky for the rest of the conversation. Don't re-discover unless the dev switches git branches.
+
+---
+
+## Routine A—failing cloud replay (local or CI), analyse and fix
+
+### Phase A1—Resolve the `test_run_id`
+
+The goal of this phase is exactly one thing: produce a `test_run_id` you can pass to `get_session_report` in Phase A2. Pick how you get it based on the form of Prompt A:
+
+- **Local form** ("my keploy cloud replay is failing…") → call `listTestRuns({app_id, branch_id, kind: "test_suite_run", limit: 5})` (or the equivalent op-id surfaced by the OpenAPI-generated tool list), pick the most recent run whose status is `failed`, and take its `id`. That's the dev's last local `keploy cloud replay --branch-name` invocation.
+- **CI form** ("the keploy cloud replay pipeline is failing…") → the dev usually pastes a CI log URL or dashboard URL. Extract `test_run_id` from it. If they didn't paste anything, fall back to the local-form lookup above—a CI failure posts the same `test_suite_run` record to the api-server, so the latest-failed lookup still finds it.
+
+Either way, Phase A2 onward is identical—same `get_session_report` call, same cases, same fixes.
+
+### Phase A2—Fetch the full report
+
+Call `get_session_report({app_id, test_run_id, verbose: true})`. Read:
+
+- `status`—`has_failures` is your trigger to continue.
+- `failed_steps[]`—for each entry note `suite_id`, `suite_name`, `step_name`, `method`, `url`, `diff`, `authored_assertions`, `authored_response_body`, `mock_mismatch_failure`, `mock_mismatches`.
+- `mock_mismatch_dominant`—true when >50% of failures are mock-mismatches (the signature of a keploy-side egress-hook issue, not an app regression).
+
+### Phase A3—Diagnose each failing step
+
+Two cases. Decide per step from `git log` / `git diff origin/main...HEAD` (commits on the failing endpoint or its dependencies) and the report's `failed_steps[]` (the test diff and any `mock_mismatches`):
+
+#### Case 1—Bug in the app (regression). You fix the code.
+
+The handler used to behave correctly; a recent commit broke it. Signal: a recent commit touched the failing endpoint or its dependencies AND the test's `authored_response_body` still represents the correct behavior.
+
+Action: edit the handler code yourself to restore the expected behavior—minimal change, consistent with the test's contract. Announce the file:line and a one-line description of the edit **before** applying it so the dev can interrupt if they object; otherwise proceed. Do NOT touch the test—its captured baseline is still correct.
+
+#### Case 2—App behavior drifted intentionally. You fix the test data on the branch.
+
+The contract changed on purpose; the test's recorded baseline is stale. Read `failed_steps[].diff` and `mock_mismatches` together, then pick a sub-action:
+
+**2a—Only a test diff (no mock mismatch driving it).** Update the test step on the branch:
+
+- If the diverging field is genuinely non-deterministic (timestamps, request IDs, generated UUIDs—anything that legitimately changes every run), add its JSONPath to the step's `noise` list via `update_test_suite`. Marking a field as noise tells the runner to ignore diffs on that path.
+- Otherwise update the recorded `response` body on the step via `update_test_suite`. **MUST preserve every kept step's existing `id`**—fetch the test first via `getTestSuite`, copy each step's `id` into your merged `steps_json`, and change only the field(s) the new contract dictates. Omitting step IDs is rejected as a "full rewrite".
+
+**2b—Test diff plus a mock mismatch that's plausibly causing the diff.** The recorded mock is what's out of date—the downstream call's shape changed. Update the mock via `update_mock({app_id, test_set_id, mock_id, branch_id, mock_yaml: <updated yaml>})`. Read the existing mock with `getMock` first to preserve fields you're not changing, then re-run replay.
+
+- If the test still fails after one or two mock edits, the recorded baseline is too far gone to patch piecemeal. Fall back: drop the stale test data (`delete_recording` on the affected test set) and re-capture from scratch using Routine B's flow (`keploy record` against the current behavior, then `keploy upload test-set --branch <git branch>` to land it on the branch).
+
+Multiple failing steps can land in different cases—handle each independently.
+
+### Phase A4—Verify
+
+After every Case-1 (app code edit) or Case-2 (test data edit) fix, run via Bash:
+
+```bash
+keploy cloud replay --app <ns.deployment> --branch-name <git branch>
+```
+
+If still failing, re-enter Phase A2 with the new `test_run_id`. If passing, proceed to A5. Cap retry attempts at 3—if it's still red, the failures are likely a keploy-side proxy issue (your fixes aren't taking effect). Report the residual failures honestly with the `test_run_id` and the run-report URL so the dev can file a keploy bug, then stop.
+
+### Phase A5—Report (exact format)
+
+```
+### Diagnosis
+| Test | Step | Case | Cause |
+| --- | --- | --- | --- |
+| <name> | <step> | 1 / 2a / 2b | <one-line cause from repo inspection> |
+
+### Fixes applied
+- (Case 1) Edited `<file:line>`—`<one-line change description>`.
+- (Case 2a) `update_test_suite` on `<suite_name>`—set noise on `<path>` OR updated response field `<path>`.
+- (Case 2b) `update_mock` on `<mock_name>` (test set `<test_set_id>`) OR `delete_recording` + re-capture via `keploy record` + `keploy upload test-set`.
+- `keploy cloud replay` re-run: `<p>/<t>` tests passed.
+
+### Next step for you
+- (Case 1) Review the code edit at `<file:line>`. Push when satisfied; CI will replay automatically.
+- (Case 2) Push your code change—CI replay will pick up the updated Keploy branch.
+- (Retry cap hit) File a keploy bug with `test_run_id=<id>` and the run-report URL.
+
+Branch diff: https://app.keploy.io/api-testing/branch-diff?appId=<app_id>&branchId=<branch_id>
+Run report: https://app.keploy.io/tr?appId=<app_id>&branch=<branch_name>
+```
+
+---
+
+## Routine B—"add new keploy tests for my changes"
+
+### Phase B1—Identify the changes
+
+1. `git diff origin/main...HEAD --name-only` (fall back to `main...HEAD` if `origin/main` isn't fetched).
+2. Filter to **HTTP-handler files**—route definitions, controllers, request handlers. Skip refactors, test files, docs, generated code, migrations.
+3. For each handler file, read the diff hunks (`git diff origin/main...HEAD -- <file>`) and list the endpoints that were added or modified. Note each one's `method` + `path` + a one-line description.
+4. If nothing handler-relevant changed, tell the dev "no API-handler changes detected on this branch—no new tests needed", and stop.
+
+### Phase B2—Capture traffic for the new endpoints
+
+**Pre-flight: confirm the app starts under the dev's local setup.** Discover the dev's run command from the repo (priority order: `Makefile` targets like `run` / `start` / `dev`, `docker-compose.yml` (`docker compose up -d`), `Procfile`, `package.json` scripts, the README's quickstart section). Start the app with that command, curl a reachable endpoint (`/health`, the root, anything that 200s) to confirm it's serving traffic, then stop it cleanly. Don't ask the dev for help here unless you literally cannot get the app to start—discovering the run command from the repo is on you.
+
+**Make sure the app is fully stopped before running `keploy record`.** `keploy record -c "<dev run command>"` spawns its own instrumented copy of the app; if your pre-flight instance is still running, the two will fight over the port (or the container name). `docker compose down` / kill the PID / whatever stops it cleanly before continuing.
+
+**Capture:**
+
+1. Run `keploy record -c "<dev run command>" --sync` via Bash in the background and note its PID, so you can drive the curls in step 2 and stop the recorder in step 3. The `-c` value is the exact command from your pre-flight; `--sync` records test cases synchronously so each curl is captured in order with no race against the next one. Cloud association happens in Phase B3's upload step, not here—`keploy record` itself is the local OSS command and doesn't take `--cloud-app-id`.
+2. For each new/changed endpoint, drive ONE realistic curl. Infer body shape from the OpenAPI spec if there is one, otherwise from the handler signature itself.
+3. Stop `keploy record` (kill the PID you captured at step 1, or send Ctrl-C equivalent).
+4. The recording lands at `keploy/test-set-N/` on disk.
+
+### Phase B3—Upload to the Keploy branch
+
+```bash
+keploy upload test-set \
+  --app <ns.deployment> \
+  --branch <git branch> \
+  --test-set keploy/test-set-N \
+  --name <descriptive-name>
+```
+
+`<descriptive-name>` should reflect the dev's change (e.g. "checkout-with-discount" if they added a discount field). The `--branch` flag scopes the upload to your sticky branch; subsequent dashboard reviewers see only this diff.
+
+### Phase B4—Validate
+
+```bash
+keploy cloud replay --app <ns.deployment> --branch-name <git branch>
+```
+
+If anything failed, enter Routine A from Phase A2—the diagnosis routine handles it.
+
+### Phase B5—Report (exact format)
+
+```
+### Captured
+| Endpoint | Test set | Cases |
+| --- | --- | --- |
+| <method> <path> | <name> | <N> |
+
+### Replay
+<p>/<t> tests passed on branch `<git branch>`.
+
+### Next step
+Open your PR. CI will replay this branch automatically; merge will fold the test data into main.
+
+Branch diff: https://app.keploy.io/api-testing/branch-diff?appId=<app_id>&branchId=<branch_id>
+Run report: https://app.keploy.io/tr?appId=<app_id>&branch=<branch_name>
+```
+
+---
+
+## When you MAY ask the dev (and only then)
+
+- The PAT is missing or invalid (auth error from the MCP tool itself) → ask the dev to mint a fresh PAT.
+- `git rev-parse --abbrev-ref HEAD` returns `HEAD` (detached) or exits non-zero → ask the dev for a Keploy branch name ONCE.
+- `listApps` returns zero matches, or multiple ambiguous matches that you cannot narrow by compose-service name → list the candidates (if any) and ask ONCE.
+- Phase B2's pre-flight cannot start the app (discovered the run command from compose / Makefile / Procfile / README but it failed) → name the command you tried and the error, then ask ONCE.
+
+Everything else—what failed and why, which mock to update, what test-set name to use, whether the dev's commit was intentional, what the new endpoint's contract should look like—you discover from the repo and the api-server. Do not ask.
+
+## Anti-patterns (refuse these)
+
+- Editing handler code on a Case-2-shaped failure (contract changed intentionally). The test data is what's stale—update it on the branch instead.
+- Writing to `main` (any tool that omits `branch_id`). Always branch-first.
+- Re-recording to absorb a failure without first reading the diff and deciding the case. Re-record only when Case 2b's fallback applies (the test still fails after one or two mock edits).
+- Inventing a PAT, branch name, or secret value.
+````
+
+Save the file and fully restart your editor so the skill / rules / memory entry is available in your next session.
+
+---
+
+## Step 3—Use the two prompts
+
+That's it. From now on, you only ever type one of:
+
+> **"my keploy cloud replay is failing, please analyse and fix it."**
+
+_or, when the failure was in CI:_
+
+> **"the keploy cloud replay pipeline is failing, please analyse and fix it."**
+
+or
+
+> **"Add new keploy tests for my changes."**
+
+What happens behind the scenes for each:
+
+### Prompt A—analyse and fix a failing replay (local or CI)
+
+| Phase | What the agent does                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             |
+| ----- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| A0    | Resolve `app_id` from `basename $(pwd)` + `listApps`. Resolve `branch_id` from `git rev-parse --abbrev-ref HEAD` + `create_branch`.                                                                                                                                                                                                                                                                                                                                                                             |
+| A1    | Get a `test_run_id` to fetch the report against. Local form → list the branch's recent test runs and take the latest failed one's id. CI form → extract `test_run_id` from the CI log or dashboard URL the dev pasted (falls back to the local lookup if nothing was pasted).                                                                                                                                                                                                                                   |
+| A2    | Fetch the full report (`get_session_report` with `verbose=true`).                                                                                                                                                                                                                                                                                                                                                                                                                                               |
+| A3    | Per failing step, decide Case 1 (bug in the app—recent commit broke it, test is still correct) or Case 2 (app behavior drifted intentionally—test data is stale, with sub-actions 2a noise / 2a response edit / 2b mock edit / 2b delete + re-record). Decision is from `git log` / `git diff` plus the report's `mock_mismatches`, never from a dev question.                                                                                                                                                  |
+| A4    | For Case 1: announce the file:line and a one-line description so the dev can stop the agent if they object, then edit the handler code. For Case 2a: `update_test_suite` to add noise on a non-deterministic field, or to update the recorded `response` body (preserve every existing step `id`). For Case 2b: `update_mock` on the affected mock, or—if the baseline is too far gone—`delete_recording` and re-record via Routine B's flow. Either way, re-run `keploy cloud replay --branch-name` to verify. |
+| A5    | Report: diagnosis table (case per step) + fixes applied + next-step-for-you + branch-diff URL + run-report URL.                                                                                                                                                                                                                                                                                                                                                                                                 |
+
+### Prompt B—author new keploy tests
+
+| Phase | What the agent does                                                                                                                                                                                                                                                                                                                                                                |
+| ----- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| B0    | Discovery (same as A0).                                                                                                                                                                                                                                                                                                                                                            |
+| B1    | `git diff origin/main...HEAD` to find handler files that changed; extract added/modified endpoints.                                                                                                                                                                                                                                                                                |
+| B2    | Pre-flight: discover the dev's run command from the repo (Makefile → docker-compose.yml → Procfile → package.json → README), start the app, curl any 200-returning endpoint to confirm it's serving traffic, stop it. Then run `keploy record -c "<dev run command>" --sync`, drive a realistic curl per new endpoint, stop the recorder. Recording lands at `keploy/test-set-N/`. |
+| B3    | `keploy upload test-set --app <ns.deployment> --branch <git branch> --test-set keploy/test-set-N --name <descriptive-name>` to land the bundle on the Keploy branch.                                                                                                                                                                                                               |
+| B4    | `keploy cloud replay --app <ns.deployment> --branch-name <git branch>` to validate. On failure, drop into Routine A.                                                                                                                                                                                                                                                               |
+| B5    | Report: captured endpoints table + replay result + next-step (open PR) + branch-diff URL + run-report URL.                                                                                                                                                                                                                                                                         |
+
+For everything not covered by these two prompts—manually inspecting test data, editing one mock, listing recordings—use the manual flow on the [Developer Workflow](/docs/quickstart/k8s-proxy-developer-workflow) page directly. The two-prompt workflow handles the 90% case; the manual flow is the escape hatch.
+
+---
+
+## Putting it together
+
+Here are the typical scenarios the agent handles—one per case it decides between. Every one starts with the same two-prompt UX and ends with the dev pushing once CI catches up. The variable bit is what the agent does in the middle.
+
+### Scenario 1—App regression (Case 1)
+
+You merged a refactor that accidentally broke the price calculation on `/orders/{id}`. The test still expects the right total.
+
+> _"my keploy cloud replay is failing, please analyse and fix it."_
+
+A0 → A1 (latest failed run) → A2 (report shows `total_amount: 0` vs expected `99.99`). A3 sees your recent commit on the price-calc helper and the test's authored response is still correct → **Case 1**. A4 announces the edit at `pkg/order/calc.go:42`—restoring the line-item subtotal branch—then applies the fix and re-runs replay (green). A5 reports the edit + URLs.
+
+### Scenario 2—Test data drift on the response (Case 2a, response edit)
+
+You renamed a response field from `username` to `display_name` on `/users/{id}` on purpose. CI replay now fails because the recorded response still says `username`.
+
+> _"the keploy cloud replay pipeline is failing, please analyse and fix it."_
+
+A3 sees the rename commit and `authored_assertions` pinned to `username` → **Case 2a**. A4 calls `update_test_suite` to swap the field name on the recorded response (preserving every kept step's `id`), re-runs replay (green). A5 reports the test edit + URLs.
+
+### Scenario 3—Test data drift, non-deterministic field (Case 2a, noise)
+
+The replay started failing on `$.created_at`—a timestamp that differs each run. No code changes near it.
+
+> _"my keploy cloud replay is failing, please analyse and fix it."_
+
+A3 sees the diverging field is genuinely time-varying with no related commit → **Case 2a (noise)**. A4 calls `update_test_suite` to add `$.created_at` to that step's noise list; replay re-runs green.
+
+### Scenario 4—Mock drift from a DB query change (Case 2b, mock edit)
+
+You added a `discount_percent` column to the orders table and updated the `SELECT` to return it. The handler emits the new field, the test expects it, but the recorded mock for the DB call still has the old shape.
+
+> _"my keploy cloud replay is failing, please analyse and fix it."_
+
+A3 sees the schema-change commit and `mock_mismatches` on the SELECT row → **Case 2b**. A4 calls `update_mock` to add `discount_percent` to the mock spec; replay re-runs green. A5 reports the mock edit + URLs.
+
+### Scenario 5—Mock too far gone, full re-record (Case 2b, fallback)
+
+A downstream gRPC client was swapped for HTTP; the recorded mocks are protobuf bytes that no longer apply.
+
+> _"my keploy cloud replay is failing, please analyse and fix it."_
+
+A3 → **Case 2b**. A4 tries one `update_mock` edit—it doesn't pass. The agent falls back: `delete_recording` on the affected test set, then re-records via Routine B's flow (pre-flight → `keploy record -c "<run cmd>" --sync` → curl → `keploy upload test-set --branch <git branch>`). Replay re-runs green.
+
+### Scenario 6—Adding tests for a new endpoint (Routine B)
+
+You added `POST /coupons/redeem`.
+
+> _"Add new keploy tests for my changes."_
+
+B0 → B1 (`git diff origin/main...HEAD` surfaces the new route). B2 pre-flight: agent finds `make run` in the Makefile, brings the app up, `curl /health` returns 200, stops it. Then `keploy record -c "make run" --sync`, curls `POST /coupons/redeem` with a realistic body, stops the recorder. B3 uploads via `keploy upload test-set --app <ns.deployment> --branch <git branch> --name coupons-redeem`. B4 replay returns 1/1 passed. B5 reports the captured endpoint + URLs.
+
+---
+
+Across every scenario, you only ever spoke one of two sentences. You push your code change (and, for Case 1, the agent's app-side edit). CI replays the branch on the PR; merge runs `keploy cloud branch-merge` and the test data lands on main.
+
+For the same flow done manually (CLI / dashboard, no agent), see [Developer Workflow with Keploy Proxy](/docs/quickstart/k8s-proxy-developer-workflow).
diff --git a/versioned_sidebars/version-4.0.0-sidebars.json b/versioned_sidebars/version-4.0.0-sidebars.json
index 6f741ddee..55f463295 100644
--- a/versioned_sidebars/version-4.0.0-sidebars.json
+++ b/versioned_sidebars/version-4.0.0-sidebars.json
@@ -149,6 +149,7 @@
           "items": [
             "quickstart/k8s-proxy",
             "quickstart/k8s-proxy-developer-workflow",
+            "quickstart/k8s-proxy-llm-workflow",
             "running-keploy/k8s-proxy-daemonset-architecture"
           ]
         },

From 3b31622d2b2aa0999456716af6f537110989587c Mon Sep 17 00:00:00 2001
From: Charan Kamarapu <kamarapucharan@gmail.com>
Date: Sun, 7 Jun 2026 13:41:36 +0530
Subject: [PATCH 02/11] docs(k8s-proxy-llm-workflow): trim playbook to
 verified-working form
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the long-form playbook with the trimmed, validated form
(11,305 → 7,939 tok + 2 anti-patterns ≈ 8,095 tok in source). Same
load-bearing rules preserved verbatim:
- Step 0 ALLOWLIST + uncommitted-edit revert mandate
- listTestReports EXACTLY ONCE per session
- getApp memoize (≤1 call/session)
- fields=[...] on getTestReportFull + getApp
- drop listMocks default; targeted getMock instead
- record → upload → delete order for 2b-recapture
- sql_ast_hash CLI mandate (use `keploy mock patch`, not MCP update_mock)
- --disableReportUpload=false and --cluster mandatory
- pipe all keploy/docker output through tail/grep
- two new anti-patterns: ban keploy --help dump, ban Read of
  keploy/ local cache files

Verified against S1 scenario at 632k total tokens, 13/13 effective
asserts.
---
 .../quickstart/k8s-proxy-llm-workflow.md      | 367 +++++++-----------
 1 file changed, 150 insertions(+), 217 deletions(-)

diff --git a/versioned_docs/version-4.0.0/quickstart/k8s-proxy-llm-workflow.md b/versioned_docs/version-4.0.0/quickstart/k8s-proxy-llm-workflow.md
index 5b2c2e1d5..53fe02954 100644
--- a/versioned_docs/version-4.0.0/quickstart/k8s-proxy-llm-workflow.md
+++ b/versioned_docs/version-4.0.0/quickstart/k8s-proxy-llm-workflow.md
@@ -24,187 +24,212 @@ keywords:
   - autonomous agent
 ---
 
-# Developer + LLM Workflow with Keploy Proxy
-
-import ProductTier from '@site/src/components/ProductTier';
-import useBaseUrl from '@docusaurus/useBaseUrl';
+# Keploy MCP playbook—autonomous developer workflow
 
-<ProductTier tiers="Enterprise" offerings="Self-Hosted, Dedicated" />
+The developer will only ever say one of two things to you:
 
-The [Developer Workflow](/docs/quickstart/k8s-proxy-developer-workflow) page walks through the manual flow end-to-end—things like creating a branch, editing mocks and test cases, replaying changes, opening a PR, merging, and so on. Every step has an MCP tool behind it. This page goes one step further: install the playbook below as a **Claude Code skill** (or the equivalent rules / memory entry in any other MCP-aware editor—Cursor, Windsurf, Claude Desktop, VS Code Copilot, Trae) and you only ever say **one of two things** to the agent. It handles the rest.
+**Prompt A:** "my keploy cloud replay is failing, please analyse and fix it." OR "the keploy cloud replay pipeline is failing, please analyse and fix it."—both forms route to the same routine; the first means the dev's last local replay run failed (find the latest test_run on the branch via api-server), the second means a CI pipeline run failed (the dev should paste the CI log or dashboard URL; extract `test_run_id` from it).
+**Prompt B:** "Add new keploy tests for my changes."
 
-The two prompts are:
+You handle EVERYTHING else autonomously. Discover the app, the branch, the failing run, the code changes—from the filesystem, from git, and from the Keploy api-server. Make decisions. Execute fixes. Report what you did. Do NOT ask the developer follow-up questions unless you are truly blocked (see "When you may ask" at the bottom).
 
-1. **"my keploy cloud replay is failing, please analyse and fix it."**—for a local replay that came back red (agent fetches the latest report from the api-server). Say **"the keploy cloud replay pipeline is failing, please analyse and fix it."** when the failure was in CI—agent extracts the `test_run_id` from your CI log instead. Same diagnose-and-fix routine either way.
-2. **"Add new keploy tests for my changes."**
+## Hard rules
 
-The agent discovers the app, resolves the Keploy branch, finds the failing run, reads the diff, decides whether the tests need updating or the app has regressed, applies the fix (a code change to the handler, or a test update on the Keploy branch), re-runs replay, and reports back—without follow-up questions. CI still owns the merge.
+1. **Branch-first.** Every write to mocks / tests / recordings is branch-scoped. Resolve `branch_id` before any write. If a tool returns "branch_id is required", you skipped this—fix and retry, don't ask the dev.
+2. **Keploy branch name = git branch name.** Detect via `git rev-parse --abbrev-ref HEAD`. Pass that string to `create_branch` (find-or-create, idempotent). Reuse the returned `branch_id` for every subsequent write in this session.
+3. **App resolution from cwd.** `basename $(pwd)` → `listApps({q: <basename>})`. Exactly one match → use it. Multiple → pick the one whose name most specifically matches the dev's compose service. Zero matches → ask the dev once.
+4. **Fix the root cause—app code or test data.** When a test fails because the contract changed intentionally, fix the test on the Keploy branch (`keploy mock patch` CLI for mocks; `updateTestCase` MCP tool for test cases). When a test fails because the app regressed, edit the handler code yourself to restore the correct behavior. Announce the file:line change in clear terms before re-running replay so the dev can interrupt if they object; otherwise proceed. Re-run replay to verify in both cases.
+5. **Don't ask what you can find out.** Use `git log`, `git diff`, file reads, and api-server calls. Never ask "what did you change", "which app", or "which branch"—discover them.
+6. **Always end with two dashboard URLs.** The branch diff page and the test-run report page. Format:
+   - `Branch diff: https://app.keploy.io/api-testing/branch-diff?appId=<app_id>&branchId=<branch_id>`
+   - `Run report: https://app.keploy.io/tr/<test_run_id>?appId=<app_id>`
+     The `<test_run_id>` is the id from Phase A1 (or, for Routine B, the id of the most recent run after `keploy cloud replay`). Swap the base for self-hosted.
 
-This page has three parts:
+## Discovery (run at the start of every conversation, before either routine)
 
-1. **Wire up the Keploy MCP server in your editor**—one-time config; same JSON shape across every supported editor.
-2. **Install the playbook**—a single block that goes into a Claude Code skill, a Cursor rules file, a Windsurf memory file, or any equivalent. It loads automatically whenever the agent sees a Keploy-related prompt.
-3. **Use the two prompts**—what to type, what the agent does.
+1. **App.** `basename $(pwd)` → `listApps({q: <basename>})` → pick the unambiguous match. Cache `app_id` for the session.
+2. **Branch.** `git rev-parse --abbrev-ref HEAD` → `create_branch({app_id, name: <git branch>})` → cache `branch_id`. If `git rev-parse` returns `HEAD` or exits non-zero, ask the dev for a branch name ONCE.
+3. **App context (optional, only when you need cluster/ns/deployment).** `getApp({appId: app_id, fields: ["name","namespace","deployment","origin.clusterName","origin.namespace","origin.deployment"]})`. The full app schema is ~16k tokens; the projected response is ~300 tokens. **Call this AT MOST ONCE per session.** The returned identity fields are sticky — hold them mentally, do not re-call `getApp` later in the conversation for the same `app_id`.
 
-This page picks up after two one-time setups are already done: the application is [recording in your cluster](/docs/quickstart/k8s-proxy), and the [CI pipeline](/docs/quickstart/k8s-proxy-developer-workflow#wiring-up-your-ci-pipeline) (replay on PR open, branch-merge on PR merge) is wired into your repo as instructed on the Developer Workflow page. The agent only drives the dev-side loop—it never touches CI.
+All three values are sticky for the rest of the conversation. Don't re-discover unless the dev switches git branches. Re-calling `getApp` mid-session is an anti-pattern — its 16k schema lives in your context for every subsequent step regardless of whether you ask for it again.
 
 ---
 
-## Before you start
+## Routine A—failing cloud replay (local or CI), analyse and fix
 
-- A **Keploy PAT**—Dashboard → Settings → API Keys. Copy the `kep_...` value (shown only once).
-- An **MCP-aware editor**: Claude Code, Cursor, Windsurf, Claude Desktop, VS Code, or Trae.
+### Phase A1—Resolve the `test_run_id`
 
----
+The goal of this phase is exactly one thing: produce a `test_run_id` you can pass to `getTestReportFull` in Phase A2. Pick how you get it based on the form of Prompt A:
+
+- **Local form** ("my keploy cloud replay is failing…") → call `listTestReports({appId: app_id, branch_id, status: "FAILED", limit: 5})` **EXACTLY ONCE**. Pick `data[0]` (newest first by `created_at`) and take its `id`. That's the dev's last local `keploy cloud replay --branch-name` invocation — `keploy cloud replay` uploads its report into the legacy `/tr` collection, which is what `listTestReports` queries. **Do NOT retry with different status / source / branch_id permutations** if the first call returns empty — that wastes context on calls that will all return the same empty set (the run genuinely doesn't exist on this branch yet; tell the dev to run `keploy cloud replay` first). `status` is CASE-SENSITIVE — use the exact value `"FAILED"`, not `"failed"` or `"Failed"`. Only when the full report is likely to be large, call `getTestReport({appId: app_id, reportId: test_run_id})` first as a cheap roll-up probe before pulling the full report.
+- **CI form** ("the keploy cloud replay pipeline is failing…") → the dev usually pastes a CI log URL or dashboard URL. Extract `test_run_id` from it. If they didn't paste anything, fall back to the local-form lookup above—a CI failure posts the same legacy test-run-report record to the api-server, so the latest-failed lookup still finds it. Use `source: "ci"` on the list call to scope to runs that carry CI metadata.
+
+Either way, Phase A2 onward is identical—same `getTestReportFull` call, same routes, same fixes.
 
-## Step 1—Wire up the Keploy MCP server
+### Phase A2—Fetch the full report
+
+Call `getTestReportFull({appId: app_id, reportId: test_run_id, fields: ["failed_steps[].diff", "mock_mismatches", "status", "ci_metadata"]})`. The OpenAPI-generated tool's **path** parameters are camelCase (`appId`, `reportId`) per the spec, while its **query** parameters stay snake_case (`include_oss_report`, `mock_mismatches_only`, `max_test_cases_per_set`, `fields`); pass each one with the literal name the spec declares.
+
+**Use `fields` aggressively** — full report is ~34k tokens, projection brings it to ~5k. Supports dotted paths + array wildcards. Defaults for the other params return roll-up + every test set + every per-case diff in one round-trip. Read:
+
+- `report.status`—`FAILED` is your trigger to continue.
+- `report.ci_metadata`—when populated this is a CI run; `provider` / `commit_sha` / `pr_number` give you the surrounding context.
+- `test_sets[]`—per set, each entry carries `tests[]` (per-case name + status roll-up) and `test_cases[]` (the inflated per-case rows). Iterate `test_cases[]` and, for any case whose `status` is `FAILED`, read:
+  - `oss_report.req.{method,url}` — which endpoint failed.
+  - `oss_report.result.status_code.{expected,actual}` — status-code diff.
+  - `oss_report.result.headers_result[].{expected,actual,normal}` — per-header diff (`normal=false` means a real mismatch).
+  - `oss_report.result.body_result[].{expected,actual,normal,type}` — per-body diff. This is your primary signal for an authored-response drift.
+  - `oss_report.mock_mismatches.{expected_mocks,actual_mocks}` — set of mocks the replayer recorded versus the set it actually consumed during this run. Populated for both passed and failed cases when consumed-mock data is known. Non-empty + a body diff together is the signature of a mock-driven regression.
+  - `oss_report.failure_info.mock_mismatch` — same shape, legacy fallback for reports produced by replayers older than v3.5.49.
+  - `oss_report.noise` — JSONPaths the recorder has already marked as ignorable (don't re-flag these as drifts).
+- For investigating only mock-driven failures on a large run, pass `mock_mismatches_only=true` — `test_cases[]` is restricted to entries with non-empty `mock_mismatches` (or the legacy fallback) and the response stays token-safe.
 
-All MCP-aware editors accept the exact same JSON config; only the config file path differs. The Claude Code snippet is shown below as the example; for the equivalent config paths on Cursor, Windsurf / Antigravity, GitHub Copilot, and other clients, see [MCP Client Configuration](/docs/running-keploy/agent-test-generation#mcp-client-configuration) on the Agent Test Generation page—the same JSON shape works there too.
+### Phase A3—Diagnose each failing test case
 
-**Claude Code** uses `~/.claude.json`:
+**Before classifying, verify intent.** The commit-message channel is **asymmetric**: it can confirm "deliberate" with high confidence, but it cannot confirm "bug." Real regressions ship with normal-sounding messages ("add field X", "extract helper", "switch to v2 client") — no one writes `fix(orders): accidentally broke duplicate-detect`. So the message has exactly one trustworthy verdict: deliberate → Case 2. Everything else means "message doesn't decide" — go look at the working tree, then the diff.
 
-```json
-{
-  "mcpServers": {
-    "keploy": {
-      "type": "http",
-      "url": "https://api.keploy.io/client/v1/mcp",
-      "headers": {"Authorization": "Bearer kep_..."}
-    }
-  }
-}
+Honest classifier — apply in order, stop at the first match:
+
+**Step 0 — Working-tree check (UNCONDITIONAL; run FIRST, before any classifier MCP or git-history call).**
+
+```
+git status -s -- <failing-handler-path>
+git diff -- <failing-handler-path>          # unstaged
+git diff --cached -- <failing-handler-path> # staged but not committed
 ```
 
-Fully quit and reopen your editor after editing the config. MCP clients only re-read config on startup.
+Run all three **every time**, even when the tree looks clean. The empty result IS the evidence required to advance to Step 1. Skipping = silent misclassification when the assumption is wrong.
 
----
+**ALLOWLIST of MCP calls permitted before Step 0** (Phase A1 discovery only): `listApps`, `getApp`, `create_branch`, `list_branches`, `listTestReports`, `getTestReport`, `tools/list`. EVERY other call — `getTestReportFull`, `getTestCase`, `getMock`, `listMocks`, `getRecording`, `listRecordings`, `updateTestCase`, `update_mock`, `delete_recording` — is classifier/write and MUST come AFTER Step 0. Reading `getTestCase` first biases toward Case 2 framing.
 
-## Step 2—Install the playbook
+Any uncommitted edit touching the failing handler's source → **Case 1, mandatory.** Revert (or ask the dev); do NOT proceed to commit-history reasoning. Uncommitted edits beat any committed-history signal — they can't be the deliberate new contract.
 
-The playbook below teaches your agent to run the whole workflow autonomously from the two prompts. Without it, the agent has to rediscover the workflow on every call by reading each tool's individual description—slower and prone to skipping the branch-resolution step.
+**Step 1 — Identify the recording's snapshot anchor.**
 
-The exact same block works on every MCP-aware editor; only the file path changes. The walkthrough below uses Claude Code's native Skills system as the example.
+If Step 0 is clean (no uncommitted edits on the failing path), find the commit the failing test set was recorded against. Sources, in order of preference:
 
-### Where the playbook goes
+- The test set's `created_at` timestamp from `listRecordings` / `getRecording` → `git log --until=<ts> -1 --format=%H -- <failing-handler-path>` gives the most recent commit touching that path at the time the recording was captured.
+- The nearest ancestor commit of HEAD on the branch that pre-dates the recording's creation.
 
-| Editor             | Install path                                                                                                                                                                 |
-| ------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| **Claude Code**    | `~/.claude/skills/keploy/SKILL.md` (global) **or** `<repo>/.claude/skills/keploy/SKILL.md` (committed). Auto-loaded when the dev's prompt matches the skill's `description`. |
-| **Cursor**         | `.cursor/rules/keploy.md` in the repo (committed) **or** Cursor Settings → Rules for AI (global).                                                                            |
-| **Windsurf**       | `.windsurfrules` at the repo root, **or** `~/.codeium/windsurf/memory/global_rules.md` (global).                                                                             |
-| **Claude Desktop** | A Project's "Project knowledge" section, or paste as the first message of every conversation.                                                                                |
-| **VS Code**        | `.github/copilot-instructions.md` (Copilot Chat reads this) or paste in chat per session.                                                                                    |
+Commits **at or before** the anchor are already encoded in the recording — they cannot be a drift source, regardless of how "intentional" their messages read. Skip them.
 
-All five locations accept exactly the same block below. Only Claude Code reads the YAML frontmatter at the top (the `---`-delimited block) to decide when to load the skill; other editors ignore it harmlessly—either keep it for portability or strip it; both work.
+**Step 2 — Commit-message check (post-anchor commits only).** For each commit strictly after the anchor that touches the failing handler:
 
-Commit the file when you want every teammate's agent to follow the same playbook. Whichever location you pick, **fully restart the editor afterward**—every editor reads skills / rules / memory only at startup.
+- (a) **Message names a feature / refactor / contract change / incident remediation** → **Case 2**. Dev's stated intent is the contract; do NOT rewrite the deliberate code into a non-IO equivalent (still Case 1 in disguise).
+- (b) **Message silent/generic** AND the diff touches the exact drifted field per `oss_report.result.body_result[].expected` AND the baseline still represents the correct contract → **Case 1**. Side-effect regression; the signal is the diff×baseline intersection, not the message.
+- (c) **Otherwise** (silent + drift not in diff, or unclear) → **Case 2** (default to test-data work; silence is not evidence of a bug).
+- (d) **No post-anchor commits touch the failing handler AND Step 0 clean** → ask the dev; the likely cause is flakiness, the environment, or the proxy rather than a code change.
 
-### The playbook
+Classify each failing test case using `oss_report.result` body/header diff + `oss_report.mock_mismatches`:
 
-Use the copy button on the block below and paste it into the file at the path you picked above.
+#### Case 1—Bug in the app (regression). You fix the code.
 
-````markdown
----
-name: keploy
-description: Use this skill whenever the dev mentions keploy—a failing "cloud replay" (local or CI pipeline), a request to "add new keploy tests" or similar, or any Keploy MCP tool. Drives the autonomous Keploy branch workflow end-to-end from two fixed dev prompts—agent resolves app + branch, diagnoses failing runs (local or CI), fixes mocks/tests on a branch, captures new traffic, and validates without follow-up questions.
----
+The handler used to behave correctly; an uncommitted edit or a recent commit broke it as a side effect. Signal: Step 0 flagged an uncommitted edit OR the diff touches the same field the report says drifted, the commit message doesn't claim that field as the intended change, and `oss_report.result.body_result[].expected` (the recorded baseline) still represents the correct behavior.
 
-# Keploy MCP playbook—autonomous developer workflow
+**Action — execute IN THIS ORDER. Skipping or reordering any step is a defect; the rebuild does nothing without the edit, and the replay tells you nothing without the rebuild.**
 
-The developer will only ever say one of two things to you:
+**Step C1.1 — Edit the source. This is THE fix.** Pick one of:
 
-**Prompt A:** "my keploy cloud replay is failing, please analyse and fix it." OR "the keploy cloud replay pipeline is failing, please analyse and fix it."—both forms route to the same routine; the first means the dev's last local replay run failed (find the latest test_run on the branch via api-server), the second means a CI pipeline run failed (the dev should paste the CI log or dashboard URL; extract `test_run_id` from it).
-**Prompt B:** "Add new keploy tests for my changes."
+- **Uncommitted regression (Step 0 fired):** Run `git checkout -- <failing-handler-path>` (or `git restore -- <path>`) via the Bash tool to discard the working-tree edit. This is the literal fix — the test was passing against the committed source, and the working-tree edit is what broke it. Do NOT skip to rebuild expecting the rebuild to "restore" the source; image rebuild reads from the current source, so without the checkout you'll rebuild the same broken code.
+- **Committed regression (Step 2 fired):** Edit the **application source code** (`.go` / `.py` / `.ts` / `.java` / etc.) yourself using the Edit/Write tool to restore the expected behavior — minimal change, consistent with the test's contract.
 
-You handle EVERYTHING else autonomously. Discover the app, the branch, the failing run, the code changes—from the filesystem, from git, and from the Keploy api-server. Make decisions. Execute fixes. Report what you did. Do NOT ask the developer follow-up questions unless you are truly blocked (see "When you may ask" at the bottom).
+The fix MUST live in the application source. Do NOT modify `Dockerfile*`, `docker-compose*.yml`, `keploy.yml`, `.env*`, k8s manifests, CI workflows, or the replay command line; do NOT introduce a new env-var-driven branch in code as a way to toggle the bug away at runtime. Announce the file:line and a one-line description of the edit **before** applying it so the dev can interrupt if they object; otherwise proceed. Do NOT touch the test—its captured baseline is still correct.
 
-## Hard rules
+**Step C1.2 — Verify the edit landed.** Run `git status -s -- <failing-handler-path>` and `git diff -- <failing-handler-path>`. For an uncommitted-regression revert, status + diff MUST both be empty (meaning the working tree matches the committed baseline); if either still shows changes, redo C1.1. For a committed-regression fix, the diff MUST show only the edit you announced. Do not proceed to rebuild until verification is clean.
 
-1. **Branch-first.** Every write to mocks / tests / recordings is branch-scoped. Resolve `branch_id` before any write. If a tool returns "branch_id is required", you skipped this—fix and retry, don't ask the dev.
-2. **Keploy branch name = git branch name.** Detect via `git rev-parse --abbrev-ref HEAD`. Pass that string to `create_branch` (find-or-create, idempotent). Reuse the returned `branch_id` for every subsequent write in this session.
-3. **App resolution from cwd.** `basename $(pwd)` → `listApps({q: <basename>})`. Exactly one match → use it. Multiple → pick the one whose name most specifically matches the dev's compose service. Zero matches → ask the dev once.
-4. **Fix the root cause—app code or test data.** When a test fails because the contract changed intentionally, fix the test on the Keploy branch (`update_mock` / `update_test_suite`). When a test fails because the app regressed, edit the handler code yourself to restore the correct behavior. Announce the file:line change in clear terms before re-running replay so the dev can interrupt if they object; otherwise proceed. Re-run replay to verify in both cases.
-5. **Don't ask what you can find out.** Use `git log`, `git diff`, file reads, and api-server calls. Never ask "what did you change", "which app", or "which branch"—discover them.
-6. **Always end with two dashboard URLs.** The branch diff page and the test-run report page. Format:
-   - `Branch diff: https://app.keploy.io/api-testing/branch-diff?appId=<app_id>&branchId=<branch_id>`
-   - `Run report: https://app.keploy.io/tr?appId=<app_id>&branch=<branch_name>`
-     Swap the base for self-hosted.
+**Step C1.3 — Rebuild the image.** Only AFTER C1.2 clean: `docker build -t <image>:<tag> <build-context>`. Cloud replay does not compile from source — see "rebuild before replay" below for tag-matching rules.
 
-## Discovery (run at the start of every conversation, before either routine)
+**Step C1.4 — Replay.** Then run `keploy cloud replay`. Expect green. If still red after a clean C1.1/C1.2 revert + C1.3 rebuild, investigate further — do NOT flip to Case 2 (the Step 0 signal is the strongest available).
 
-1. **App.** `basename $(pwd)` → `listApps({q: <basename>})` → pick the unambiguous match. Cache `app_id` for the session.
-2. **Branch.** `git rev-parse --abbrev-ref HEAD` → `create_branch({app_id, name: <git branch>})` → cache `branch_id`. If `git rev-parse` returns `HEAD` or exits non-zero, ask the dev for a branch name ONCE.
+**Anti-patterns (both forbidden):** (1) jumping to `docker build` without executing C1.1 — rebuilds the broken code unchanged; (2) flipping from Case 1 → Case 2 mid-run — Step 0 + Step 2 classification is binding for this iteration. Persistent red after a clean C1.1-C1.3 means ambient state (stale `mock-*-patch.yaml` files, leftover branch test sets) is poisoning the run; surface that to the dev, don't override the classification.
 
-Both values are sticky for the rest of the conversation. Don't re-discover unless the dev switches git branches.
+#### Case 2—App behavior drifted intentionally. You fix the test data on the branch.
 
----
+The contract changed on purpose; the test's recorded baseline is stale. Read `oss_report.result` (status / headers / body diff) and `oss_report.mock_mismatches` together, then pick a sub-action:
 
-## Routine A—failing cloud replay (local or CI), analyse and fix
+**2a—Only a test diff (no mock mismatch driving it).** Update the test data on the branch. The legacy `/tr` flow stores recordings as test cases, so the write tool is the `updateTestCase` MCP tool (or the `keploy mock patch` CLI if the mismatch is on the recorded response of a downstream mock):
 
-### Phase A1—Resolve the `test_run_id`
+- If the diverging field is genuinely non-deterministic (timestamps, request IDs, generated UUIDs—anything that legitimately changes every run), add its JSONPath to the test case's `noise` map via `updateTestCase`. Marking a field as noise tells the runner to ignore diffs on that path; once added, the next replay should treat the same divergence as `normal=true`.
+- Otherwise update the recorded `response` body on the test case via `updateTestCase`. Fetch the existing case first via `getTestCase` so you only mutate the fields the new contract dictates and don't drop unrelated keys.
 
-The goal of this phase is exactly one thing: produce a `test_run_id` you can pass to `get_session_report` in Phase A2. Pick how you get it based on the form of Prompt A:
+**2b—Test diff plus a mock mismatch that's plausibly causing the diff.** The recorded mock is what's out of date—the downstream call's shape changed. Look at `oss_report.mock_mismatches.expected_mocks` (what the recorder captured) vs `actual_mocks` (what the replayer actually consumed) — entries that appear in `actual_mocks` but not `expected_mocks` are the new outgoing calls you need to capture.
 
-- **Local form** ("my keploy cloud replay is failing…") → call `listTestRuns({app_id, branch_id, kind: "test_suite_run", limit: 5})` (or the equivalent op-id surfaced by the OpenAPI-generated tool list), pick the most recent run whose status is `failed`, and take its `id`. That's the dev's last local `keploy cloud replay --branch-name` invocation.
-- **CI form** ("the keploy cloud replay pipeline is failing…") → the dev usually pastes a CI log URL or dashboard URL. Extract `test_run_id` from it. If they didn't paste anything, fall back to the local-form lookup above—a CI failure posts the same `test_suite_run` record to the api-server, so the latest-failed lookup still finds it.
+**Data flow for Case 2b (internalise before touching anything):**
 
-Either way, Phase A2 onward is identical—same `get_session_report` call, same routes, same fixes.
+- **Cloud (api-server + mongo) is the single source of truth.** Every write (`keploy mock patch`, `updateTestCase`, `delete_recording`, `keploy upload test-set`) mutates the cloud, not a local file. `keploy cloud replay` re-downloads the bundle into a throwaway local `keploy/<test_set_id>/` cache each run — never hand-edit YAML there, it's wiped on the next run.
+- **A 2xx write is persisted in mongo.** Use `getMock` / `getTestCase` for readback if you want to double-check; you do NOT need to "push" or "force-refresh" the bundle.
+- **`keploy upload test-set` has one valid use: landing a fresh `keploy record` capture on a branch (step 2 of 2b-recapture).** Don't re-upload an edited bundle — it creates a duplicate test set, doesn't replace the failing one.
 
-### Phase A2—Fetch the full report
+**Before the ladder: check whether patch is even applicable.**
 
-Call `get_session_report({app_id, test_run_id, verbose: true})`. Read:
+`keploy mock patch` can only modify an EXISTING mock entry. It cannot add a new one. So if the source change introduced a new outbound call (new SQL query, new downstream HTTP, new redis op) that has no corresponding entry in the recorded mocks, patching is structurally impossible — there is nothing to patch.
 
-- `status`—`has_failures` is your trigger to continue.
-- `failed_steps[]`—for each entry note `suite_id`, `suite_name`, `step_name`, `method`, `url`, `diff`, `authored_assertions`, `authored_response_body`, `mock_mismatch_failure`, `mock_mismatches`.
-- `mock_mismatch_dominant`—true when >50% of failures are mock-mismatches (the signature of a keploy-side egress-hook issue, not an app regression).
+Detect this case from the report's `mock_mismatches`:
 
-### Phase A3—Diagnose each failing step
+- `actual_mocks` lists the outbound calls the app made during replay.
+- `expected_mocks` lists the calls the recording has mocks for.
+- **If `actual_mocks` contains entries with no match in `expected_mocks`** (e.g. a `SELECT … FOR UPDATE` query that doesn't appear in any recorded mock's `sqlNormalized`), the bundle is missing an entry. Skip the patch ladder and go directly to **2b-recapture** — that is the only path that can introduce a new mock entry without hand-guessing its response shape.
 
-Two cases. Decide per step from `git log` / `git diff origin/main...HEAD` (commits on the failing endpoint or its dependencies) and the report's `failed_steps[]` (the test diff and any `mock_mismatches`):
+If every `actual_mocks` entry has a matching `expected_mocks` entry but the *values* differ (column order, bind values, response rows), patch IS applicable — proceed to the ladder.
 
-#### Case 1—Bug in the app (regression). You fix the code.
+**Escalation ladder when patches don't make replay green:** `keploy mock patch` ×≤2 → 2b-recapture (record/upload/delete) ×1 → **then** stop-and-report. Verify each patch with `getMock` readback. Reporting prematurely (after 2 failed patches, skipping recapture) leaves a fixable drift uncaught.
 
-The handler used to behave correctly; a recent commit broke it. Signal: a recent commit touched the failing endpoint or its dependencies AND the test's `authored_response_body` still represents the correct behavior.
+**Pick patch vs recapture by drift *shape*, not "is this deliberate":**
 
-Action: edit the handler code yourself to restore the expected behavior—minimal change, consistent with the test's contract. Announce the file:line and a one-line description of the edit **before** applying it so the dev can interrupt if they object; otherwise proceed. Do NOT touch the test—its captured baseline is still correct.
+- **2b-patch — DEFAULT for scoped *value* changes on EXISTING mocks.** Tool: `keploy mock patch --app-id <appUUID> --branch-id <branchUUID> --test-set-id <tsUUID> --mock-id <name> --mock-yaml-file <path>` (CLI). Read existing with `getMock` first, write patched YAML to a file, invoke CLI. Use when the drift is a value change on an existing mock entry (one operator changed, one column / header / constant differs, response shape same family). **New query / new downstream call → NOT patchable → 2b-recapture.**
 
-#### Case 2—App behavior drifted intentionally. You fix the test data on the branch.
+  > **Always use the CLI, not MCP `update_mock`.** The CLI re-derives kind-specific fields the agent can't compute — most importantly `sql_ast_hash` for PostgresV3 (sha256 of the parsed-normalized AST the matcher keys on). MCP `update_mock` with a stale `sql_ast_hash` writes 2xx, `getMock` echoes new SQL back, but replay still fails because the matcher's hash lookup misses.
 
-The contract changed on purpose; the test's recorded baseline is stale. Read `failed_steps[].diff` and `mock_mismatches` together, then pick a sub-action:
+- **2b-recapture — ONLY for changes that can't be patched cleanly** (a new outbound call with no recorded mock entry, or sweeping drift: call graph rewritten, request/response shapes diverge wholesale, multiple endpoints drifted at once). Order: (1) `keploy record`, (2) `keploy upload test-set --branch <git branch>`, (3) **only after upload succeeds**, `delete_recording` on the stale set. **Never `delete_recording` first** — reversing the order leaves the branch at zero coverage and the next replay "passes" trivially (zero tests = zero failures, indistinguishable from a real fix).
 
-**2a—Only a test diff (no mock mismatch driving it).** Update the test step on the branch:
+  > **Recapture is YOU driving traffic. Seed deliberately, drive surgically.** Keploy has no scoped-recapture primitive. Before re-recording, read each existing case's `http_req` + recorded responses to learn what state it assumes (e.g. recorded `GET /api/orders` returned 7 rows but the suite POSTs only 2 → 5 pre-seeded). SQL INSERT / S3 copy / Kafka seed that state, then `keploy record` and drive ONLY the affected case's curls. Driving everything against a fresh DB silently degrades coverage — a `GET ?user_id=Mallory` with no Mallory row captures `{"count":0,"orders":[]}` and replays green without verifying anything.
 
-- If the diverging field is genuinely non-deterministic (timestamps, request IDs, generated UUIDs—anything that legitimately changes every run), add its JSONPath to the step's `noise` list via `update_test_suite`. Marking a field as noise tells the runner to ignore diffs on that path.
-- Otherwise update the recorded `response` body on the step via `update_test_suite`. **MUST preserve every kept step's existing `id`**—fetch the test first via `getTestSuite`, copy each step's `id` into your merged `steps_json`, and change only the field(s) the new contract dictates. Omitting step IDs is rejected as a "full rewrite".
+**Two constraints on every Case 2b action:**
 
-**2b—Test diff plus a mock mismatch that's plausibly causing the diff.** The recorded mock is what's out of date—the downstream call's shape changed. Update the mock via `update_mock({app_id, test_set_id, mock_id, branch_id, mock_yaml: <updated yaml>})`. Read the existing mock with `getMock` first to preserve fields you're not changing, then re-run replay.
+- **Fixtures are branch-scoped — never copy across branches.** Each Keploy branch carries its own recorded fixtures, mocks, and mappings, captured against THAT branch's app state. NEVER upload a recording (or a `keploy mock patch` derivative) from a sibling branch or from main onto the current branch as a shortcut "fix." The fixture's recorded app-state assumptions belong to where it was captured; planting it on a branch with different code state silently corrupts replay lineage and confuses every downstream reader of the branch history. If you need fresh fixtures, re-record against the CURRENT branch's app via `keploy record` + `keploy upload test-set --branch <git branch>`.
 
-- If the test still fails after one or two mock edits, the recorded baseline is too far gone to patch piecemeal. Fall back: drop the stale test data (`delete_recording` on the affected test set) and re-capture from scratch using Routine B's flow (`keploy record` against the current behavior, then `keploy upload test-set --branch <git branch>` to land it on the branch).
+- **Reuse before re-record.** Before `keploy record` / `keploy upload test-set` for 2b: `listRecordings({app_id, branch_id})`, then `getMock` per ID the report's `mock_mismatches.expected_mocks` named. **Do NOT call `listMocks`** — the report already names drifted IDs; `listMocks` adds ~28k tokens of context-polluting inventory. If a current-branch recording already covers the endpoint with the missing mock entries, `delete_recording` the failing set and stop. Re-record is the LAST resort.
 
-Multiple failing steps can land in different cases—handle each independently.
+Multiple failing test cases can land in different cases—handle each independently.
 
 ### Phase A4—Verify
 
-After every Case-1 (app code edit) or Case-2 (test data edit) fix, run via Bash:
+**After a Case 1 source edit, rebuild the app's docker image BEFORE replay.** Cloud replay doesn't compile — it uses whatever's currently tagged for the manifest's image ref on the local Docker daemon. Resolve the tag from replay logs (`"Using deployment image": "<repo>:<tag>"`) or `getRecording`/`listRecordings` `resources`, then:
 
 ```bash
-keploy cloud replay --app <ns.deployment> --branch-name <git branch>
+docker build -t <manifest-image-tag> <build-context-dir>
 ```
 
-If still failing, re-enter Phase A2 with the new `test_run_id`. If passing, proceed to A5. Cap retry attempts at 3—if it's still red, the failures are likely a keploy-side proxy issue (your fixes aren't taking effect). Report the residual failures honestly with the `test_run_id` and the run-report URL so the dev can file a keploy bug, then stop.
+`<build-context-dir>` is where the app's `Dockerfile` lives. If the manifest tag is registry-prefixed, match it exactly — else Docker pulls from the registry and misses your edit. **Case 2 fixes skip this step.**
+
+Then run replay via Bash — **always pipe the output through `tail` / `grep`** so the ~10-40k tokens of replay log don't enter your context wholesale, AND **always pass `--disableReportUpload=false`** so the `/tr` report row gets written (OAuth-authenticated CLIs default this flag to `true`, which silently skips the upload — if you omit the override, `listTestReports` will return empty for this run and the dashboard URL won't print):
+
+```bash
+keploy cloud replay --app <ns.deployment> --branch-name <git branch> --cluster <cluster-name> --disableReportUpload=false 2>&1 \
+  | tail -n 60 \
+  | grep -E "Total test|Failed Testcases|test passed|test failed|FAIL|ERROR|debug bundle|View test report"
+```
+
+The full replay log contains per-mock-match traces, per-testcase debug lines, and a final summary block. Your decisions only need the final summary + any FAIL/ERROR lines + the `View test report at:` URL. Piping at the command level keeps the slice that re-bills on every subsequent step to ~2k tokens instead of the full ~40k — over a retry loop that compounds enormously. Apply the same pipe pattern to every other long-running Bash command: `keploy record` output, `docker build`, `keploy upload test-set`. Read the cached log file directly only when the grep slice doesn't show what you need.
+
+**`--cluster` is mandatory.** Use the `origin.clusterName` you cached from Discovery's `getApp` (do NOT re-call `getApp`). Without it, auto-select needs a cluster heartbeat within 35s and fails with `no active clusters found`.
+
+If still failing, re-enter Phase A2 with the new `test_run_id`. Cap retries at 3 — beyond that, report residual failures with `test_run_id` + run URL and stop.
+
+**Sanity gate before declaring success.** Post-fix `total_tests` must be ≥ pre-fix. A drop (especially "0 passed / 0 failed") = coverage regression from deleting a test set before its replacement uploaded. Re-record + upload to restore.
 
 ### Phase A5—Report (exact format)
 
 ```
 ### Diagnosis
-| Test | Step | Case | Cause |
+| Test set | Test case | Case | Cause |
 | --- | --- | --- | --- |
-| <name> | <step> | 1 / 2a / 2b | <one-line cause from repo inspection> |
+| <test_set_name> | <test_case_name> | 1 / 2a / 2b | <one-line cause from repo inspection> |
 
 ### Fixes applied
 - (Case 1) Edited `<file:line>`—`<one-line change description>`.
-- (Case 2a) `update_test_suite` on `<suite_name>`—set noise on `<path>` OR updated response field `<path>`.
-- (Case 2b) `update_mock` on `<mock_name>` (test set `<test_set_id>`) OR `delete_recording` + re-capture via `keploy record` + `keploy upload test-set`.
+- (Case 2a) `updateTestCase` on `<test_case_name>`—set noise on `<path>` OR updated response field `<path>`.
+- (Case 2b) `keploy mock patch` on `<mock_name>` (test set `<test_set_id>`) OR `delete_recording` + re-capture via `keploy record` + `keploy upload test-set`.
 - `keploy cloud replay` re-run: `<p>/<t>` tests passed.
 
 ### Next step for you
@@ -213,7 +238,7 @@ If still failing, re-enter Phase A2 with the new `test_run_id`. If passing, proc
 - (Retry cap hit) File a keploy bug with `test_run_id=<id>` and the run-report URL.
 
 Branch diff: https://app.keploy.io/api-testing/branch-diff?appId=<app_id>&branchId=<branch_id>
-Run report: https://app.keploy.io/tr?appId=<app_id>&branch=<branch_name>
+Run report: https://app.keploy.io/tr/<test_run_id>?appId=<app_id>
 ```
 
 ---
@@ -255,9 +280,13 @@ keploy upload test-set \
 ### Phase B4—Validate
 
 ```bash
-keploy cloud replay --app <ns.deployment> --branch-name <git branch>
+keploy cloud replay --app <ns.deployment> --branch-name <git branch> --cluster <cluster-name> --disableReportUpload=false 2>&1 \
+  | tail -n 60 \
+  | grep -E "Total test|Failed Testcases|test passed|test failed|FAIL|ERROR|debug bundle|View test report"
 ```
 
+`--cluster` is mandatory — resolve from the `getApp` call you made in Discovery (you cached `origin.clusterName`; do NOT re-call `getApp`). `--disableReportUpload=false` is mandatory too — OAuth CLIs default it to `true` which silently skips the `/tr` report upload. Pipe through `tail`/`grep` for the same context-cost reason as Phase A4.
+
 If anything failed, enter Routine A from Phase A2—the diagnosis routine handles it.
 
 ### Phase B5—Report (exact format)
@@ -275,7 +304,7 @@ If anything failed, enter Routine A from Phase A2—the diagnosis routine handle
 Open your PR. CI will replay this branch automatically; merge will fold the test data into main.
 
 Branch diff: https://app.keploy.io/api-testing/branch-diff?appId=<app_id>&branchId=<branch_id>
-Run report: https://app.keploy.io/tr?appId=<app_id>&branch=<branch_name>
+Run report: https://app.keploy.io/tr/<test_run_id>?appId=<app_id>
 ```
 
 ---
@@ -291,112 +320,16 @@ Everything else—what failed and why, which mock to update, what test-set name
 
 ## Anti-patterns (refuse these)
 
-- Editing handler code on a Case-2-shaped failure (contract changed intentionally). The test data is what's stale—update it on the branch instead.
-- Writing to `main` (any tool that omits `branch_id`). Always branch-first.
-- Re-recording to absorb a failure without first reading the diff and deciding the route. Re-record only when Route C applies.
-- Inventing a PAT, branch name, or secret value.
-````
-
-Save the file and fully restart your editor so the skill / rules / memory entry is available in your next session.
-
----
-
-## Step 3—Use the two prompts
-
-That's it. From now on, you only ever type one of:
-
-> **"my keploy cloud replay is failing, please analyse and fix it."**
-
-_or, when the failure was in CI:_
-
-> **"the keploy cloud replay pipeline is failing, please analyse and fix it."**
-
-or
-
-> **"Add new keploy tests for my changes."**
-
-What happens behind the scenes for each:
-
-### Prompt A—analyse and fix a failing replay (local or CI)
-
-| Phase | What the agent does                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             |
-| ----- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| A0    | Resolve `app_id` from `basename $(pwd)` + `listApps`. Resolve `branch_id` from `git rev-parse --abbrev-ref HEAD` + `create_branch`.                                                                                                                                                                                                                                                                                                                                                                             |
-| A1    | Get a `test_run_id` to fetch the report against. Local form → list the branch's recent test runs and take the latest failed one's id. CI form → extract `test_run_id` from the CI log or dashboard URL the dev pasted (falls back to the local lookup if nothing was pasted).                                                                                                                                                                                                                                   |
-| A2    | Fetch the full report (`get_session_report` with `verbose=true`).                                                                                                                                                                                                                                                                                                                                                                                                                                               |
-| A3    | Per failing step, decide Case 1 (bug in the app—recent commit broke it, test is still correct) or Case 2 (app behavior drifted intentionally—test data is stale, with sub-actions 2a noise / 2a response edit / 2b mock edit / 2b delete + re-record). Decision is from `git log` / `git diff` plus the report's `mock_mismatches`, never from a dev question.                                                                                                                                                  |
-| A4    | For Case 1: announce the file:line and a one-line description, then edit the handler code so the dev can stop the agent if they object. For Case 2a: `update_test_suite` to add noise on a non-deterministic field, or to update the recorded `response` body (preserve every existing step `id`). For Case 2b: `update_mock` on the affected mock, or—if the baseline is too far gone—`delete_recording` and re-record via Routine B's flow. Either way, re-run `keploy cloud replay --branch-name` to verify. |
-| A5    | Report: diagnosis table (case per step) + fixes applied + next-step-for-you + branch-diff URL + run-report URL.                                                                                                                                                                                                                                                                                                                                                                                                 |
-
-### Prompt B—author new keploy tests
-
-| Phase | What the agent does                                                                                                                                                                                                                                                                                                                                                                |
-| ----- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| B0    | Discovery (same as A0).                                                                                                                                                                                                                                                                                                                                                            |
-| B1    | `git diff origin/main...HEAD` to find handler files that changed; extract added/modified endpoints.                                                                                                                                                                                                                                                                                |
-| B2    | Pre-flight: discover the dev's run command from the repo (Makefile → docker-compose.yml → Procfile → package.json → README), start the app, curl any 200-returning endpoint to confirm it's serving traffic, stop it. Then run `keploy record -c "<dev run command>" --sync`, drive a realistic curl per new endpoint, stop the recorder. Recording lands at `keploy/test-set-N/`. |
-| B3    | `keploy upload test-set --app <ns.deployment> --branch <git branch> --test-set keploy/test-set-N --name <descriptive-name>` to land the bundle on the Keploy branch.                                                                                                                                                                                                               |
-| B4    | `keploy cloud replay --app <ns.deployment> --branch-name <git branch>` to validate. On failure, drop into Routine A.                                                                                                                                                                                                                                                               |
-| B5    | Report: captured endpoints table + replay result + next-step (open PR) + branch-diff URL + run-report URL.                                                                                                                                                                                                                                                                         |
-
-For everything not covered by these two prompts—manually inspecting test data, editing one mock, listing recordings—use the manual flow on the [Developer Workflow](/docs/quickstart/k8s-proxy-developer-workflow) page directly. The two-prompt workflow handles the 90% case; the manual flow is the escape hatch.
-
----
-
-## Putting it together
-
-Here are the typical scenarios the agent handles—one per case it decides between. Every one starts with the same two-prompt UX and ends with the dev pushing once CI catches up. The variable bit is what the agent does in the middle.
-
-### Scenario 1—App regression (Case 1)
-
-You merged a refactor that accidentally broke the price calculation on `/orders/{id}`. The test still expects the right total.
-
-> _"my keploy cloud replay is failing, please analyse and fix it."_
-
-A0 → A1 (latest failed run) → A2 (report shows `total_amount: 0` vs expected `99.99`). A3 sees your recent commit on the price-calc helper and the test's authored response is still correct → **Case 1**. A4 announces the edit at `pkg/order/calc.go:42`—restoring the line-item subtotal branch—then applies the fix and re-runs replay (green). A5 reports the edit + URLs.
-
-### Scenario 2—Test data drift on the response (Case 2a, response edit)
-
-You renamed a response field from `username` to `display_name` on `/users/{id}` on purpose. CI replay now fails because the recorded response still says `username`.
-
-> _"the keploy cloud replay pipeline is failing, please analyse and fix it."_
-
-A3 sees the rename commit and `authored_assertions` pinned to `username` → **Case 2a**. A4 calls `update_test_suite` to swap the field name on the recorded response (preserving every kept step's `id`), re-runs replay (green). A5 reports the test edit + URLs.
-
-### Scenario 3—Test data drift, non-deterministic field (Case 2a, noise)
-
-The replay started failing on `$.created_at`—a timestamp that differs each run. No code changes near it.
-
-> _"my keploy cloud replay is failing, please analyse and fix it."_
-
-A3 sees the diverging field is genuinely time-varying with no related commit → **Case 2a (noise)**. A4 calls `update_test_suite` to add `$.created_at` to that step's noise list; replay re-runs green.
-
-### Scenario 4—Mock drift from a DB query change (Case 2b, mock edit)
-
-You added a `discount_percent` column to the orders table and updated the `SELECT` to return it. The handler emits the new field, the test expects it, but the recorded mock for the DB call still has the old shape.
-
-> _"my keploy cloud replay is failing, please analyse and fix it."_
-
-A3 sees the schema-change commit and `mock_mismatches` on the SELECT row → **Case 2b**. A4 calls `update_mock` to add `discount_percent` to the mock spec; replay re-runs green. A5 reports the mock edit + URLs.
-
-### Scenario 5—Mock too far gone, full re-record (Case 2b, fallback)
-
-A downstream gRPC client was swapped for HTTP; the recorded mocks are protobuf bytes that no longer apply.
-
-> _"my keploy cloud replay is failing, please analyse and fix it."_
-
-A3 → **Case 2b**. A4 tries one `update_mock` edit—it doesn't pass. The agent falls back: `delete_recording` on the affected test set, then re-records via Routine B's flow (pre-flight → `keploy record -c "<run cmd>" --sync` → curl → `keploy upload test-set --branch <git branch>`). Replay re-runs green.
-
-### Scenario 6—Adding tests for a new endpoint (Routine B)
-
-You added `POST /coupons/redeem`.
-
-> _"Add new keploy tests for my changes."_
-
-B0 → B1 (`git diff origin/main...HEAD` surfaces the new route). B2 pre-flight: agent finds `make run` in the Makefile, brings the app up, `curl /health` returns 200, stops it. Then `keploy record -c "make run" --sync`, curls `POST /coupons/redeem` with a realistic body, stops the recorder. B3 uploads via `keploy upload test-set --app <ns.deployment> --branch <git branch> --name coupons-redeem`. B4 replay returns 1/1 passed. B5 reports the captured endpoint + URLs.
-
----
-
-Across every scenario, you only ever spoke one of two sentences. You push your code change (and, for Case 1, the agent's app-side edit). CI replays the branch on the PR; merge runs `keploy cloud branch-merge` and the test data lands on main.
-
-For the same flow done manually (CLI / dashboard, no agent), see [Developer Workflow with Keploy Proxy](/docs/quickstart/k8s-proxy-developer-workflow).
+- **Editing handler code on a Case-2 failure.** Contract changed intentionally → fix test data on the branch, not source.
+- **Rewriting deliberate code into non-IO equivalents to satisfy stale mocks** (mutex for `SELECT … FOR UPDATE`, local cache for Redis, hardcoded value for HTTP call). Mutates prod behaviour to pass tests; often regresses the safety property the commit added (process-local mutex doesn't survive replicas).
+- **`delete_recording` as the first action of 2b.** Order is record → upload → delete. Delete-first empties the branch; next replay "passes" trivially (zero tests = zero failures).
+- **Hand-editing local `keploy/<test_set_id>/` files.** That dir is re-downloaded each replay; edits are overwritten. Use CLI / MCP write paths.
+- **`keploy upload test-set` to re-publish edited mocks.** Upload is for landing fresh recordings only — it creates a duplicate test set, not a replacement. If `keploy mock patch` + `getMock` confirm the write but replay still fails, that's a matcher defect to report.
+- **Editing anything outside the application source tree.** No `Dockerfile*` / `docker-compose*` / `keploy.yml` / `.env*` / k8s manifests / CI workflows; no env-var-driven runtime bypass branches. Real code fix or test-data fix — nothing in between.
+- **Flipping CLI flags to make a failure go away** (`--freezeTime=false`, `--envs FOO=bar`, `--mocking=false`, `--ignoreOrdering=true`). A failure that seems to need one of these is always a test-data problem — fix the test data instead.
+- **Writing to `main`** (any tool that omits `branch_id`).
+- **Uploading fixtures from another branch onto the current branch.** Fixtures are branch-scoped — they encode app-state assumptions of where they were captured. Re-record against THIS branch instead.
+- **Uploading fresh recordings without checking existing branch coverage first.** `listRecordings({app_id, branch_id})` + targeted `getMock` first; reuse if covered.
+- **Inventing a PAT, branch name, or secret value.**
+- **Running `keploy --help`, `keploy <cmd> --help`, or any `--version` info dump.** This skill names every command + flag you need (`keploy cloud replay`, `keploy mock patch`, `keploy record`, `keploy upload test-set`). The CLI's help text is ~14k tokens and re-bills on every subsequent turn — pure waste.
+- **Reading `keploy/cloud-debug.log`, `keploy-logs.txt`, or any file under the local `keploy/` cache directory.** That dir is throwaway state wiped on every replay; the cloud-debug.log alone is ~25k tokens. Use `getTestReportFull` for structured failure data — never inspect the raw debug log.

From 8945f871692ed027259e10a53fa1ea0aba5207c7 Mon Sep 17 00:00:00 2001
From: Charan Kamarapu <kamarapucharan@gmail.com>
Date: Sun, 7 Jun 2026 15:48:11 +0530
Subject: [PATCH 03/11] docs(k8s-proxy-llm-workflow): make getApp mandatory
 pre-replay; clarify --cluster error
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Routine B used to skip Discovery step 3 (getApp) because B1 starts at
'git diff' — then hit Phase B4 needing --cluster and dropped the flag,
causing `no active clusters found`. Two fixes:

1. Discovery step 3 (`getApp` for cluster/ns/deployment) is now MANDATORY
   before any `keploy cloud replay` invocation (both Phase A4 and B4).
2. Phase B4 explicitly tells the agent: if you skipped Discovery
   step 3 because Routine B starts at git diff, go back and call getApp
   NOW. Plus inline the error-message ambiguity: `no active clusters
   found` actually means "you forgot --cluster", not "no cluster is
   running".

Source of truth: matches the trimmed verified-working SKILL.md
(`.claude/skills/keploy/SKILL.md`) byte-for-byte.
---
 .../version-4.0.0/quickstart/k8s-proxy-llm-workflow.md        | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/versioned_docs/version-4.0.0/quickstart/k8s-proxy-llm-workflow.md b/versioned_docs/version-4.0.0/quickstart/k8s-proxy-llm-workflow.md
index 53fe02954..2aeb8e396 100644
--- a/versioned_docs/version-4.0.0/quickstart/k8s-proxy-llm-workflow.md
+++ b/versioned_docs/version-4.0.0/quickstart/k8s-proxy-llm-workflow.md
@@ -49,7 +49,7 @@ You handle EVERYTHING else autonomously. Discover the app, the branch, the faili
 
 1. **App.** `basename $(pwd)` → `listApps({q: <basename>})` → pick the unambiguous match. Cache `app_id` for the session.
 2. **Branch.** `git rev-parse --abbrev-ref HEAD` → `create_branch({app_id, name: <git branch>})` → cache `branch_id`. If `git rev-parse` returns `HEAD` or exits non-zero, ask the dev for a branch name ONCE.
-3. **App context (optional, only when you need cluster/ns/deployment).** `getApp({appId: app_id, fields: ["name","namespace","deployment","origin.clusterName","origin.namespace","origin.deployment"]})`. The full app schema is ~16k tokens; the projected response is ~300 tokens. **Call this AT MOST ONCE per session.** The returned identity fields are sticky — hold them mentally, do not re-call `getApp` later in the conversation for the same `app_id`.
+3. **App context — MANDATORY before any `keploy cloud replay` invocation** (Phase A4 OR Phase B4). `getApp({appId: app_id, fields: ["name","namespace","deployment","origin.clusterName","origin.namespace","origin.deployment"]})`. The full app schema is ~16k tokens; the projected response is ~300 tokens. **Call this AT MOST ONCE per session.** The returned identity fields are sticky — hold them mentally, do not re-call `getApp` later in the conversation for the same `app_id`. **You CANNOT issue `keploy cloud replay` without first resolving `origin.clusterName` here — the CLI requires `--cluster <name>` and the error it returns when the flag is missing (`no active clusters found`) is misleading; it actually means "you forgot `--cluster`".**
 
 All three values are sticky for the rest of the conversation. Don't re-discover unless the dev switches git branches. Re-calling `getApp` mid-session is an anti-pattern — its 16k schema lives in your context for every subsequent step regardless of whether you ask for it again.
 
@@ -285,7 +285,7 @@ keploy cloud replay --app <ns.deployment> --branch-name <git branch> --cluster <
   | grep -E "Total test|Failed Testcases|test passed|test failed|FAIL|ERROR|debug bundle|View test report"
 ```
 
-`--cluster` is mandatory — resolve from the `getApp` call you made in Discovery (you cached `origin.clusterName`; do NOT re-call `getApp`). `--disableReportUpload=false` is mandatory too — OAuth CLIs default it to `true` which silently skips the `/tr` report upload. Pipe through `tail`/`grep` for the same context-cost reason as Phase A4.
+`--cluster` is mandatory — resolve from the `getApp` call you made in Discovery (you cached `origin.clusterName`; do NOT re-call `getApp`). **If you skipped Discovery step 3 because Routine B "starts at git diff" — go back and call `getApp` NOW before replay. Without `--cluster`, the CLI dies with `no active clusters found`, which sounds like "no cluster is running" but actually means "you forgot the flag".** `--disableReportUpload=false` is mandatory too — OAuth CLIs default it to `true` which silently skips the `/tr` report upload. Pipe through `tail`/`grep` for the same context-cost reason as Phase A4.
 
 If anything failed, enter Routine A from Phase A2—the diagnosis routine handles it.
 

From e87c55cf59b9f48ad63a9ecd9c5ec8bfafd583c8 Mon Sep 17 00:00:00 2001
From: Charan Kamarapu <kamarapucharan@gmail.com>
Date: Sun, 7 Jun 2026 15:51:40 +0530
Subject: [PATCH 04/11] docs(k8s-proxy-llm-workflow): keploy mock patch flag is
 --app (not --app-id)

The CLI registers --app, not --app-id (OSS root pre-registers --app-id
as a deprecated uint64 flag). The prior template told agents to use
--app-id which the CLI rejects with exit 1.

Real-world impact: in the S4 validation run the agent constructed the
documented --app-id command, was rejected by the CLI, and then
confabulated success.
---
 .../version-4.0.0/quickstart/k8s-proxy-llm-workflow.md          | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/versioned_docs/version-4.0.0/quickstart/k8s-proxy-llm-workflow.md b/versioned_docs/version-4.0.0/quickstart/k8s-proxy-llm-workflow.md
index 2aeb8e396..8cd4f70e7 100644
--- a/versioned_docs/version-4.0.0/quickstart/k8s-proxy-llm-workflow.md
+++ b/versioned_docs/version-4.0.0/quickstart/k8s-proxy-llm-workflow.md
@@ -176,7 +176,7 @@ If every `actual_mocks` entry has a matching `expected_mocks` entry but the *val
 
 **Pick patch vs recapture by drift *shape*, not "is this deliberate":**
 
-- **2b-patch — DEFAULT for scoped *value* changes on EXISTING mocks.** Tool: `keploy mock patch --app-id <appUUID> --branch-id <branchUUID> --test-set-id <tsUUID> --mock-id <name> --mock-yaml-file <path>` (CLI). Read existing with `getMock` first, write patched YAML to a file, invoke CLI. Use when the drift is a value change on an existing mock entry (one operator changed, one column / header / constant differs, response shape same family). **New query / new downstream call → NOT patchable → 2b-recapture.**
+- **2b-patch — DEFAULT for scoped *value* changes on EXISTING mocks.** Tool: `keploy mock patch --app <appUUID> --branch-id <branchUUID> --test-set-id <tsUUID> --mock-id <name> --mock-yaml-file <path>` (CLI). Read existing with `getMock` first, write patched YAML to a file, invoke CLI. Use when the drift is a value change on an existing mock entry (one operator changed, one column / header / constant differs, response shape same family). **New query / new downstream call → NOT patchable → 2b-recapture.**
 
   > **Always use the CLI, not MCP `update_mock`.** The CLI re-derives kind-specific fields the agent can't compute — most importantly `sql_ast_hash` for PostgresV3 (sha256 of the parsed-normalized AST the matcher keys on). MCP `update_mock` with a stale `sql_ast_hash` writes 2xx, `getMock` echoes new SQL back, but replay still fails because the matcher's hash lookup misses.
 

From fe311a9a988cd30f312ffa60feb55b5aaa684443 Mon Sep 17 00:00:00 2001
From: Charan Kamarapu <kamarapucharan@gmail.com>
Date: Sun, 7 Jun 2026 15:54:37 +0530
Subject: [PATCH 05/11] docs(k8s-proxy-llm-workflow): canonical fields=
 projection + listTestReports one-shot stricter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two cost-discipline fixes from validation evidence:

1. Phase A2: replaced the narrow recommended projection
   ([failed_steps[].diff, mock_mismatches, status, ci_metadata])
   with one that covers per-case identity + per-case oss_report.req /
   .result / .mock_mismatches / .noise — everything Phase A3 actually
   reads. The old projection was too narrow, so agents fell back to
   include_oss_report=true (NO fields=) to fetch the full 34k blob
   that re-bills every subsequent turn.

2. Phase A1: added "do NOT re-call listTestReports after your own
   `keploy cloud replay` finishes — the replay stdout already prints
   the new test_run_id in `View test report at: .../tr/<id>`, parse
   that line instead of re-querying."

Also added explicit "ADD fields, never drop" rule under "use fields
aggressively" — agents were retrying without fields= to "get everything"
which is the exact failure mode the projection was meant to prevent.
---
 .../quickstart/k8s-proxy-llm-workflow.md      | 29 +++++++++++++++++--
 1 file changed, 26 insertions(+), 3 deletions(-)

diff --git a/versioned_docs/version-4.0.0/quickstart/k8s-proxy-llm-workflow.md b/versioned_docs/version-4.0.0/quickstart/k8s-proxy-llm-workflow.md
index 8cd4f70e7..810e8cda3 100644
--- a/versioned_docs/version-4.0.0/quickstart/k8s-proxy-llm-workflow.md
+++ b/versioned_docs/version-4.0.0/quickstart/k8s-proxy-llm-workflow.md
@@ -61,16 +61,39 @@ All three values are sticky for the rest of the conversation. Don't re-discover
 
 The goal of this phase is exactly one thing: produce a `test_run_id` you can pass to `getTestReportFull` in Phase A2. Pick how you get it based on the form of Prompt A:
 
-- **Local form** ("my keploy cloud replay is failing…") → call `listTestReports({appId: app_id, branch_id, status: "FAILED", limit: 5})` **EXACTLY ONCE**. Pick `data[0]` (newest first by `created_at`) and take its `id`. That's the dev's last local `keploy cloud replay --branch-name` invocation — `keploy cloud replay` uploads its report into the legacy `/tr` collection, which is what `listTestReports` queries. **Do NOT retry with different status / source / branch_id permutations** if the first call returns empty — that wastes context on calls that will all return the same empty set (the run genuinely doesn't exist on this branch yet; tell the dev to run `keploy cloud replay` first). `status` is CASE-SENSITIVE — use the exact value `"FAILED"`, not `"failed"` or `"Failed"`. Use `getTestReport({appId: app_id, reportId: test_run_id})` for a cheap roll-up probe before pulling the full report only when the full report is going to be large.
+- **Local form** ("my keploy cloud replay is failing…") → call `listTestReports({appId: app_id, branch_id, status: "FAILED", limit: 5})` **EXACTLY ONCE for the whole session**. Pick `data[0]` (newest first by `created_at`) and take its `id`. That's the dev's last local `keploy cloud replay --branch-name` invocation — `keploy cloud replay` uploads its report into the legacy `/tr` collection. **Do NOT retry with different status / source / branch_id permutations** if the first call returns empty. **Do NOT re-call after your own `keploy cloud replay` finishes** — the final replay's stdout already shows the new `test_run_id` in its `View test report at: …/tr/<id>` line; parse the URL instead of re-querying. `status` is CASE-SENSITIVE — use the exact value `"FAILED"`, not `"failed"` or `"Failed"`. Use `getTestReport({appId: app_id, reportId: test_run_id})` for a cheap roll-up probe only when the full report is going to be large.
 - **CI form** ("the keploy cloud replay pipeline is failing…") → the dev usually pastes a CI log URL or dashboard URL. Extract `test_run_id` from it. If they didn't paste anything, fall back to the local-form lookup above—a CI failure posts the same legacy test-run-report record to the api-server, so the latest-failed lookup still finds it. Use `source: "ci"` on the list call to scope to runs that carry CI metadata.
 
 Either way, Phase A2 onward is identical—same `getTestReportFull` call, same routes, same fixes.
 
 ### Phase A2—Fetch the full report
 
-Call `getTestReportFull({appId: app_id, reportId: test_run_id, fields: ["failed_steps[].diff", "mock_mismatches", "status", "ci_metadata"]})`. The OpenAPI-generated tool's **path** parameters are camelCase (`appId`, `reportId`) per the spec, while its **query** parameters stay snake_case (`include_oss_report`, `mock_mismatches_only`, `max_test_cases_per_set`, `fields`); pass each one with the literal name the spec declares.
+Call `getTestReportFull` with a projection that includes EVERYTHING you will need for Phase A3 diagnosis — in one call. The canonical projection covers status + CI metadata + per-case identity + per-case diff + per-case mock mismatches + noise paths:
 
-**Use `fields` aggressively** — full report is ~34k tokens, projection brings it to ~5k. Supports dotted paths + array wildcards. Defaults for the other params return roll-up + every test set + every per-case diff in one round-trip. Read:
+```
+getTestReportFull({
+  appId: app_id,
+  reportId: test_run_id,
+  fields: [
+    "status",
+    "ci_metadata",
+    "failed_steps[].diff",
+    "mock_mismatches",
+    "test_sets[].name",
+    "test_sets[].id",
+    "test_sets[].test_cases[].name",
+    "test_sets[].test_cases[].status",
+    "test_sets[].test_cases[].oss_report.req",
+    "test_sets[].test_cases[].oss_report.result",
+    "test_sets[].test_cases[].oss_report.mock_mismatches",
+    "test_sets[].test_cases[].oss_report.noise"
+  ]
+})
+```
+
+The OpenAPI-generated tool's **path** parameters are camelCase (`appId`, `reportId`); **query** parameters stay snake_case (`include_oss_report`, `mock_mismatches_only`, `max_test_cases_per_set`, `fields`). Pass each with the literal name the spec declares.
+
+**Use `fields` aggressively** — full report is ~34k tokens, projection brings it to ~5k. Supports dotted paths + array wildcards. **If your first call missed a field you need, ADD it to a new projected call — NEVER drop `fields=` to "get everything" and never fall back to `include_oss_report=true`/`max_test_cases_per_set=N` without `fields=`. The unprojected response is the 34k-token blob that re-bills every subsequent turn for the rest of the session.** Read:
 
 - `report.status`—`FAILED` is your trigger to continue.
 - `report.ci_metadata`—when populated this is a CI run; `provider` / `commit_sha` / `pr_number` give you the surrounding context.

From a923874f25e7baf5ca94ebd58c37ee6378b4b627 Mon Sep 17 00:00:00 2001
From: Charan Kamarapu <kamarapucharan@gmail.com>
Date: Sun, 7 Jun 2026 16:58:13 +0530
Subject: [PATCH 06/11] docs(k8s-proxy-llm-workflow): correct field names +
 mock_mismatches_only call
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three skill corrections discovered via S7 deep-dive on the actual
getTestReportFull response schema:

1. Field-name corrections: the canonical fields= projection used wrong
   keys that returned null on every call.
     test_sets[].name              → test_sets[].test_set_name
     test_sets[].id                → test_sets[].test_set_id
     test_sets[].test_cases[].name → test_sets[].test_cases[].test_case_name
     test_sets[].test_cases[].id   → test_sets[].test_cases[].test_case_id
   Plus dropped refs that don't exist anywhere in the response:
     failed_steps[].diff (not in response)
     top-level mock_mismatches (not in response)
     oss_report.failure_info.mock_mismatch (failure_info has no such subkey)

2. mock_mismatches_only=true second call: per-case mock_mismatches data
   is NOT included by default in getTestReportFull. Added explicit
   instruction that when Phase A3 routes to Case 2b, make a SECOND
   projected call with mock_mismatches_only=true to discover mock IDs
   from oss_report.mock_mismatches.actual_mocks[].name. This avoids
   listMocks (~28k token inventory) for the common Case 2b path.

3. listMocks ban softened: now allowed as fallback when the
   mock_mismatches_only call returns empty for the failing test set
   (e.g., body-only drift with no consumed mocks).

Verified live: S7 with the corrected skill + the projection bug fixes
(see api-server PR for those) — 13/16 strict assert pass (was 11/16),
A-CR1 fields= now passing 2/2, response payload 22k → 572 bytes on the
projected call.
---
 .../quickstart/k8s-proxy-llm-workflow.md      | 59 +++++++++++++------
 1 file changed, 41 insertions(+), 18 deletions(-)

diff --git a/versioned_docs/version-4.0.0/quickstart/k8s-proxy-llm-workflow.md b/versioned_docs/version-4.0.0/quickstart/k8s-proxy-llm-workflow.md
index 810e8cda3..f96259317 100644
--- a/versioned_docs/version-4.0.0/quickstart/k8s-proxy-llm-workflow.md
+++ b/versioned_docs/version-4.0.0/quickstart/k8s-proxy-llm-workflow.md
@@ -68,24 +68,29 @@ Either way, Phase A2 onward is identical—same `getTestReportFull` call, same r
 
 ### Phase A2—Fetch the full report
 
-Call `getTestReportFull` with a projection that includes EVERYTHING you will need for Phase A3 diagnosis — in one call. The canonical projection covers status + CI metadata + per-case identity + per-case diff + per-case mock mismatches + noise paths:
+Call `getTestReportFull` with a projection that includes EVERYTHING you will need for Phase A3 diagnosis — in one call. **The field paths below match the actual response schema verbatim — pasting other forms (`name`/`id` for test cases, `failed_steps[]`, top-level `mock_mismatches`) returns null because those keys do not exist in the response.** The canonical projection:
 
 ```
 getTestReportFull({
   appId: app_id,
   reportId: test_run_id,
+  include_oss_report: true,
+  max_test_cases_per_set: 50,
   fields: [
-    "status",
-    "ci_metadata",
-    "failed_steps[].diff",
-    "mock_mismatches",
-    "test_sets[].name",
-    "test_sets[].id",
-    "test_sets[].test_cases[].name",
+    "report.status",
+    "report.ci_metadata",
+    "report.failed_count",
+    "report.passed_count",
+    "test_sets[].test_set_name",
+    "test_sets[].test_set_id",
+    "test_sets[].status",
+    "test_sets[].test_cases[].test_case_name",
+    "test_sets[].test_cases[].test_case_id",
     "test_sets[].test_cases[].status",
     "test_sets[].test_cases[].oss_report.req",
+    "test_sets[].test_cases[].oss_report.resp",
     "test_sets[].test_cases[].oss_report.result",
-    "test_sets[].test_cases[].oss_report.mock_mismatches",
+    "test_sets[].test_cases[].oss_report.failure_info",
     "test_sets[].test_cases[].oss_report.noise"
   ]
 })
@@ -93,19 +98,37 @@ getTestReportFull({
 
 The OpenAPI-generated tool's **path** parameters are camelCase (`appId`, `reportId`); **query** parameters stay snake_case (`include_oss_report`, `mock_mismatches_only`, `max_test_cases_per_set`, `fields`). Pass each with the literal name the spec declares.
 
-**Use `fields` aggressively** — full report is ~34k tokens, projection brings it to ~5k. Supports dotted paths + array wildcards. **If your first call missed a field you need, ADD it to a new projected call — NEVER drop `fields=` to "get everything" and never fall back to `include_oss_report=true`/`max_test_cases_per_set=N` without `fields=`. The unprojected response is the 34k-token blob that re-bills every subsequent turn for the rest of the session.** Read:
+**Use `fields` aggressively** — full report is ~34k tokens, projection brings it to ~5k. Supports dotted paths + array wildcards (`field[]`). **If your first call missed a field you need, ADD it to a new projected call — NEVER drop `fields=` to "get everything" and never fall back to `include_oss_report=true`/`max_test_cases_per_set=N` without `fields=`. The unprojected response is the 34k-token blob that re-bills every subsequent turn for the rest of the session.** Read:
 
-- `report.status`—`FAILED` is your trigger to continue.
-- `report.ci_metadata`—when populated this is a CI run; `provider` / `commit_sha` / `pr_number` give you the surrounding context.
-- `test_sets[]`—per set, each entry carries `tests[]` (per-case name + status roll-up) and `test_cases[]` (the inflated per-case rows). Iterate `test_cases[]` and, for any case whose `status` is `FAILED`, read:
+- `report.status` — `FAILED` is your trigger to continue.
+- `report.ci_metadata` — when populated this is a CI run; `provider` / `commit_sha` / `pr_number` give you the surrounding context.
+- `test_sets[]` — per set, each entry carries `test_set_name`, `test_set_id`, `status`, and `test_cases[]` (the inflated per-case rows). Iterate `test_cases[]` and, for any case whose `status` is `FAILED`, read:
+  - `test_case_name`, `test_case_id` — identity (note these are the FULL field names, NOT `name`/`id`).
   - `oss_report.req.{method,url}` — which endpoint failed.
-  - `oss_report.result.status_code.{expected,actual}` — status-code diff.
+  - `oss_report.result.status_code.{expected,actual,normal}` — status-code diff.
   - `oss_report.result.headers_result[].{expected,actual,normal}` — per-header diff (`normal=false` means a real mismatch).
   - `oss_report.result.body_result[].{expected,actual,normal,type}` — per-body diff. This is your primary signal for an authored-response drift.
-  - `oss_report.mock_mismatches.{expected_mocks,actual_mocks}` — set of mocks the replayer recorded versus the set it actually consumed during this run. Populated for both passed and failed cases when consumed-mock data is known. Non-empty + a body diff together is the signature of a mock-driven regression.
-  - `oss_report.failure_info.mock_mismatch` — same shape, legacy fallback for reports produced by replayers older than v3.5.49.
+  - `oss_report.result.dep_result[]` — per-dependency (downstream mock) diff. Often empty in current api-server builds; `mock_mismatches` (separate call) is the reliable signal.
+  - `oss_report.failure_info.{category,risk,assessment}` — high-level signals: `category` includes tokens like `HEADER_CHANGED`, `BODY_CHANGED`, `SCHEMA_UNCHANGED`; `assessment.value_changes[]` names the specific JSON keys that drifted. Useful for fast classification.
   - `oss_report.noise` — JSONPaths the recorder has already marked as ignorable (don't re-flag these as drifts).
-- For investigating only mock-driven failures on a large run, pass `mock_mismatches_only=true` — `test_cases[]` is restricted to entries with non-empty `mock_mismatches` (or the legacy fallback) and the response stays token-safe.
+
+**Getting mock IDs for Case 2b (separate call with `mock_mismatches_only=true`):** The default report does NOT include `mock_mismatches` per case (the field is omitted by the server unless explicitly requested). When Phase A3 classification routes to Case 2b, make a SECOND projected call with `mock_mismatches_only=true` to discover which mocks need patching:
+
+```
+getTestReportFull({
+  appId: app_id,
+  reportId: test_run_id,
+  mock_mismatches_only: true,
+  fields: [
+    "test_sets[].test_set_id",
+    "test_sets[].test_cases[].test_case_name",
+    "test_sets[].test_cases[].oss_report.mock_mismatches.actual_mocks[].name",
+    "test_sets[].test_cases[].oss_report.mock_mismatches.actual_mocks[].kind"
+  ]
+})
+```
+
+The response will only contain cases that consumed mocks. Read `oss_report.mock_mismatches.actual_mocks[].name` to get the mock IDs (`mock-N` strings) for targeted `getMock` reads. This avoids `listMocks` (which returns ~28k tokens of inventory).
 
 ### Phase A3—Diagnose each failing test case
 
@@ -211,7 +234,7 @@ If every `actual_mocks` entry has a matching `expected_mocks` entry but the *val
 
 - **Fixtures are branch-scoped — never copy across branches.** Each Keploy branch carries its own recorded fixtures, mocks, and mappings, captured against THAT branch's app state. NEVER upload a recording (or a `keploy mock patch` derivative) from a sibling branch or from main onto the current branch as a shortcut "fix." The fixture's recorded app-state assumptions belong to where it was captured; planting it on a branch with different code state silently corrupts replay lineage and confuses every downstream reader of the branch history. If you need fresh fixtures, re-record against the CURRENT branch's app via `keploy record` + `keploy upload test-set --branch <git branch>`.
 
-- **Reuse before re-record.** Before `keploy record` / `keploy upload test-set` for 2b: `listRecordings({app_id, branch_id})`, then `getMock` per ID the report's `mock_mismatches.expected_mocks` named. **Do NOT call `listMocks`** — the report already names drifted IDs; `listMocks` adds ~28k tokens of context-polluting inventory. If a current-branch recording already covers the endpoint with the missing mock entries, `delete_recording` the failing set and stop. Re-record is the LAST resort.
+- **Reuse before re-record.** Before `keploy record` / `keploy upload test-set` for 2b: `listRecordings({app_id, branch_id})`, then `getMock` per ID. The report-based path (`mock_mismatches_only=true` projected call) names mock IDs directly — **prefer that**. `listMocks` should NOT be the default discovery step (it adds ~28k tokens of inventory), but it IS an acceptable FALLBACK when the projected `mock_mismatches_only=true` call returns empty for the test set you care about (e.g., the failure shape is body-only with no consumed mocks). If a current-branch recording already covers the endpoint with the missing mock entries, `delete_recording` the failing set and stop. Re-record is the LAST resort.
 
 Multiple failing test cases can land in different cases—handle each independently.
 

From fafd794bf410881a568fa2ee0657805c82f7eaa3 Mon Sep 17 00:00:00 2001
From: Charan Kamarapu <kamarapucharan@gmail.com>
Date: Sun, 7 Jun 2026 17:57:19 +0530
Subject: [PATCH 07/11] docs(k8s-proxy-llm-workflow): mandate
 --disable-mapping=false on keploy record
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After investigating S6 (Routine B) end-to-end, found that
`keploy record --sync` alone produces no `mappings.yaml`. The recorder
inherits keploy.yml's `disableMapping` and the auto-orchestrator-forwarded
flag doesn't propagate without an explicit host-side override. Without
mappings.yaml, the upload pipeline persists no `mapping_audits` doc in
mongo, and `getMockMapping` returns empty `mocks: []` for every test
case — forcing the replay matcher onto fragile timestamp windows.

Two skill updates:
1. Phase B2 step 1: `keploy record -c "<cmd>" --sync --disable-mapping=false`
   is the canonical incantation, with explicit rationale for why
   --disable-mapping=false is mandatory.
2. Case 2b-recapture: same flag pair documented on the record step of
   the (record → upload → delete) order.

The --disable-mapping flag was added to `keploy record` upstream
(keploy/keploy PR #4250).
---
 .../version-4.0.0/quickstart/k8s-proxy-llm-workflow.md        | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/versioned_docs/version-4.0.0/quickstart/k8s-proxy-llm-workflow.md b/versioned_docs/version-4.0.0/quickstart/k8s-proxy-llm-workflow.md
index f96259317..59446ef10 100644
--- a/versioned_docs/version-4.0.0/quickstart/k8s-proxy-llm-workflow.md
+++ b/versioned_docs/version-4.0.0/quickstart/k8s-proxy-llm-workflow.md
@@ -226,7 +226,7 @@ If every `actual_mocks` entry has a matching `expected_mocks` entry but the *val
 
   > **Always use the CLI, not MCP `update_mock`.** The CLI re-derives kind-specific fields the agent can't compute — most importantly `sql_ast_hash` for PostgresV3 (sha256 of the parsed-normalized AST the matcher keys on). MCP `update_mock` with a stale `sql_ast_hash` writes 2xx, `getMock` echoes new SQL back, but replay still fails because the matcher's hash lookup misses.
 
-- **2b-recapture — ONLY for sweeping changes that can't be patched cleanly** (call graph rewritten, request/response shapes diverge wholesale, multiple endpoints drifted at once). Order: (1) `keploy record`, (2) `keploy upload test-set --branch <git branch>`, (3) **only after upload succeeds**, `delete_recording` on the stale set. **Never `delete_recording` first** — reversing leaves the branch at zero coverage and the next replay "passes" trivially (zero tests = zero failures, indistinguishable from a real fix).
+- **2b-recapture — ONLY for sweeping changes that can't be patched cleanly** (call graph rewritten, request/response shapes diverge wholesale, multiple endpoints drifted at once). Order: (1) `keploy record --sync --disable-mapping=false` (both flags are MANDATORY — see Phase B2 below for the full rationale: without them the recording's mock-to-case relationship is lost at upload time), (2) `keploy upload test-set --branch <git branch>`, (3) **only after upload succeeds**, `delete_recording` on the stale set. **Never `delete_recording` first** — reversing leaves the branch at zero coverage and the next replay "passes" trivially (zero tests = zero failures, indistinguishable from a real fix).
 
   > **Recapture is YOU driving traffic. Seed deliberately, drive surgically.** Keploy has no scoped-recapture primitive. Before re-recording, read each existing case's `http_req` + recorded responses to learn what state it assumes (e.g. recorded `GET /api/orders` returned 7 rows but the suite POSTs only 2 → 5 pre-seeded). SQL INSERT / S3 copy / Kafka seed that state, then `keploy record` and drive ONLY the affected case's curls. Driving everything against a fresh DB silently degrades coverage — a `GET ?user_id=Mallory` with no Mallory row captures `{"count":0,"orders":[]}` and replays green without verifying anything.
 
@@ -306,7 +306,7 @@ Run report: https://app.keploy.io/tr/<test_run_id>?appId=<app_id>
 
 **Capture:**
 
-1. Run `keploy record -c "<dev run command>" --sync` via Bash. The `-c` value is the exact command from your pre-flight; `--sync` records test cases synchronously so each curl is captured in order with no race against the next one. Cloud association happens in Phase B3's upload step, not here—`keploy record` itself is the local OSS command and doesn't take `--cloud-app-id`.
+1. Run `keploy record -c "<dev run command>" --sync --disable-mapping=false` via Bash. The `-c` value is the exact command from your pre-flight; `--sync` records test cases synchronously so each curl is captured in order with no race against the next one; **`--disable-mapping=false` is MANDATORY** — without it, the host inherits `keploy.yml`'s `disableMapping: true` (the auto-generated default), the agent silently skips writing `mappings.yaml`, and the uploaded bundle lands in mongo with no `mapping_audits` doc → `getMockMapping` returns empty `mocks: []` for every test case → replay matcher falls back to fragile timestamp-windows. Cloud association happens in Phase B3's upload step, not here — `keploy record` itself is the local OSS command and doesn't take `--cloud-app-id`.
 2. For each new/changed endpoint, drive ONE realistic curl. Infer body shape from the OpenAPI spec if there is one, otherwise from the handler signature itself.
 3. Stop `keploy record` (kill the PID you captured at step 1, or send Ctrl-C equivalent).
 4. The recording lands at `keploy/test-set-N/` on disk.

From 9646579be37859d7897776bf3834de8826e98b9d Mon Sep 17 00:00:00 2001
From: Charan Kamarapu <kamarapucharan@gmail.com>
Date: Sun, 7 Jun 2026 20:22:44 +0530
Subject: [PATCH 08/11] docs(k8s-proxy-llm-workflow): add Installation section
 + Vale spelling fixes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Address user feedback + Copilot Vale-spelling comments on PR #871:

User feedback (Cursor user): the doc lacked a setup section, so they
followed the older, now-deprecated `.cursorrules` instructions in
agent-test-generation.md. Verified against cursor-agent's built-in
`migrate-to-skills` skill: `.cursor/skills/<name>/SKILL.md` IS the
modern Cursor format, `.cursorrules` and `.cursor/rules/*.mdc` are
being migrated FROM. Added an Installation section at the top of the
page covering the modern Skills mechanism for Cursor / Claude Code /
other agents, with an explicit "do not use .cursorrules" note (the
playbook is ~8k tokens; pinning it as always-on context would bill on
every editor turn).

Vale spelling fixes (Copilot comments r3343-r3369):
- "analyse" → "analyze" (en_US): Prompt A wording + Routine A heading
- "ALLOWLIST" → "Allowlist" (security term, lowercased to match Vale)
  + added `[Aa]llowlist` to the Base vocabulary so future occurrences
  pass lint
- "re-bills" → "gets re-added to context" (3 sites) — clearer to
  readers and dodges Vale's spelling check

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../config/vocabularies/Base/accept.txt       |  1 +
 .../quickstart/k8s-proxy-llm-workflow.md      | 24 ++++++++++++++-----
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/vale_styles/config/vocabularies/Base/accept.txt b/vale_styles/config/vocabularies/Base/accept.txt
index 1b5c9d8aa..c75fea8f0 100644
--- a/vale_styles/config/vocabularies/Base/accept.txt
+++ b/vale_styles/config/vocabularies/Base/accept.txt
@@ -1,5 +1,6 @@
 [Aa]ir-?gap(?:ped|ping)?
 [Aa]uditable
+[Aa]llowlist
 [Cc]group[s]?
 [Cc]leartext
 [Cc]onfigMap[s]?
diff --git a/versioned_docs/version-4.0.0/quickstart/k8s-proxy-llm-workflow.md b/versioned_docs/version-4.0.0/quickstart/k8s-proxy-llm-workflow.md
index 59446ef10..8e7dbcdad 100644
--- a/versioned_docs/version-4.0.0/quickstart/k8s-proxy-llm-workflow.md
+++ b/versioned_docs/version-4.0.0/quickstart/k8s-proxy-llm-workflow.md
@@ -26,9 +26,21 @@ keywords:
 
 # Keploy MCP playbook—autonomous developer workflow
 
+## Installation
+
+Save this playbook as an **agent skill**, not a static rules file. Cursor's modern Skills mechanism (and Claude Code's identical `SKILL.md` convention) loads the file on demand when the user issues one of the two prompts below, instead of injecting it as always-on context. That keeps every other unrelated agent task out of this playbook's token cost.
+
+- **Cursor:** create `.cursor/skills/keploy/SKILL.md` (or your project's preferred Skills path) and paste the rest of this page into it. Do **not** put this content in `.cursorrules` — `.cursorrules` files are always-on and would bill the full ~8k-token playbook on every editor interaction.
+- **Claude Code:** create `.claude/skills/keploy/SKILL.md` and paste the rest of this page into it. The agent invokes it automatically when the developer's prompt matches one of the entry points below.
+- **Other agents (Windsurf, Antigravity, …):** create the equivalent skill / project-context file. Avoid global / always-on placements for the same token-cost reason.
+
+The MCP server itself is configured separately — see the [agent test generation MCP setup](/docs/running-keploy/agent-test-generation#mcp-client-configuration) for the Cursor / Claude Code MCP entries that wire `https://api.keploy.io/client/v1/mcp` into your client.
+
+## Entry points
+
 The developer will only ever say one of two things to you:
 
-**Prompt A:** "my keploy cloud replay is failing, please analyse and fix it." OR "the keploy cloud replay pipeline is failing, please analyse and fix it."—both forms route to the same routine; the first means the dev's last local replay run failed (find the latest test_run on the branch via api-server), the second means a CI pipeline run failed (the dev should paste the CI log or dashboard URL; extract `test_run_id` from it).
+**Prompt A:** "my keploy cloud replay is failing, please analyze and fix it." OR "the keploy cloud replay pipeline is failing, please analyze and fix it."—both forms route to the same routine; the first means the dev's last local replay run failed (find the latest test_run on the branch via api-server), the second means a CI pipeline run failed (the dev should paste the CI log or dashboard URL; extract `test_run_id` from it).
 **Prompt B:** "Add new keploy tests for my changes."
 
 You handle EVERYTHING else autonomously. Discover the app, the branch, the failing run, the code changes—from the filesystem, from git, and from the Keploy api-server. Make decisions. Execute fixes. Report what you did. Do NOT ask the developer follow-up questions unless you are truly blocked (see "When you may ask" at the bottom).
@@ -55,7 +67,7 @@ All three values are sticky for the rest of the conversation. Don't re-discover
 
 ---
 
-## Routine A—failing cloud replay (local or CI), analyse and fix
+## Routine A—failing cloud replay (local or CI), analyze and fix
 
 ### Phase A1—Resolve the `test_run_id`
 
@@ -98,7 +110,7 @@ getTestReportFull({
 
 The OpenAPI-generated tool's **path** parameters are camelCase (`appId`, `reportId`); **query** parameters stay snake_case (`include_oss_report`, `mock_mismatches_only`, `max_test_cases_per_set`, `fields`). Pass each with the literal name the spec declares.
 
-**Use `fields` aggressively** — full report is ~34k tokens, projection brings it to ~5k. Supports dotted paths + array wildcards (`field[]`). **If your first call missed a field you need, ADD it to a new projected call — NEVER drop `fields=` to "get everything" and never fall back to `include_oss_report=true`/`max_test_cases_per_set=N` without `fields=`. The unprojected response is the 34k-token blob that re-bills every subsequent turn for the rest of the session.** Read:
+**Use `fields` aggressively** — full report is ~34k tokens, projection brings it to ~5k. Supports dotted paths + array wildcards (`field[]`). **If your first call missed a field you need, ADD it to a new projected call — NEVER drop `fields=` to "get everything" and never fall back to `include_oss_report=true`/`max_test_cases_per_set=N` without `fields=`. The unprojected response is the 34k-token blob that gets re-added to context every subsequent turn for the rest of the session.** Read:
 
 - `report.status` — `FAILED` is your trigger to continue.
 - `report.ci_metadata` — when populated this is a CI run; `provider` / `commit_sha` / `pr_number` give you the surrounding context.
@@ -146,7 +158,7 @@ git diff --cached -- <failing-handler-path> # staged but not committed
 
 Run all three **every time**, even when the tree looks clean. The empty result IS the evidence required to advance to Step 1. Skipping = silent misclassification when the assumption is wrong.
 
-**ALLOWLIST of MCP calls permitted before Step 0** (Phase A1 discovery only): `listApps`, `getApp`, `create_branch`, `list_branches`, `listTestReports`, `getTestReport`, `tools/list`. EVERY other call — `getTestReportFull`, `getTestCase`, `getMock`, `listMocks`, `getRecording`, `listRecordings`, `updateTestCase`, `update_mock`, `delete_recording` — is classifier/write and MUST come AFTER Step 0. Reading `getTestCase` first biases toward Case 2 framing.
+**Allowlist of MCP calls permitted before Step 0** (Phase A1 discovery only): `listApps`, `getApp`, `create_branch`, `list_branches`, `listTestReports`, `getTestReport`, `tools/list`. EVERY other call — `getTestReportFull`, `getTestCase`, `getMock`, `listMocks`, `getRecording`, `listRecordings`, `updateTestCase`, `update_mock`, `delete_recording` — is classifier/write and MUST come AFTER Step 0. Reading `getTestCase` first biases toward Case 2 framing.
 
 Any uncommitted edit touching the failing handler's source → **Case 1, mandatory.** Revert (or ask the dev); do NOT proceed to commit-history reasoning. Uncommitted edits beat any committed-history signal — they can't be the deliberate new contract.
 
@@ -256,7 +268,7 @@ keploy cloud replay --app <ns.deployment> --branch-name <git branch> --cluster <
   | grep -E "Total test|Failed Testcases|test passed|test failed|FAIL|ERROR|debug bundle|View test report"
 ```
 
-The full replay log contains per-mock-match traces, per-testcase debug lines, and a final summary block. Your decisions only need the final summary + any FAIL/ERROR lines + the `View test report at:` URL. Piping at the command level keeps the slice that re-bills on every subsequent step to ~2k tokens instead of the full ~40k — over a retry loop that compounds enormously. Apply the same pipe pattern to every other long-running Bash command: `keploy record` output, `docker build`, `keploy upload test-set`. Read the cached log file directly only when the grep slice doesn't show what you need.
+The full replay log contains per-mock-match traces, per-testcase debug lines, and a final summary block. Your decisions only need the final summary + any FAIL/ERROR lines + the `View test report at:` URL. Piping at the command level keeps the slice that gets re-added to context on every subsequent step to ~2k tokens instead of the full ~40k — over a retry loop that compounds enormously. Apply the same pipe pattern to every other long-running Bash command: `keploy record` output, `docker build`, `keploy upload test-set`. Read the cached log file directly only when the grep slice doesn't show what you need.
 
 **`--cluster` is mandatory.** Use the `origin.clusterName` you cached from Discovery's `getApp` (do NOT re-call). Without it, auto-select needs a cluster heartbeat within 35s and dies `no active clusters found`.
 
@@ -377,5 +389,5 @@ Everything else—what failed and why, which mock to update, what test-set name
 - **Uploading fixtures from another branch onto the current branch.** Fixtures are branch-scoped — they encode app-state assumptions of where they were captured. Re-record against THIS branch instead.
 - **Uploading fresh recordings without checking existing branch coverage first.** `listRecordings({app_id, branch_id})` + targeted `getMock` first; reuse if covered.
 - **Inventing a PAT, branch name, or secret value.**
-- **Running `keploy --help`, `keploy <cmd> --help`, or any `--version` info dump.** This skill names every command + flag you need (`keploy cloud replay`, `keploy mock patch`, `keploy record`, `keploy upload test-set`). The CLI's help text is ~14k tokens and re-bills on every subsequent turn — pure waste.
+- **Running `keploy --help`, `keploy <cmd> --help`, or any `--version` info dump.** This skill names every command + flag you need (`keploy cloud replay`, `keploy mock patch`, `keploy record`, `keploy upload test-set`). The CLI's help text is ~14k tokens and gets re-added to context on every subsequent turn — pure waste.
 - **Reading `keploy/cloud-debug.log`, `keploy-logs.txt`, or any file under the local `keploy/` cache directory.** That dir is throwaway state wiped on every replay; the cloud-debug.log alone is ~25k tokens. Use `getTestReportFull` for structured failure data — never inspect the raw debug log.

From b534b7ffa89aa91f59e0fcc078ba6621dbaee7ae Mon Sep 17 00:00:00 2001
From: Charan Kamarapu <kamarapucharan@gmail.com>
Date: Sun, 7 Jun 2026 20:43:22 +0530
Subject: [PATCH 09/11] =?UTF-8?q?docs(k8s-proxy-llm-workflow):=20prettier?=
 =?UTF-8?q?=20=E2=80=94=20normalize=20emphasis=20to=20=5Funderscore=5F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CI's prettier check (creyD/prettier_action@v4.6 with prettier 3.8.3)
fails the PR because three emphasis spans in the file use `*…*`
syntax. Prettier 3.x normalizes em-emphasis to `_…_`. Auto-fixed via
`prettier --write`. No prose changes — only the markup style for
the three italic spans (`*values*`, `*shape*`, `*value*`).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../version-4.0.0/quickstart/k8s-proxy-llm-workflow.md      | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/versioned_docs/version-4.0.0/quickstart/k8s-proxy-llm-workflow.md b/versioned_docs/version-4.0.0/quickstart/k8s-proxy-llm-workflow.md
index 8e7dbcdad..e05b64668 100644
--- a/versioned_docs/version-4.0.0/quickstart/k8s-proxy-llm-workflow.md
+++ b/versioned_docs/version-4.0.0/quickstart/k8s-proxy-llm-workflow.md
@@ -228,13 +228,13 @@ Detect this case from the report's `mock_mismatches`:
 - `expected_mocks` lists the calls the recording has mocks for.
 - **If `actual_mocks` contains entries with no match in `expected_mocks`** (e.g. a `SELECT … FOR UPDATE` query that doesn't appear in any recorded mock's `sqlNormalized`), the bundle is missing an entry. Skip the patch ladder and go directly to **2b-recapture** — that is the only path that can introduce a new mock entry without hand-guessing its response shape.
 
-If every `actual_mocks` entry has a matching `expected_mocks` entry but the *values* differ (column order, bind values, response rows), patch IS applicable — proceed to the ladder.
+If every `actual_mocks` entry has a matching `expected_mocks` entry but the _values_ differ (column order, bind values, response rows), patch IS applicable — proceed to the ladder.
 
 **Escalation ladder when patches don't make replay green:** `keploy mock patch` ×≤2 → 2b-recapture (record/upload/delete) ×1 → **then** stop-and-report. Verify each patch with `getMock` readback. Reporting prematurely (after 2 failed patches, skipping recapture) leaves a fixable drift uncaught.
 
-**Pick patch vs recapture by drift *shape*, not "is this deliberate":**
+**Pick patch vs recapture by drift _shape_, not "is this deliberate":**
 
-- **2b-patch — DEFAULT for scoped *value* changes on EXISTING mocks.** Tool: `keploy mock patch --app <appUUID> --branch-id <branchUUID> --test-set-id <tsUUID> --mock-id <name> --mock-yaml-file <path>` (CLI). Read existing with `getMock` first, write patched YAML to a file, invoke CLI. Use when the drift is a value change on an existing mock entry (one operator changed, one column / header / constant differs, response shape same family). **New query / new downstream call → NOT patchable → 2b-recapture.**
+- **2b-patch — DEFAULT for scoped _value_ changes on EXISTING mocks.** Tool: `keploy mock patch --app <appUUID> --branch-id <branchUUID> --test-set-id <tsUUID> --mock-id <name> --mock-yaml-file <path>` (CLI). Read existing with `getMock` first, write patched YAML to a file, invoke CLI. Use when the drift is a value change on an existing mock entry (one operator changed, one column / header / constant differs, response shape same family). **New query / new downstream call → NOT patchable → 2b-recapture.**
 
   > **Always use the CLI, not MCP `update_mock`.** The CLI re-derives kind-specific fields the agent can't compute — most importantly `sql_ast_hash` for PostgresV3 (sha256 of the parsed-normalized AST the matcher keys on). MCP `update_mock` with a stale `sql_ast_hash` writes 2xx, `getMock` echoes new SQL back, but replay still fails because the matcher's hash lookup misses.
 

From 8685ef2b48cb20e0edadc535da300ea81b5a39a6 Mon Sep 17 00:00:00 2001
From: Charan Kamarapu <kamarapucharan@gmail.com>
Date: Sun, 7 Jun 2026 20:47:58 +0530
Subject: [PATCH 10/11] =?UTF-8?q?docs(k8s-proxy-llm-workflow):=20self-revi?=
 =?UTF-8?q?ew=20fixes=20=E2=80=94=20token=20count,=20foreground=20-c,=20tw?=
 =?UTF-8?q?o-routine=20wording?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three self-review nits caught on a deep re-read:

1. Installation: "~8k-token playbook" was off — measured the actual file
   with tiktoken cl100k_base and got 9,310 tokens. Bumped the warning
   to "~9k-token" so the cost rationale is grounded in the real number.

2. Phase B2 capture: clarified that the -c value must be the FOREGROUND
   form of the run command. If pre-flight uses `docker compose up -d`
   (detached, common in repos without a foreground equivalent declared),
   passing the same string to `keploy record -c` makes docker exit
   immediately on detach and keploy thinks the app already terminated,
   capturing nothing. Example: pre-flight `docker compose up -d`,
   record `docker compose up` (no -d).

3. Page description: "exactly two developer prompts" was inaccurate —
   Prompt A has two phrasings, so the agent listens for three distinct
   surface phrases. Reworded to "two routine prompts (failing-replay
   analyze-and-fix; add-tests-for-my-changes)" so the count refers to
   the two routines rather than the surface phrases.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../version-4.0.0/quickstart/k8s-proxy-llm-workflow.md      | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/versioned_docs/version-4.0.0/quickstart/k8s-proxy-llm-workflow.md b/versioned_docs/version-4.0.0/quickstart/k8s-proxy-llm-workflow.md
index e05b64668..341ca6921 100644
--- a/versioned_docs/version-4.0.0/quickstart/k8s-proxy-llm-workflow.md
+++ b/versioned_docs/version-4.0.0/quickstart/k8s-proxy-llm-workflow.md
@@ -2,7 +2,7 @@
 id: k8s-proxy-llm-workflow
 title: Developer + LLM Workflow with Keploy Proxy
 sidebar_label: LLM Workflow
-description: Wire up Keploy's MCP server, paste a single autonomous playbook into your editor, and run the whole Keploy workflow with exactly two developer prompts—the LLM does everything else.
+description: Wire up Keploy's MCP server, paste a single autonomous playbook into your editor, and run the whole Keploy workflow with two routine prompts (failing-replay analyze-and-fix; add-tests-for-my-changes)—the LLM does everything else.
 tags:
   - K8s
   - Developer Workflow
@@ -30,7 +30,7 @@ keywords:
 
 Save this playbook as an **agent skill**, not a static rules file. Cursor's modern Skills mechanism (and Claude Code's identical `SKILL.md` convention) loads the file on demand when the user issues one of the two prompts below, instead of injecting it as always-on context. That keeps every other unrelated agent task out of this playbook's token cost.
 
-- **Cursor:** create `.cursor/skills/keploy/SKILL.md` (or your project's preferred Skills path) and paste the rest of this page into it. Do **not** put this content in `.cursorrules` — `.cursorrules` files are always-on and would bill the full ~8k-token playbook on every editor interaction.
+- **Cursor:** create `.cursor/skills/keploy/SKILL.md` (or your project's preferred Skills path) and paste the rest of this page into it. Do **not** put this content in `.cursorrules` — `.cursorrules` files are always-on and would bill the full ~9k-token playbook on every editor interaction.
 - **Claude Code:** create `.claude/skills/keploy/SKILL.md` and paste the rest of this page into it. The agent invokes it automatically when the developer's prompt matches one of the entry points below.
 - **Other agents (Windsurf, Antigravity, …):** create the equivalent skill / project-context file. Avoid global / always-on placements for the same token-cost reason.
 
@@ -318,7 +318,7 @@ Run report: https://app.keploy.io/tr/<test_run_id>?appId=<app_id>
 
 **Capture:**
 
-1. Run `keploy record -c "<dev run command>" --sync --disable-mapping=false` via Bash. The `-c` value is the exact command from your pre-flight; `--sync` records test cases synchronously so each curl is captured in order with no race against the next one; **`--disable-mapping=false` is MANDATORY** — without it, the host inherits `keploy.yml`'s `disableMapping: true` (the auto-generated default), the agent silently skips writing `mappings.yaml`, and the uploaded bundle lands in mongo with no `mapping_audits` doc → `getMockMapping` returns empty `mocks: []` for every test case → replay matcher falls back to fragile timestamp-windows. Cloud association happens in Phase B3's upload step, not here — `keploy record` itself is the local OSS command and doesn't take `--cloud-app-id`.
+1. Run `keploy record -c "<dev run command>" --sync --disable-mapping=false` via Bash. The `-c` value must be the **foreground** form of the run command — if your pre-flight used the detached/background form (e.g. `docker compose up -d`), pass the foreground variant here (`docker compose up`, no `-d`). Detached commands return immediately on launch and keploy treats the early exit as "app stopped", capturing nothing. `--sync` records test cases synchronously so each curl is captured in order with no race against the next one; **`--disable-mapping=false` is MANDATORY** — without it, the host inherits `keploy.yml`'s `disableMapping: true` (the auto-generated default), the agent silently skips writing `mappings.yaml`, and the uploaded bundle lands in mongo with no `mapping_audits` doc → `getMockMapping` returns empty `mocks: []` for every test case → replay matcher falls back to fragile timestamp-windows. Cloud association happens in Phase B3's upload step, not here — `keploy record` itself is the local OSS command and doesn't take `--cloud-app-id`.
 2. For each new/changed endpoint, drive ONE realistic curl. Infer body shape from the OpenAPI spec if there is one, otherwise from the handler signature itself.
 3. Stop `keploy record` (kill the PID you captured at step 1, or send Ctrl-C equivalent).
 4. The recording lands at `keploy/test-set-N/` on disk.

From c6df353837672d3284dcf9b5a60f20cd229b134f Mon Sep 17 00:00:00 2001
From: Charan Kamarapu <kamarapucharan@gmail.com>
Date: Sun, 7 Jun 2026 20:53:28 +0530
Subject: [PATCH 11/11] docs(vale): allow spaced em-dash + logical quotes +
 tech vocab
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CI's Vale doc linter (errata-ai/vale-action@v2.1.1 with vale 3.0.3 and
the project's existing Google + Vale base styles) flagged 89 errors on
the k8s-proxy-llm-workflow page after my Installation section landed.
Categorized:

  58× Google.EmDash — "Don't put a space before or after a dash". The
        doc uses the spaced em-dash form ` — ` for prose readability;
        many other docs in the repo do the same (see hits in
        generate-api-tests-using-ai.md, etc.). Disabling the rule
        repo-wide is consistent with the seven other Google.* overrides
        already in `.vale.ini` and matches the docs' established style.

   8× Google.Quotes — "Commas and periods go inside quotation marks".
        The docs use period-OUTSIDE-quote when the quoted token is a
        literal the reader is supposed to paste verbatim (e.g.
        `the exact value "FAILED".`); putting the period inside would
        change the visible token. Disabling for consistency with the
        other Google.* overrides.

  23× Vale.Spelling — tech terms not yet in the Base vocabulary.
        Added: branch_id, camelCase, CLI[s]?, cwd, hardcoded,
        JSONPath[s]?, matcher, misclassification, mutex, OAuth,
        readback, README, snake_case, stdout, test_run, unprojected.

   (1 of those 23 Vale.Spelling hits was on "whatever's" — a possessive
        on the indefinite pronoun that Vale's en_US dictionary doesn't
        recognize. Reworded the sentence in place rather than adding
        it to the vocabulary; the possessive form is genuinely unusual
        and a rewrite is cleaner than allowlisting it.)

Local `vale --config=.vale.ini versioned_docs/.../k8s-proxy-llm-workflow.md`
now reports 0 errors. Prettier still clean.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .vale.ini                                        |  2 ++
 vale_styles/config/vocabularies/Base/accept.txt  | 16 ++++++++++++++++
 .../quickstart/k8s-proxy-llm-workflow.md         |  2 +-
 3 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/.vale.ini b/.vale.ini
index df3341e73..00320fa5b 100644
--- a/.vale.ini
+++ b/.vale.ini
@@ -32,6 +32,8 @@ Google.Exclamation = NO     # Allow exclamation points
 Google.Ellipses = NO        # Allow ellipses in text
 Google.Latin = NO           # Allow "e.g." and "i.e." instead of "for example"
 Google.Units = NO           # Allow "k8s" — Google.Units' \d+s regex matches "8s" inside the token
+Google.EmDash = NO          # Allow spaced em-dashes — used consistently across the repo for prose readability (the Google style wants `—` without spaces; many existing docs use the spaced form intentionally)
+Google.Quotes = NO          # Allow logical (British) punctuation around quotation marks — the docs put the period OUTSIDE the quote for technical tokens (e.g. `the value "FAILED".`) so that sentence punctuation never becomes part of the literal token the reader is supposed to paste
 
 # Allow specific terms:
 Vale.Terms=NO
diff --git a/vale_styles/config/vocabularies/Base/accept.txt b/vale_styles/config/vocabularies/Base/accept.txt
index c75fea8f0..3070ac44a 100644
--- a/vale_styles/config/vocabularies/Base/accept.txt
+++ b/vale_styles/config/vocabularies/Base/accept.txt
@@ -202,3 +202,19 @@ Woohoo
 wsl
 WSL
 YAMLs
+branch_id
+camelCase
+CLI[s]?
+cwd
+hardcoded
+JSONPath[s]?
+matcher
+misclassification
+mutex
+OAuth
+readback
+README
+snake_case
+stdout
+test_run
+unprojected
diff --git a/versioned_docs/version-4.0.0/quickstart/k8s-proxy-llm-workflow.md b/versioned_docs/version-4.0.0/quickstart/k8s-proxy-llm-workflow.md
index 341ca6921..b7a1f84e0 100644
--- a/versioned_docs/version-4.0.0/quickstart/k8s-proxy-llm-workflow.md
+++ b/versioned_docs/version-4.0.0/quickstart/k8s-proxy-llm-workflow.md
@@ -252,7 +252,7 @@ Multiple failing test cases can land in different cases—handle each independen
 
 ### Phase A4—Verify
 
-**After a Case 1 source edit, rebuild the app's docker image BEFORE replay.** Cloud replay doesn't compile — it uses whatever's currently tagged for the manifest's image ref on the local Docker daemon. Resolve the tag from replay logs (`"Using deployment image": "<repo>:<tag>"`) or `getRecording`/`listRecordings` `resources`, then:
+**After a Case 1 source edit, rebuild the app's docker image BEFORE replay.** Cloud replay doesn't compile — it uses whatever image is currently tagged for the manifest's image ref on the local Docker daemon. Resolve the tag from replay logs (`"Using deployment image": "<repo>:<tag>"`) or `getRecording`/`listRecordings` `resources`, then:
 
 ```bash
 docker build -t <manifest-image-tag> <build-context-dir>