callstackincubator · V3RON · May 19, 2026 · May 19, 2026
diff --git a/DICTIONARY.md b/DICTIONARY.md
@@ -36,6 +36,8 @@ Before adding or changing a term, check this file and ask the user for approval.
 - `workspace bootstrap`: Command run in a provisioned workspace before the agent starts.
 - `schedule`: Execution ordering and concurrency policy.
 - `reporter`: Component rendering suite-run progress and results.
+- `token-usage`: Built-in reporter that emits compact JSON billable-token summaries for agent consumption.
+- `token-optimization`: Bundled skill that guides an agent through baseline, minimal edits, and before/after token comparison.
 - `skill detection`: Evidence that a skill was used, with confidence and evidence.
 - `session event`: Normalized event observed during a session.
 - `snapshot baseline`: Stored token baseline for a benchmark case x runner pair.

diff --git a/docs/reporters.md b/docs/reporters.md
@@ -10,14 +10,15 @@ Execution, aggregation, and `results.json` writing stay in the runner. Reporters
 skillgym run <suite.ts> --reporter standard
 skillgym run <suite.ts> --reporter json
 skillgym run <suite.ts> --reporter json-summary
+skillgym run <suite.ts> --reporter token-usage
 skillgym run <suite.ts> --reporter github-actions
 skillgym run <suite.ts> --reporter html
 skillgym run <suite.ts> --reporter ./examples/custom-reporter.ts
 skillgym run <suite.ts> --schedule isolated-by-runner --max-parallel 4
 ```
 
 - Omitting `--reporter` uses the built-in `standard` reporter.
-- Built-in reporters are `standard`, `json`, `json-summary`, `github-actions`, and `html`.
+- Built-in reporters are `standard`, `json`, `json-summary`, `token-usage`, `github-actions`, and `html`.
 - Relative paths resolve from `process.cwd()`.
 
 ## Config
@@ -138,6 +139,18 @@ The built-in `json-summary` reporter writes a trimmed JSON summary to stdout —
 - Per-runner results include `failureClass` when present so downstream tooling can keep grouped-failure semantics.
 - It is useful for post-run analysis steps or feeding results to an LLM.
 
+## Token usage reporter
+
+The built-in `token-usage` reporter writes one compact JSON object to stdout at `onSuiteFinish`.
+
+- It emits top-level `passed`, `billable`, `artifacts`, and `rows` fields.
+- Each row represents one `case x runner` result.
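+- Passed rows with provider-backed normalized totals expose `billable: { sum, avg }`, where `avg` is the row's normalized total tokens and `sum` is `avg` multiplied by the number of successful repetitions (at least 1).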
+- Failed rows and rows with derived or unavailable totals stay in the output with `billable: null`.
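+- Top-level `billable` aggregates only passed comparable rows (`sum` adds the row sums, `avg` is the unweighted mean of the row averages) and is `null` when no row is comparable, so failed or non-comparable rows do not distort optimization comparisons.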
+- No extra token-report artifact is written; use the listed artifact directory and the normal suite-run artifacts for debugging.
+- It is intended for agent loops such as prompt or skill minimization where stdout must stay strict and machine-readable.
+
 ## HTML reporter
 
 The built-in `html` reporter writes a self-contained `report.html` file to the suite run artifact directory.

diff --git a/skills/core.md b/skills/core.md
@@ -16,6 +16,7 @@ skillgym skills get core
 skillgym run <suite.ts>
 skillgym run <suite.ts> --case <id>
 skillgym run <suite.ts> --runner <runner-id>
+skillgym skills get token-optimization
 ```
 
 Typical agent loop:
@@ -85,6 +86,8 @@ Read the focused skills only when the task needs them:
   Use when benchmarking token regressions or updating snapshot baselines.
 - `skillgym skills get reporters`
   Use when choosing built-in reporters or wiring a custom reporter.
+- `skillgym skills get token-optimization`
+  Use when reducing billable token usage for one explicit prompt, suite, or skill target without broad refactors.
 
 ## Suggested authoring order
 

diff --git a/skills/reporters.md b/skills/reporters.md
@@ -12,15 +12,19 @@ Use this skill when choosing how benchmark results should be rendered or consume
 - `standard`
 - `json`
 - `json-summary`
+- `token-usage`
 - `github-actions`
+- `html`
 
 ## Main commands
 
 ```bash
 skillgym run <suite.ts> --reporter standard
 skillgym run <suite.ts> --reporter json
 skillgym run <suite.ts> --reporter json-summary
+skillgym run <suite.ts> --reporter token-usage
 skillgym run <suite.ts> --reporter github-actions
+skillgym run <suite.ts> --reporter html
 skillgym run <suite.ts> --reporter ./path/to/custom-reporter.ts
 ```
 
@@ -35,7 +39,9 @@ skillgym run <suite.ts> --reporter ./path/to/custom-reporter.ts
 - `standard`: default interactive CLI output for humans
 - `json`: full aggregated result on stdout for machine consumers
 - `json-summary`: trimmed result for post-processing or LLM consumption
+- `token-usage`: compact JSON billable summary for optimization loops and other agent consumers
 - `github-actions`: CI annotations and job summary output
+- `html`: self-contained artifact for manual result review
 
 ## Custom reporter shape
 
@@ -62,3 +68,5 @@ export default reporter;
 - `onError`
 
 Use `json-summary` when another agent or tool needs a smaller result than the full session report.
+
+Use `token-usage` when an agent needs strict compact JSON with one row per `case x runner`, comparable `billable` totals for provider-backed passed rows, and artifact paths for deeper debugging when a row fails.
diff --git a/skills/token-optimization.md b/skills/token-optimization.md
@@ -0,0 +1,68 @@
+---
+name: token-optimization
+description: Reduce billable token usage for one explicit Skillgym target. Covers baseline measurement with the token-usage reporter, minimal safe edits, verification loops, and when to fall back to artifacts.
+---
+
+# skillgym token-optimization
+
+Use this skill when the goal is to reduce billable token usage for one explicit target without breaking the benchmark.
+
+## Required input
+
+Start only when the optimization target is explicit.
+
+- valid targets: one prompt, one benchmark case, one suite slice, or one bundled skill/workflow file
+- if the target is missing, ask one short clarification question and wait
+
+## Optimization loop
+
+1. Identify the smallest protecting suite or case slice that proves the target still works.
+2. If none exists, create the smallest safe suite coverage first.
+3. Run a passing baseline with the compact reporter.
+4. Read the baseline JSON and note only comparable passed rows.
+5. Make the smallest safe metadata edit to the explicit target.
+6. Re-run the same slice.
+7. Compare before and after billable totals only on passed comparable rows.
+8. Stop when you hit the budget, reduction goal, or iteration limit.
+
+## Main commands
+
+```bash
+skillgym run <suite.ts> --reporter token-usage
+skillgym run <suite.ts> --case <id> --reporter token-usage
+skillgym run <suite.ts> --runner <runner-id> --reporter token-usage
+```
+
+## Rules
+
+- require a passing baseline before editing
+- keep stdout parsing on the `token-usage` reporter only
+- do not create a second detailed token report; use the normal artifact directory for debugging
+- failed rows still matter for diagnosis, but never count lower token usage on a failed row as a reduction
+- derived or unavailable token rows are not comparable; treat `billable: null` as non-comparable
+- prefer one safe minimization pass plus one verification run by default
+- re-run after every change instead of batching edits
+- keep edits scoped to the named target; avoid unrelated cleanup
+
+## How to compare runs
+
+- compare `rows[*].billable` for passed rows only
+- use top-level `billable` only when the before and after runs cover the same set of comparable rows
+- if a row fails, inspect the listed `artifacts` path and the standard run output before deciding what changed
+- if the baseline does not pass, fix benchmark stability first instead of claiming a token win
+
+## Good targets
+
+- tighten one prompt that causes repeated tool churn
+- shorten one bundled skill section that the agent reads every run
+- remove redundant instructions from one stable workflow
+
+## Do not use this skill for
+
+- broad benchmark rewrites
+- multi-target refactors with unclear attribution
+- unstable suites that are still failing for functional reasons
+
+## After stabilization
+
+If the behavior is stable and you want regression protection, add or refresh snapshots after the optimization work. Snapshots are optional follow-up protection, not part of the optimization loop itself.
diff --git a/src/cli.ts b/src/cli.ts
@@ -57,7 +57,10 @@ function createProgram(): Command {
       [],
     )
     .option("--runner <runner-id>", "Filter the configured runner set by runner id")
-    .option("--reporter <value>", "Use standard, json, github-actions, or override run.reporter")
+    .option(
+      "--reporter <value>",
+      "Use standard, json, json-summary, token-usage, github-actions, html, or override run.reporter",
+    )
     .option("--snapshots <path>", "Override the configured snapshot file path")
     .option("--update-snapshots", "Refresh snapshot baselines for the selected executions");
 

diff --git a/src/cli/help.ts b/src/cli/help.ts
@@ -31,7 +31,7 @@ ${theme.bold("Run Options:")}
   --case ${theme.accent("<id>")}            Filter the configured suite to one case id
   --tag ${theme.accent("<tag>")}            Filter cases by tag; repeat or comma-separate for OR matching
   --runner ${theme.accent("<runner-id>")}   Filter the configured runner set by runner id
-  --reporter ${theme.accent("<value>")}     Use ${theme.light("standard")}, ${theme.light("json")}, ${theme.light("github-actions")}, or override run.reporter
+  --reporter ${theme.accent("<value>")}     Use ${theme.light("standard")}, ${theme.light("json")}, ${theme.light("json-summary")}, ${theme.light("token-usage")}, ${theme.light("github-actions")}, ${theme.light("html")}, or override run.reporter
   --snapshots ${theme.accent("<path>")}     Override the configured snapshot file path
   --update-snapshots       Refresh snapshot baselines for the selected executions
 

diff --git a/src/index.ts b/src/index.ts
@@ -47,6 +47,7 @@ export {
   createGitHubActionsReporter,
   createHtmlReporter,
   createJsonReporter,
+  createTokenUsageReporter,
   createStandardReporter,
   loadReporter,
 } from "./reporters/index.js";

diff --git a/src/reporters/builtins.ts b/src/reporters/builtins.ts
@@ -2,6 +2,7 @@ export const BUILT_IN_REPORTER_NAMES = [
   "standard",
   "json",
   "json-summary",
+  "token-usage",
   "github-actions",
   "html",
 ] as const;

diff --git a/src/reporters/index.ts b/src/reporters/index.ts
@@ -16,3 +16,4 @@ export { createJsonReporter } from "./json.js";
 export { createJsonSummaryReporter } from "./json-summary.js";
 export { loadReporter } from "./load-reporter.js";
 export { createStandardReporter } from "./standard.js";
+export { createTokenUsageReporter } from "./token-usage.js";
diff --git a/src/reporters/load-reporter.ts b/src/reporters/load-reporter.ts
@@ -5,6 +5,7 @@ import { createHtmlReporter } from "./html.js";
 import { createJsonReporter } from "./json.js";
 import { createJsonSummaryReporter } from "./json-summary.js";
 import { createStandardReporter } from "./standard.js";
+import { createTokenUsageReporter } from "./token-usage.js";
 import { isBuiltInReporter } from "./builtins.js";
 import { importFromPath } from "../utils/import.js";
 
@@ -39,6 +40,8 @@ export async function loadReporter(
         return createJsonReporter();
       case "json-summary":
         return createJsonSummaryReporter();
+      case "token-usage":
+        return createTokenUsageReporter();
       case "github-actions":
         return createGitHubActionsReporter();
       case "html":

diff --git a/src/reporters/token-usage.ts b/src/reporters/token-usage.ts
@@ -0,0 +1,130 @@
+import process from "node:process";
+import type {
+  FailureClass,
+  RunnerResult,
+  RunnerResultStatus,
+  SuiteRunResult,
+} from "../domain/result.js";
+import type { UsageReport } from "../domain/session-report.js";
+import type { BenchmarkReporter } from "./contract.js";
+
+interface TokenUsageReporterOptions { // Options accepted by createTokenUsageReporter.
+  stdout?: Pick<NodeJS.WriteStream, "write">; // Write target; createTokenUsageReporter falls back to process.stdout when omitted.
+}
+// Token totals, used both per row and as the top-level suite aggregate.
+interface TokenBillable {
+  sum: number; // Row: avg multiplied by the repetition count used by createBillable. Suite: sum of the row sums.
+  avg: number; // Row: result.report.usage.totalTokens. Suite: mean of the row avgs.
+}
+// How classifyUsage labels a usage report: "unavailable" when totalTokens is undefined, "provider" when both input and output sources are "provider", otherwise "derived".
+type TokenUsageKind = "provider" | "derived" | "unavailable";
+// Reduced error shape; summarizeRow copies only these two fields from result.error.
+interface TokenUsageError {
+  name: string;
+  message: string;
+}
+// One output row per case x runner result, built by summarizeRow.
+interface TokenUsageRow {
+  case: string; // The case id passed to summarizeRow.
+  runner: string; // result.runner.id.
+  passed: boolean; // True only when status is "passed".
+  status: RunnerResultStatus;
+  usage: TokenUsageKind;
+  billable: TokenBillable | null; // Null unless the row passed and usage is "provider".
+  failureOrigin?: RunnerResult["failureOrigin"]; // Present only when defined on the runner result.
+  failureClass?: FailureClass; // Present only when defined on the runner result.
+  error?: TokenUsageError; // Present only when the runner result carries an error.
+}
+// The single JSON object the reporter writes when the suite finishes.
+interface TokenUsageSummary {
+  passed: boolean; // True when every case result reports passed.
+  billable: TokenBillable | null; // Null when no row has a non-null billable.
+  artifacts: string; // result.suiteRunArtifactDir.
+  rows: TokenUsageRow[];
+}
+
+/**
+ * Builds the built-in `token-usage` reporter.
+ *
+ * The returned reporter implements only `onSuiteFinish`: it summarizes the
+ * finished suite run and writes that summary as one newline-terminated JSON
+ * line.
+ *
+ * @param options - Optional overrides; `options.stdout` replaces `process.stdout` as the write target.
+ * @returns A reporter whose `onSuiteFinish` hook emits the compact token summary.
+ */
+export function createTokenUsageReporter(
+  options: TokenUsageReporterOptions = {},
+): BenchmarkReporter {
+  // Resolve the sink once, at creation time, not on every suite finish.
+  const sink = options.stdout ?? process.stdout;
+
+  const reporter: BenchmarkReporter = {
+    onSuiteFinish({ result }) {
+      const summaryJson = JSON.stringify(summarizeSuiteRun(result));
+      sink.write(`${summaryJson}\n`);
+    },
+  };
+
+  return reporter;
+}
+
+/**
+ * Flattens a suite run into one row per case x runner result and aggregates
+ * the billable totals of the rows that have them.
+ *
+ * @param result - The finished suite run.
+ * @returns The summary object: overall pass flag, aggregate billable totals
+ *   (or `null` when no row has billable totals), artifact directory, and rows.
+ */
+function summarizeSuiteRun(result: SuiteRunResult): TokenUsageSummary {
+  const rows: TokenUsageRow[] = [];
+  for (const caseResult of result.cases) {
+    for (const runnerResult of caseResult.runnerResults) {
+      rows.push(summarizeRow(caseResult.caseId, runnerResult));
+    }
+  }
+
+  // Only rows with a non-null billable contribute to the aggregate.
+  let comparableRowCount = 0;
+  let sumTotal = 0;
+  let avgTotal = 0;
+  for (const { billable } of rows) {
+    if (billable === null) {
+      continue;
+    }
+    comparableRowCount += 1;
+    sumTotal += billable.sum;
+    avgTotal += billable.avg;
+  }
+
+  // The suite-level avg is the unweighted mean of the per-row averages.
+  const suiteBillable: TokenBillable | null =
+    comparableRowCount === 0 ? null : { sum: sumTotal, avg: avgTotal / comparableRowCount };
+
+  return {
+    passed: result.cases.every(({ passed }) => passed),
+    billable: suiteBillable,
+    artifacts: result.suiteRunArtifactDir,
+    rows,
+  };
+}
+
+/**
+ * Builds the output row for one runner result of one case.
+ *
+ * `billable` is filled only when the runner passed and its usage classifies
+ * as "provider"; otherwise it is `null`. The optional `failureOrigin`,
+ * `failureClass`, and `error` keys are added only when the runner result
+ * defines them.
+ *
+ * @param caseId - Identifier of the case the runner result belongs to.
+ * @param result - The runner result to summarize.
+ * @returns The row for this case x runner pair.
+ */
+function summarizeRow(caseId: string, result: RunnerResult): TokenUsageRow {
+  const usageKind = classifyUsage(result.report.usage);
+  const didPass = result.status === "passed";
+
+  const row: TokenUsageRow = {
+    case: caseId,
+    runner: result.runner.id,
+    passed: didPass,
+    status: result.status,
+    usage: usageKind,
+    billable: didPass && usageKind === "provider" ? createBillable(result) : null,
+  };
+
+  // Optional keys are appended in a fixed order so the serialized key order stays stable.
+  if (result.failureOrigin !== undefined) {
+    row.failureOrigin = result.failureOrigin;
+  }
+  if (result.failureClass !== undefined) {
+    row.failureClass = result.failureClass;
+  }
+  if (result.error !== undefined) {
+    // Copy only name and message from the error.
+    row.error = { name: result.error.name, message: result.error.message };
+  }
+
+  return row;
+}
+
+/**
+ * Labels a usage report by where its token totals come from.
+ *
+ * @param usage - The usage section of a runner's session report.
+ * @returns "unavailable" when `totalTokens` is undefined, "provider" when both
+ *   the input and output sources are "provider", and "derived" otherwise.
+ */
+function classifyUsage(usage: UsageReport): TokenUsageKind {
+  if (usage.totalTokens === undefined) {
+    return "unavailable";
+  }
+
+  const { input, output } = usage.source;
+  const isProviderBacked = input === "provider" && output === "provider";
+
+  return isProviderBacked ? "provider" : "derived";
+}
+
+/**
+ * Computes the billable totals for one runner result.
+ *
+ * `avg` is the report's `totalTokens`. `sum` multiplies it by a repetition
+ * count: 1 when `result.repetitions` is undefined, otherwise
+ * `result.successfulRepetitions` (treated as 0 when nullish) clamped to at least 1.
+ *
+ * @param result - The runner result to read token usage from.
+ * @returns The `{ sum, avg }` pair for the row.
+ * @throws Error when the report has no `totalTokens`.
+ */
+function createBillable(result: RunnerResult): TokenBillable {
+  const { totalTokens } = result.report.usage;
+  if (totalTokens === undefined) {
+    throw new Error(`Missing billable token usage for runner ${result.runner.id}.`);
+  }
+
+  let repetitionCount = 1;
+  if (result.repetitions !== undefined) {
+    repetitionCount = Math.max(result.successfulRepetitions ?? 0, 1);
+  }
+
+  return { sum: totalTokens * repetitionCount, avg: totalTokens };
+}
diff --git a/test/cli.test.ts b/test/cli.test.ts
@@ -63,6 +63,7 @@ test("cli skills list prints bundled skill names", async () => {
   expect(result.stdout).toContain("core");
   expect(result.stdout).toContain("reporters");
   expect(result.stdout).toContain("snapshots");
+  expect(result.stdout).toContain("token-optimization");
   expect(result.stdout).toContain("cases");
   expect(result.stdout).toContain("workspaces");
 });
@@ -77,6 +78,16 @@ test("cli skills get core prints the bundled core skill", async () => {
   expect(result.stdout).toContain("skillgym run <suite.ts>");
 });
 
+test("cli skills get token-optimization prints the bundled optimization skill", async () => {
+  // Invoke `skills get token-optimization` through execCli and inspect its captured output.
+  const { exitCode, stderr, stdout } = await execCli(["skills", "get", "token-optimization"]);
+
+  expect(exitCode).toBe(0);
+  expect(stderr).toBe("");
+
+  // The heading, the main command, and one rule line must all be present.
+  const expectedSnippets = [
+    "# skillgym token-optimization",
+    "skillgym run <suite.ts> --reporter token-usage",
+    "require a passing baseline before editing",
+  ];
+  for (const snippet of expectedSnippets) {
+    expect(stdout).toContain(snippet);
+  }
+});
+
 test("cli skills get reports missing skill name without printing MOTD banner", async () => {
   const result = await execCli(["skills", "get"]);
 

diff --git a/test/reporters/load-reporter.test.ts b/test/reporters/load-reporter.test.ts
@@ -42,6 +42,13 @@ describe("loadReporter", () => {
     expect(reporter.onRunnerFinish).toBeUndefined();
   });
 
+  test("resolves built-in reporter when token-usage is provided", async () => {
+    // The token-usage reporter should expose onSuiteFinish and leave onRunnerStart unset.
+    const { onSuiteFinish, onRunnerStart } = await loadReporter("token-usage", tempDir);
+
+    expect(typeof onSuiteFinish).toBe("function");
+    expect(onRunnerStart).toBeUndefined();
+  });
+
   test("resolves built-in reporter when html is provided", async () => {
     const reporter = await loadReporter("html", tempDir);