From e478ce2d687ad2b82ec60f17296945dcaa4f3ef9 Mon Sep 17 00:00:00 2001
From: Szymon Chmal <szymon@chmal.it>
Date: Tue, 19 May 2026 09:14:51 +0200
Subject: [PATCH 1/2] Add built-in token usage reporter

---
 src/cli.ts                                 |   5 +-
 src/cli/help.ts                            |   2 +-
 src/index.ts                               |   1 +
 src/reporters/builtins.ts                  |   1 +
 src/reporters/index.ts                     |   1 +
 src/reporters/load-reporter.ts             |   3 +
 src/reporters/token-usage.ts               | 130 ++++++++
 test/reporters/load-reporter.test.ts       |   7 +
 test/reporters/token-usage.test.ts         | 357 +++++++++++++++++++++
 test/runner/execute-suite.reporter.test.ts |   2 +
 test/runner/model-rejection.test.ts        |   1 +
 11 files changed, 508 insertions(+), 2 deletions(-)
 create mode 100644 src/reporters/token-usage.ts
 create mode 100644 test/reporters/token-usage.test.ts
diff --git a/src/cli.ts b/src/cli.ts
index 6b88368..9e786cd 100644
--- a/src/cli.ts
+++ b/src/cli.ts
@@ -57,7 +57,10 @@ function createProgram(): Command {
       [],
     )
     .option("--runner <runner-id>", "Filter the configured runner set by runner id")
-    .option("--reporter <value>", "Use standard, json, github-actions, or override run.reporter")
+    .option(
+      "--reporter <value>",
+      "Use standard, json, json-summary, token-usage, github-actions, html, or override run.reporter",
+    )
     .option("--snapshots <path>", "Override the configured snapshot file path")
     .option("--update-snapshots", "Refresh snapshot baselines for the selected executions");
 
diff --git a/src/cli/help.ts b/src/cli/help.ts
index 773d327..59047d3 100644
--- a/src/cli/help.ts
+++ b/src/cli/help.ts
@@ -31,7 +31,7 @@ ${theme.bold("Run Options:")}
   --case ${theme.accent("<id>")}            Filter the configured suite to one case id
   --tag ${theme.accent("<tag>")}            Filter cases by tag; repeat or comma-separate for OR matching
   --runner ${theme.accent("<runner-id>")}   Filter the configured runner set by runner id
-  --reporter ${theme.accent("<value>")}     Use ${theme.light("standard")}, ${theme.light("json")}, ${theme.light("github-actions")}, or override run.reporter
+  --reporter ${theme.accent("<value>")}     Use ${theme.light("standard")}, ${theme.light("json")}, ${theme.light("json-summary")}, ${theme.light("token-usage")}, ${theme.light("github-actions")}, ${theme.light("html")}, or override run.reporter
   --snapshots ${theme.accent("<path>")}     Override the configured snapshot file path
   --update-snapshots       Refresh snapshot baselines for the selected executions
 
diff --git a/src/index.ts b/src/index.ts
index 29df1a3..b16e2d4 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -47,6 +47,7 @@ export {
   createGitHubActionsReporter,
   createHtmlReporter,
   createJsonReporter,
+  createTokenUsageReporter,
   createStandardReporter,
   loadReporter,
 } from "./reporters/index.js";
diff --git a/src/reporters/builtins.ts b/src/reporters/builtins.ts
index aa2d585..b5d29e6 100644
--- a/src/reporters/builtins.ts
+++ b/src/reporters/builtins.ts
@@ -2,6 +2,7 @@ export const BUILT_IN_REPORTER_NAMES = [
   "standard",
   "json",
   "json-summary",
+  "token-usage",
   "github-actions",
   "html",
 ] as const;
diff --git a/src/reporters/index.ts b/src/reporters/index.ts
index eff8f07..bafbbfa 100644
--- a/src/reporters/index.ts
+++ b/src/reporters/index.ts
@@ -16,3 +16,4 @@ export { createJsonReporter } from "./json.js";
 export { createJsonSummaryReporter } from "./json-summary.js";
 export { loadReporter } from "./load-reporter.js";
 export { createStandardReporter } from "./standard.js";
+export { createTokenUsageReporter } from "./token-usage.js";
diff --git a/src/reporters/load-reporter.ts b/src/reporters/load-reporter.ts
index b94c140..f86c85c 100644
--- a/src/reporters/load-reporter.ts
+++ b/src/reporters/load-reporter.ts
@@ -5,6 +5,7 @@ import { createHtmlReporter } from "./html.js";
 import { createJsonReporter } from "./json.js";
 import { createJsonSummaryReporter } from "./json-summary.js";
 import { createStandardReporter } from "./standard.js";
+import { createTokenUsageReporter } from "./token-usage.js";
 import { isBuiltInReporter } from "./builtins.js";
 import { importFromPath } from "../utils/import.js";
 
@@ -39,6 +40,8 @@ export async function loadReporter(
         return createJsonReporter();
       case "json-summary":
         return createJsonSummaryReporter();
+      case "token-usage":
+        return createTokenUsageReporter();
       case "github-actions":
         return createGitHubActionsReporter();
       case "html":
diff --git a/src/reporters/token-usage.ts b/src/reporters/token-usage.ts
new file mode 100644
index 0000000..6915f98
--- /dev/null
+++ b/src/reporters/token-usage.ts
@@ -0,0 +1,130 @@
+import process from "node:process";
+import type {
+  FailureClass,
+  RunnerResult,
+  RunnerResultStatus,
+  SuiteRunResult,
+} from "../domain/result.js";
+import type { UsageReport } from "../domain/session-report.js";
+import type { BenchmarkReporter } from "./contract.js";
+
+interface TokenUsageReporterOptions {
+  stdout?: Pick<NodeJS.WriteStream, "write">;
+}
+
+interface TokenBillable {
+  sum: number;
+  avg: number;
+}
+
+type TokenUsageKind = "provider" | "derived" | "unavailable";
+
+interface TokenUsageError {
+  name: string;
+  message: string;
+}
+
+interface TokenUsageRow {
+  case: string;
+  runner: string;
+  passed: boolean;
+  status: RunnerResultStatus;
+  usage: TokenUsageKind;
+  billable: TokenBillable | null;
+  failureOrigin?: RunnerResult["failureOrigin"];
+  failureClass?: FailureClass;
+  error?: TokenUsageError;
+}
+
+interface TokenUsageSummary {
+  passed: boolean;
+  billable: TokenBillable | null;
+  artifacts: string;
+  rows: TokenUsageRow[];
+}
+
+export function createTokenUsageReporter(
+  options: TokenUsageReporterOptions = {},
+): BenchmarkReporter {
+  const stdout = options.stdout ?? process.stdout;
+
+  return {
+    onSuiteFinish(event) {
+      stdout.write(`${JSON.stringify(summarizeSuiteRun(event.result))}\n`);
+    },
+  };
+}
+
+function summarizeSuiteRun(result: SuiteRunResult): TokenUsageSummary {
+  const rows = result.cases.flatMap((caseResult) =>
+    caseResult.runnerResults.map((runnerResult) => summarizeRow(caseResult.caseId, runnerResult)),
+  );
+  const billableRows = rows.filter(
+    (row): row is TokenUsageRow & { billable: TokenBillable } => row.billable !== null,
+  );
+
+  return {
+    passed: result.cases.every((caseResult) => caseResult.passed),
+    billable:
+      billableRows.length === 0
+        ? null
+        : {
+            sum: billableRows.reduce((sum, row) => sum + row.billable.sum, 0),
+            avg: billableRows.reduce((sum, row) => sum + row.billable.avg, 0) / billableRows.length,
+          },
+    artifacts: result.suiteRunArtifactDir,
+    rows,
+  };
+}
+
+function summarizeRow(caseId: string, result: RunnerResult): TokenUsageRow {
+  const usage = classifyUsage(result.report.usage);
+  const passed = result.status === "passed";
+
+  return {
+    case: caseId,
+    runner: result.runner.id,
+    passed,
+    status: result.status,
+    usage,
+    billable: passed && usage === "provider" ? createBillable(result) : null,
+    ...(result.failureOrigin === undefined ? {} : { failureOrigin: result.failureOrigin }),
+    ...(result.failureClass === undefined ? {} : { failureClass: result.failureClass }),
+    ...(result.error === undefined
+      ? {}
+      : {
+          error: {
+            name: result.error.name,
+            message: result.error.message,
+          },
+        }),
+  };
+}
+
+function classifyUsage(usage: UsageReport): TokenUsageKind {
+  if (usage.totalTokens === undefined) {
+    return "unavailable";
+  }
+
+  if (usage.source.input === "provider" && usage.source.output === "provider") {
+    return "provider";
+  }
+
+  return "derived";
+}
+
+function createBillable(result: RunnerResult): TokenBillable {
+  const avg = result.report.usage.totalTokens;
+
+  if (avg === undefined) {
+    throw new Error(`Missing billable token usage for runner ${result.runner.id}.`);
+  }
+
+  const repetitions =
+    result.repetitions === undefined ? 1 : Math.max(result.successfulRepetitions ?? 0, 1);
+
+  return {
+    sum: avg * repetitions,
+    avg,
+  };
+}
diff --git a/test/reporters/load-reporter.test.ts b/test/reporters/load-reporter.test.ts
index 22cc74d..9404639 100644
--- a/test/reporters/load-reporter.test.ts
+++ b/test/reporters/load-reporter.test.ts
@@ -42,6 +42,13 @@ describe("loadReporter", () => {
     expect(reporter.onRunnerFinish).toBeUndefined();
   });
 
+  test("resolves built-in reporter when token-usage is provided", async () => {
+    const reporter = await loadReporter("token-usage", tempDir);
+
+    expect(typeof reporter.onSuiteFinish).toBe("function");
+    expect(reporter.onRunnerStart).toBeUndefined();
+  });
+
   test("resolves built-in reporter when html is provided", async () => {
     const reporter = await loadReporter("html", tempDir);
 
diff --git a/test/reporters/token-usage.test.ts b/test/reporters/token-usage.test.ts
new file mode 100644
index 0000000..519a66c
--- /dev/null
+++ b/test/reporters/token-usage.test.ts
@@ -0,0 +1,357 @@
+import { expect, test } from "vitest";
+import type { RunnerResult, SuiteRunResult } from "../../src/index.js";
+import { createTokenUsageReporter } from "../../src/reporters/token-usage.js";
+import { createRunnerInfo } from "../../src/runner/runner-info.js";
+import { createSessionReport } from "../helpers/session-report.js";
+
+test("token-usage reporter summarizes comparable billable rows as strict JSON", async () => {
+  const writes: string[] = [];
+  const reporter = createTokenUsageReporter({
+    stdout: {
+      write(chunk: string) {
+        writes.push(chunk);
+        return true;
+      },
+    },
+  });
+
+  const mainRunner = createRunnerInfo("open-main", { type: "opencode", model: "openai/gpt-5" });
+  const fallbackRunner = createRunnerInfo("open-fallback", {
+    type: "opencode",
+    model: "openai/gpt-5-mini",
+  });
+  const result: SuiteRunResult = {
+    suitePath: "examples/basic-suite.ts",
+    startedAt: "2026-04-02T12:00:00.000Z",
+    endedAt: "2026-04-02T12:01:00.000Z",
+    durationMs: 60_000,
+    suiteRunArtifactDir: ".skillgym-results/run-1",
+    declaredTags: [],
+    selectedTags: [],
+    cases: [
+      {
+        caseId: "case-a",
+        tags: [],
+        passed: true,
+        runnerResults: [
+          createRunnerResult(mainRunner, {
+            status: "passed",
+            usage: {
+              inputTokens: 1000,
+              outputTokens: 150,
+              reasoningTokens: 50,
+              cacheTokens: 0,
+              totalTokens: 1200,
+              source: { input: "provider", output: "provider", reasoning: "provider" },
+            },
+          }),
+          createRunnerResult(fallbackRunner, {
+            status: "passed",
+            successfulRepetitions: 2,
+            repetitions: [
+              createRepetitionResult(fallbackRunner, 1, {
+                inputTokens: 1700,
+                outputTokens: 250,
+                reasoningTokens: 150,
+                cacheTokens: 0,
+                totalTokens: 2100,
+                source: { input: "provider", output: "provider", reasoning: "provider" },
+              }),
+              createRepetitionResult(fallbackRunner, 2, {
+                inputTokens: 1700,
+                outputTokens: 250,
+                reasoningTokens: 150,
+                cacheTokens: 0,
+                totalTokens: 2100,
+                source: { input: "provider", output: "provider", reasoning: "provider" },
+              }),
+            ],
+            usage: {
+              inputTokens: 1700,
+              outputTokens: 250,
+              reasoningTokens: 150,
+              cacheTokens: 0,
+              totalTokens: 2100,
+              source: { input: "provider", output: "provider", reasoning: "provider" },
+            },
+          }),
+        ],
+      },
+      {
+        caseId: "case-b",
+        tags: [],
+        passed: false,
+        runnerResults: [
+          createRunnerResult(mainRunner, {
+            status: "failed",
+            failureOrigin: "assertion",
+            failureClass: { id: "missing-rule", label: "Missing rule" },
+            error: { name: "AssertionError", message: "expected prompt to keep critical rule" },
+            usage: {
+              inputTokens: 900,
+              outputTokens: 200,
+              reasoningTokens: 100,
+              cacheTokens: 0,
+              totalTokens: 1200,
+              source: { input: "provider", output: "provider", reasoning: "provider" },
+            },
+          }),
+          createRunnerResult(fallbackRunner, {
+            status: "passed",
+            usage: {
+              inputTokens: 600,
+              outputTokens: 150,
+              reasoningTokens: 40,
+              cacheTokens: 0,
+              totalTokens: 790,
+              source: { input: "derived", output: "derived", reasoning: "derived" },
+            },
+          }),
+        ],
+      },
+      {
+        caseId: "case-c",
+        tags: [],
+        passed: true,
+        runnerResults: [
+          createRunnerResult(mainRunner, {
+            status: "passed",
+            usage: {
+              inputTokens: undefined,
+              outputTokens: undefined,
+              reasoningTokens: undefined,
+              cacheTokens: undefined,
+              totalTokens: undefined,
+              source: { input: "chars", output: "chars", reasoning: "chars" },
+            },
+          }),
+        ],
+      },
+    ],
+    runners: [
+      {
+        runner: mainRunner,
+        totalCases: 3,
+        passedCases: 2,
+        successRate: 2 / 3,
+        averageDurationMs: 1000,
+        averageTotalTokens: 1200,
+      },
+      {
+        runner: fallbackRunner,
+        totalCases: 2,
+        passedCases: 2,
+        successRate: 1,
+        averageDurationMs: 1000,
+        averageTotalTokens: 1445,
+      },
+    ],
+  };
+
+  const context = {
+    isInteractive: false,
+    cwd: "/workspace",
+    workspaceMode: "shared" as const,
+    suitePath: result.suitePath,
+    suiteRunArtifactDir: result.suiteRunArtifactDir,
+    selectedCaseCount: 3,
+    selectedRunnerCount: 2,
+    selectedExecutionCount: 5,
+    scheduleMode: "serial" as const,
+    maxParallel: 1,
+    declaredTags: [],
+  };
+
+  await reporter.onSuiteFinish?.({ context, result });
+
+  expect(writes).toHaveLength(1);
+  expect(() => JSON.parse(writes[0]!)).not.toThrow();
+
+  const output = JSON.parse(writes[0]!) as {
+    passed: boolean;
+    billable: { sum: number; avg: number } | null;
+    artifacts: string;
+    rows: Array<{
+      case: string;
+      runner: string;
+      passed: boolean;
+      status: string;
+      usage: string;
+      billable: { sum: number; avg: number } | null;
+      failureOrigin?: string;
+      failureClass?: { id: string; label?: string };
+      error?: { name: string; message: string };
+      artifactDir?: string;
+    }>;
+  };
+
+  expect(output.passed).toBe(false);
+  expect(output.billable).toEqual({ sum: 5400, avg: 1650 });
+  expect(output.artifacts).toBe(".skillgym-results/run-1");
+  expect(output.rows).toEqual([
+    {
+      case: "case-a",
+      runner: "open-main",
+      passed: true,
+      status: "passed",
+      usage: "provider",
+      billable: { sum: 1200, avg: 1200 },
+    },
+    {
+      case: "case-a",
+      runner: "open-fallback",
+      passed: true,
+      status: "passed",
+      usage: "provider",
+      billable: { sum: 4200, avg: 2100 },
+    },
+    {
+      case: "case-b",
+      runner: "open-main",
+      passed: false,
+      status: "failed",
+      usage: "provider",
+      billable: null,
+      failureOrigin: "assertion",
+      failureClass: { id: "missing-rule", label: "Missing rule" },
+      error: { name: "AssertionError", message: "expected prompt to keep critical rule" },
+    },
+    {
+      case: "case-b",
+      runner: "open-fallback",
+      passed: true,
+      status: "passed",
+      usage: "derived",
+      billable: null,
+    },
+    {
+      case: "case-c",
+      runner: "open-main",
+      passed: true,
+      status: "passed",
+      usage: "unavailable",
+      billable: null,
+    },
+  ]);
+
+  for (const row of output.rows) {
+    expect(row.artifactDir).toBeUndefined();
+  }
+});
+
+test("token-usage reporter is silent until suite finishes", async () => {
+  const writes: string[] = [];
+  const reporter = createTokenUsageReporter({
+    stdout: {
+      write(chunk: string) {
+        writes.push(chunk);
+        return true;
+      },
+    },
+  });
+
+  const runner = createRunnerInfo("open-main", { type: "opencode", model: "openai/gpt-5" });
+  const context = {
+    isInteractive: false,
+    cwd: "/workspace",
+    workspaceMode: "shared" as const,
+    suitePath: "suite.ts",
+    suiteRunArtifactDir: ".skillgym-results/run-1",
+    selectedCaseCount: 1,
+    selectedRunnerCount: 1,
+    selectedExecutionCount: 1,
+    scheduleMode: "serial" as const,
+    maxParallel: 1,
+    declaredTags: [],
+  };
+
+  await reporter.onSuiteStart?.({
+    context,
+    cases: [],
+    runners: [runner],
+    startedAt: "2026-04-02T12:00:00.000Z",
+  });
+  await reporter.onRunnerStart?.({
+    context,
+    case: { id: "case-a", prompt: "", assert() {} },
+    runner,
+    caseIndex: 1,
+    totalCases: 1,
+  });
+
+  expect(writes).toHaveLength(0);
+});
+
+function createRunnerResult(
+  runner: ReturnType<typeof createRunnerInfo>,
+  options: {
+    status: RunnerResult["status"];
+    usage: {
+      inputTokens?: number;
+      outputTokens?: number;
+      reasoningTokens?: number;
+      cacheTokens?: number;
+      totalTokens?: number;
+      source: {
+        input: "provider" | "derived" | "chars";
+        output: "provider" | "derived" | "chars";
+        reasoning: "provider" | "derived" | "chars";
+      };
+    };
+    successfulRepetitions?: number;
+    repetitions?: RunnerResult["repetitions"];
+    error?: RunnerResult["error"];
+    failureOrigin?: RunnerResult["failureOrigin"];
+    failureClass?: RunnerResult["failureClass"];
+  },
+): RunnerResult {
+  return {
+    runner,
+    passed: options.status === "passed",
+    status: options.status,
+    durationMs: 1000,
+    executionArtifactDir: ".skillgym-results/run-1/execution",
+    artifactDir: ".skillgym-results/run-1/execution",
+    report: createSessionReport({
+      runner,
+      usage: {
+        inputTokens: options.usage.inputTokens,
+        outputTokens: options.usage.outputTokens,
+        reasoningTokens: options.usage.reasoningTokens,
+        cacheTokens: options.usage.cacheTokens,
+        totalTokens: options.usage.totalTokens,
+        inputChars: 100,
+        outputChars: 40,
+        reasoningChars: 20,
+        source: options.usage.source,
+      },
+    }),
+    successfulRepetitions: options.successfulRepetitions,
+    repetitions: options.repetitions,
+    error: options.error,
+    failureOrigin: options.failureOrigin,
+    failureClass: options.failureClass,
+  };
+}
+
+function createRepetitionResult(
+  runner: ReturnType<typeof createRunnerInfo>,
+  repetition: number,
+  usage: {
+    inputTokens?: number;
+    outputTokens?: number;
+    reasoningTokens?: number;
+    cacheTokens?: number;
+    totalTokens?: number;
+    source: {
+      input: "provider" | "derived" | "chars";
+      output: "provider" | "derived" | "chars";
+      reasoning: "provider" | "derived" | "chars";
+    };
+  },
+): NonNullable<RunnerResult["repetitions"]>[number] {
+  return {
+    ...createRunnerResult(runner, { status: "passed", usage }),
+    repetition,
+  };
+}
diff --git a/test/runner/execute-suite.reporter.test.ts b/test/runner/execute-suite.reporter.test.ts
index 79eb23e..dffdd32 100644
--- a/test/runner/execute-suite.reporter.test.ts
+++ b/test/runner/execute-suite.reporter.test.ts
@@ -1483,6 +1483,8 @@ test("executeSuite raises process max listeners for parallel runs and restores i
           passed: true,
           durationMs: 10,
           executionArtifactDir: options.artifactDir,
+          outputTokens: 0,
+          observedReads: 0,
         });
       },
     });
diff --git a/test/runner/model-rejection.test.ts b/test/runner/model-rejection.test.ts
index d3c6200..7371896 100644
--- a/test/runner/model-rejection.test.ts
+++ b/test/runner/model-rejection.test.ts
@@ -102,6 +102,7 @@ async function createResultWithLogs(options: {
     passed: false,
     status: "failed",
     durationMs: 1,
+    executionArtifactDir: artifactDir,
     artifactDir,
     error: {
       name: "Error",

From 5865536ef26de26f47a4e5036ef631c2346a4a85 Mon Sep 17 00:00:00 2001
From: Szymon Chmal <szymon@chmal.it>
Date: Tue, 19 May 2026 09:16:27 +0200
Subject: [PATCH 2/2] Document token optimization workflow

---
 DICTIONARY.md                |  2 ++
 docs/reporters.md            | 15 +++++++-
 skills/core.md               |  3 ++
 skills/reporters.md          |  8 +++++
 skills/token-optimization.md | 68 ++++++++++++++++++++++++++++++++++++
 test/cli.test.ts             | 11 ++++++
 6 files changed, 106 insertions(+), 1 deletion(-)
 create mode 100644 skills/token-optimization.md

diff --git a/DICTIONARY.md b/DICTIONARY.md
index af5c711..43973be 100644
--- a/DICTIONARY.md
+++ b/DICTIONARY.md
@@ -36,6 +36,8 @@ Before adding or changing a term, check this file and ask the user for approval.
 - `workspace bootstrap`: Command run in a provisioned workspace before the agent starts.
 - `schedule`: Execution ordering and concurrency policy.
 - `reporter`: Component rendering suite-run progress and results.
+- `token-usage`: Built-in reporter that emits compact JSON billable-token summaries for agent consumption.
+- `token-optimization`: Bundled skill that guides an agent through baseline, minimal edits, and before/after token comparison.
 - `skill detection`: Evidence that a skill was used, with confidence and evidence.
 - `session event`: Normalized event observed during a session.
 - `snapshot baseline`: Stored token baseline for a benchmark case x runner pair.
diff --git a/docs/reporters.md b/docs/reporters.md
index fa05668..b099aec 100644
--- a/docs/reporters.md
+++ b/docs/reporters.md
@@ -10,6 +10,7 @@ Execution, aggregation, and `results.json` writing stay in the runner. Reporters
 skillgym run <suite.ts> --reporter standard
 skillgym run <suite.ts> --reporter json
 skillgym run <suite.ts> --reporter json-summary
+skillgym run <suite.ts> --reporter token-usage
 skillgym run <suite.ts> --reporter github-actions
 skillgym run <suite.ts> --reporter html
 skillgym run <suite.ts> --reporter ./examples/custom-reporter.ts
@@ -17,7 +18,7 @@ skillgym run <suite.ts> --schedule isolated-by-runner --max-parallel 4
 ```
 
 - Omitting `--reporter` uses the built-in `standard` reporter.
-- Built-in reporters are `standard`, `json`, `json-summary`, `github-actions`, and `html`.
+- Built-in reporters are `standard`, `json`, `json-summary`, `token-usage`, `github-actions`, and `html`.
 - Relative paths resolve from `process.cwd()`.
 
 ## Config
@@ -138,6 +139,18 @@ The built-in `json-summary` reporter writes a trimmed JSON summary to stdout —
 - Per-runner results include `failureClass` when present so downstream tooling can keep grouped-failure semantics.
 - It is useful for post-run analysis steps or feeding results to an LLM.
 
+## Token usage reporter
+
+The built-in `token-usage` reporter writes one compact JSON object to stdout at `onSuiteFinish`.
+
+- It emits top-level `passed`, `billable`, `artifacts`, and `rows` fields.
+- Each row represents one `case x runner` result.
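+- Passed rows with provider-backed normalized totals expose `billable: { sum, avg }`, where `avg` is the row's reported total tokens and `sum` is `avg` multiplied by the number of successful repetitions (1 when the row has no repetitions).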
+- Failed rows and rows with derived or unavailable totals stay in the output with `billable: null`.
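+- Top-level `billable` aggregates only passed comparable rows (`sum` adds the row sums, `avg` is the unweighted mean of the row averages) and is `null` when no row qualifies, so failed or non-comparable rows do not distort optimization comparisons.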
+- No extra token-report artifact is written; use the listed artifact directory and the normal suite-run artifacts for debugging.
+- It is intended for agent loops such as prompt or skill minimization where stdout must stay strict and machine-readable.
+
 ## HTML reporter
 
 The built-in `html` reporter writes a self-contained `report.html` file to the suite run artifact directory.
diff --git a/skills/core.md b/skills/core.md
index 1d98660..5675dbf 100644
--- a/skills/core.md
+++ b/skills/core.md
@@ -16,6 +16,7 @@ skillgym skills get core
 skillgym run <suite.ts>
 skillgym run <suite.ts> --case <id>
 skillgym run <suite.ts> --runner <runner-id>
+skillgym skills get token-optimization
 ```
 
 Typical agent loop:
@@ -85,6 +86,8 @@ Read the focused skills only when the task needs them:
   Use when benchmarking token regressions or updating snapshot baselines.
 - `skillgym skills get reporters`
   Use when choosing built-in reporters or wiring a custom reporter.
+- `skillgym skills get token-optimization`
+  Use when reducing billable token usage for one explicit prompt, suite, or skill target without broad refactors.
 
 ## Suggested authoring order
 
diff --git a/skills/reporters.md b/skills/reporters.md
index 8a1c34e..793bbf1 100644
--- a/skills/reporters.md
+++ b/skills/reporters.md
@@ -12,7 +12,9 @@ Use this skill when choosing how benchmark results should be rendered or consume
 - `standard`
 - `json`
 - `json-summary`
+- `token-usage`
 - `github-actions`
+- `html`
 
 ## Main commands
 
@@ -20,7 +22,9 @@ Use this skill when choosing how benchmark results should be rendered or consume
 skillgym run <suite.ts> --reporter standard
 skillgym run <suite.ts> --reporter json
 skillgym run <suite.ts> --reporter json-summary
+skillgym run <suite.ts> --reporter token-usage
 skillgym run <suite.ts> --reporter github-actions
+skillgym run <suite.ts> --reporter html
 skillgym run <suite.ts> --reporter ./path/to/custom-reporter.ts
 ```
 
@@ -35,7 +39,9 @@ skillgym run <suite.ts> --reporter ./path/to/custom-reporter.ts
 - `standard`: default interactive CLI output for humans
 - `json`: full aggregated result on stdout for machine consumers
 - `json-summary`: trimmed result for post-processing or LLM consumption
+- `token-usage`: compact JSON billable summary for optimization loops and other agent consumers
 - `github-actions`: CI annotations and job summary output
+- `html`: self-contained artifact for manual result review
 
 ## Custom reporter shape
 
@@ -62,3 +68,5 @@ export default reporter;
 - `onError`
 
 Use `json-summary` when another agent or tool needs a smaller result than the full session report.
+
+Use `token-usage` when an agent needs strict compact JSON with one row per `case x runner`, comparable `billable` totals for provider-backed passed rows, and the suite-run artifact directory path for deeper debugging when a row fails.
diff --git a/skills/token-optimization.md b/skills/token-optimization.md
new file mode 100644
index 0000000..ba3219e
--- /dev/null
+++ b/skills/token-optimization.md
@@ -0,0 +1,68 @@
+---
+name: token-optimization
+description: Reduce billable token usage for one explicit Skillgym target. Covers baseline measurement with the token-usage reporter, minimal safe edits, verification loops, and when to fall back to artifacts.
+---
+
+# skillgym token-optimization
+
+Use this skill when the goal is to reduce billable token usage for one explicit target without breaking the benchmark.
+
+## Required input
+
+Start only when the optimization target is explicit.
+
+- valid targets: one prompt, one benchmark case, one suite slice, or one bundled skill/workflow file
+- if the target is missing, ask one short clarification question and wait
+
+## Optimization loop
+
+1. Identify the smallest protecting suite or case slice that proves the target still works.
+2. If none exists, create the smallest safe suite coverage first.
+3. Run a passing baseline with the compact reporter.
+4. Read the baseline JSON and note only comparable passed rows.
+5. Make the smallest safe edit to the explicit target.
+6. Re-run the same slice.
+7. Compare before and after billable totals only on passed comparable rows.
+8. Stop when you hit the budget, reduction goal, or iteration limit.
+
+## Main commands
+
+```bash
+skillgym run <suite.ts> --reporter token-usage
+skillgym run <suite.ts> --case <id> --reporter token-usage
+skillgym run <suite.ts> --runner <runner-id> --reporter token-usage
+```
+
+## Rules
+
+- require a passing baseline before editing
+- keep stdout parsing on the `token-usage` reporter only
+- do not create a second detailed token report; use the normal artifact directory for debugging
+- failed rows still matter for diagnosis, but never count lower token usage on a failed row as a reduction
+- derived or unavailable token rows are not comparable; treat `billable: null` as non-comparable
+- prefer one safe minimization pass plus one verification run by default
+- re-run after every change instead of batching edits
+- keep edits scoped to the named target; avoid unrelated cleanup
+
+## How to compare runs
+
+- compare `rows[*].billable` for passed rows only
+- use top-level `billable` only when the compared run covers the same comparable rows
+- if a row fails, inspect the listed `artifacts` path and the standard run output before deciding what changed
+- if the baseline does not pass, fix benchmark stability first instead of claiming a token win
+
+## Good targets
+
+- tighten one prompt that causes repeated tool churn
+- shorten one bundled skill section that the agent reads every run
+- remove redundant instructions from one stable workflow
+
+## Do not use this skill for
+
+- broad benchmark rewrites
+- multi-target refactors with unclear attribution
+- unstable suites that are still failing for functional reasons
+
+## After stabilization
+
+If the behavior is stable and you want regression protection, add or refresh snapshots after the optimization work. Snapshots are optional follow-up protection, not part of the optimization loop itself.
diff --git a/test/cli.test.ts b/test/cli.test.ts
index e938991..cee83cf 100644
--- a/test/cli.test.ts
+++ b/test/cli.test.ts
@@ -63,6 +63,7 @@ test("cli skills list prints bundled skill names", async () => {
   expect(result.stdout).toContain("core");
   expect(result.stdout).toContain("reporters");
   expect(result.stdout).toContain("snapshots");
+  expect(result.stdout).toContain("token-optimization");
   expect(result.stdout).toContain("cases");
   expect(result.stdout).toContain("workspaces");
 });
@@ -77,6 +78,16 @@ test("cli skills get core prints the bundled core skill", async () => {
   expect(result.stdout).toContain("skillgym run <suite.ts>");
 });
 
+test("cli skills get token-optimization prints the bundled optimization skill", async () => {
+  const result = await execCli(["skills", "get", "token-optimization"]);
+
+  expect(result.exitCode).toBe(0);
+  expect(result.stderr).toBe("");
+  expect(result.stdout).toContain("# skillgym token-optimization");
+  expect(result.stdout).toContain("skillgym run <suite.ts> --reporter token-usage");
+  expect(result.stdout).toContain("require a passing baseline before editing");
+});
+
 test("cli skills get reports missing skill name without printing MOTD banner", async () => {
   const result = await execCli(["skills", "get"]);