From e478ce2d687ad2b82ec60f17296945dcaa4f3ef9 Mon Sep 17 00:00:00 2001 From: Szymon Chmal Date: Tue, 19 May 2026 09:14:51 +0200 Subject: [PATCH 1/2] Add built-in token usage reporter --- src/cli.ts | 5 +- src/cli/help.ts | 2 +- src/index.ts | 1 + src/reporters/builtins.ts | 1 + src/reporters/index.ts | 1 + src/reporters/load-reporter.ts | 3 + src/reporters/token-usage.ts | 130 ++++++++ test/reporters/load-reporter.test.ts | 7 + test/reporters/token-usage.test.ts | 357 +++++++++++++++++++++ test/runner/execute-suite.reporter.test.ts | 2 + test/runner/model-rejection.test.ts | 1 + 11 files changed, 508 insertions(+), 2 deletions(-) create mode 100644 src/reporters/token-usage.ts create mode 100644 test/reporters/token-usage.test.ts diff --git a/src/cli.ts b/src/cli.ts index 6b88368..9e786cd 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -57,7 +57,10 @@ function createProgram(): Command { [], ) .option("--runner ", "Filter the configured runner set by runner id") - .option("--reporter ", "Use standard, json, github-actions, or override run.reporter") + .option( + "--reporter ", + "Use standard, json, json-summary, token-usage, github-actions, html, or override run.reporter", + ) .option("--snapshots ", "Override the configured snapshot file path") .option("--update-snapshots", "Refresh snapshot baselines for the selected executions"); diff --git a/src/cli/help.ts b/src/cli/help.ts index 773d327..59047d3 100644 --- a/src/cli/help.ts +++ b/src/cli/help.ts @@ -31,7 +31,7 @@ ${theme.bold("Run Options:")} --case ${theme.accent("")} Filter the configured suite to one case id --tag ${theme.accent("")} Filter cases by tag; repeat or comma-separate for OR matching --runner ${theme.accent("")} Filter the configured runner set by runner id - --reporter ${theme.accent("")} Use ${theme.light("standard")}, ${theme.light("json")}, ${theme.light("github-actions")}, or override run.reporter + --reporter ${theme.accent("")} Use ${theme.light("standard")}, ${theme.light("json")}, 
${theme.light("json-summary")}, ${theme.light("token-usage")}, ${theme.light("github-actions")}, ${theme.light("html")}, or override run.reporter --snapshots ${theme.accent("")} Override the configured snapshot file path --update-snapshots Refresh snapshot baselines for the selected executions diff --git a/src/index.ts b/src/index.ts index 29df1a3..b16e2d4 100644 --- a/src/index.ts +++ b/src/index.ts @@ -47,6 +47,7 @@ export { createGitHubActionsReporter, createHtmlReporter, createJsonReporter, + createTokenUsageReporter, createStandardReporter, loadReporter, } from "./reporters/index.js"; diff --git a/src/reporters/builtins.ts b/src/reporters/builtins.ts index aa2d585..b5d29e6 100644 --- a/src/reporters/builtins.ts +++ b/src/reporters/builtins.ts @@ -2,6 +2,7 @@ export const BUILT_IN_REPORTER_NAMES = [ "standard", "json", "json-summary", + "token-usage", "github-actions", "html", ] as const; diff --git a/src/reporters/index.ts b/src/reporters/index.ts index eff8f07..bafbbfa 100644 --- a/src/reporters/index.ts +++ b/src/reporters/index.ts @@ -16,3 +16,4 @@ export { createJsonReporter } from "./json.js"; export { createJsonSummaryReporter } from "./json-summary.js"; export { loadReporter } from "./load-reporter.js"; export { createStandardReporter } from "./standard.js"; +export { createTokenUsageReporter } from "./token-usage.js"; diff --git a/src/reporters/load-reporter.ts b/src/reporters/load-reporter.ts index b94c140..f86c85c 100644 --- a/src/reporters/load-reporter.ts +++ b/src/reporters/load-reporter.ts @@ -5,6 +5,7 @@ import { createHtmlReporter } from "./html.js"; import { createJsonReporter } from "./json.js"; import { createJsonSummaryReporter } from "./json-summary.js"; import { createStandardReporter } from "./standard.js"; +import { createTokenUsageReporter } from "./token-usage.js"; import { isBuiltInReporter } from "./builtins.js"; import { importFromPath } from "../utils/import.js"; @@ -39,6 +40,8 @@ export async function loadReporter( return 
createJsonReporter(); case "json-summary": return createJsonSummaryReporter(); + case "token-usage": + return createTokenUsageReporter(); case "github-actions": return createGitHubActionsReporter(); case "html": diff --git a/src/reporters/token-usage.ts b/src/reporters/token-usage.ts new file mode 100644 index 0000000..6915f98 --- /dev/null +++ b/src/reporters/token-usage.ts @@ -0,0 +1,130 @@ +import process from "node:process"; +import type { + FailureClass, + RunnerResult, + RunnerResultStatus, + SuiteRunResult, +} from "../domain/result.js"; +import type { UsageReport } from "../domain/session-report.js"; +import type { BenchmarkReporter } from "./contract.js"; + +interface TokenUsageReporterOptions { + stdout?: Pick; +} + +interface TokenBillable { + sum: number; + avg: number; +} + +type TokenUsageKind = "provider" | "derived" | "unavailable"; + +interface TokenUsageError { + name: string; + message: string; +} + +interface TokenUsageRow { + case: string; + runner: string; + passed: boolean; + status: RunnerResultStatus; + usage: TokenUsageKind; + billable: TokenBillable | null; + failureOrigin?: RunnerResult["failureOrigin"]; + failureClass?: FailureClass; + error?: TokenUsageError; +} + +interface TokenUsageSummary { + passed: boolean; + billable: TokenBillable | null; + artifacts: string; + rows: TokenUsageRow[]; +} + +export function createTokenUsageReporter( + options: TokenUsageReporterOptions = {}, +): BenchmarkReporter { + const stdout = options.stdout ?? 
process.stdout; + + return { + onSuiteFinish(event) { + stdout.write(`${JSON.stringify(summarizeSuiteRun(event.result))}\n`); + }, + }; +} + +function summarizeSuiteRun(result: SuiteRunResult): TokenUsageSummary { + const rows = result.cases.flatMap((caseResult) => + caseResult.runnerResults.map((runnerResult) => summarizeRow(caseResult.caseId, runnerResult)), + ); + const billableRows = rows.filter( + (row): row is TokenUsageRow & { billable: TokenBillable } => row.billable !== null, + ); + + return { + passed: result.cases.every((caseResult) => caseResult.passed), + billable: + billableRows.length === 0 + ? null + : { + sum: billableRows.reduce((sum, row) => sum + row.billable.sum, 0), + avg: billableRows.reduce((sum, row) => sum + row.billable.avg, 0) / billableRows.length, + }, + artifacts: result.suiteRunArtifactDir, + rows, + }; +} + +function summarizeRow(caseId: string, result: RunnerResult): TokenUsageRow { + const usage = classifyUsage(result.report.usage); + const passed = result.status === "passed"; + + return { + case: caseId, + runner: result.runner.id, + passed, + status: result.status, + usage, + billable: passed && usage === "provider" ? createBillable(result) : null, + ...(result.failureOrigin === undefined ? {} : { failureOrigin: result.failureOrigin }), + ...(result.failureClass === undefined ? {} : { failureClass: result.failureClass }), + ...(result.error === undefined + ? 
{} + : { + error: { + name: result.error.name, + message: result.error.message, + }, + }), + }; +} + +function classifyUsage(usage: UsageReport): TokenUsageKind { + if (usage.totalTokens === undefined) { + return "unavailable"; + } + + if (usage.source.input === "provider" && usage.source.output === "provider") { + return "provider"; + } + + return "derived"; +} + +function createBillable(result: RunnerResult): TokenBillable { + const avg = result.report.usage.totalTokens; + + if (avg === undefined) { + throw new Error(`Missing billable token usage for runner ${result.runner.id}.`); + } + + const repetitions = + result.repetitions === undefined ? 1 : Math.max(result.successfulRepetitions ?? 0, 1); + + return { + sum: avg * repetitions, + avg, + }; +} diff --git a/test/reporters/load-reporter.test.ts b/test/reporters/load-reporter.test.ts index 22cc74d..9404639 100644 --- a/test/reporters/load-reporter.test.ts +++ b/test/reporters/load-reporter.test.ts @@ -42,6 +42,13 @@ describe("loadReporter", () => { expect(reporter.onRunnerFinish).toBeUndefined(); }); + test("resolves built-in reporter when token-usage is provided", async () => { + const reporter = await loadReporter("token-usage", tempDir); + + expect(typeof reporter.onSuiteFinish).toBe("function"); + expect(reporter.onRunnerStart).toBeUndefined(); + }); + test("resolves built-in reporter when html is provided", async () => { const reporter = await loadReporter("html", tempDir); diff --git a/test/reporters/token-usage.test.ts b/test/reporters/token-usage.test.ts new file mode 100644 index 0000000..519a66c --- /dev/null +++ b/test/reporters/token-usage.test.ts @@ -0,0 +1,357 @@ +import { expect, test } from "vitest"; +import type { RunnerResult, SuiteRunResult } from "../../src/index.js"; +import { createTokenUsageReporter } from "../../src/reporters/token-usage.js"; +import { createRunnerInfo } from "../../src/runner/runner-info.js"; +import { createSessionReport } from "../helpers/session-report.js"; + 
+test("token-usage reporter summarizes comparable billable rows as strict JSON", async () => { + const writes: string[] = []; + const reporter = createTokenUsageReporter({ + stdout: { + write(chunk: string) { + writes.push(chunk); + return true; + }, + }, + }); + + const mainRunner = createRunnerInfo("open-main", { type: "opencode", model: "openai/gpt-5" }); + const fallbackRunner = createRunnerInfo("open-fallback", { + type: "opencode", + model: "openai/gpt-5-mini", + }); + const result: SuiteRunResult = { + suitePath: "examples/basic-suite.ts", + startedAt: "2026-04-02T12:00:00.000Z", + endedAt: "2026-04-02T12:01:00.000Z", + durationMs: 60_000, + suiteRunArtifactDir: ".skillgym-results/run-1", + declaredTags: [], + selectedTags: [], + cases: [ + { + caseId: "case-a", + tags: [], + passed: true, + runnerResults: [ + createRunnerResult(mainRunner, { + status: "passed", + usage: { + inputTokens: 1000, + outputTokens: 150, + reasoningTokens: 50, + cacheTokens: 0, + totalTokens: 1200, + source: { input: "provider", output: "provider", reasoning: "provider" }, + }, + }), + createRunnerResult(fallbackRunner, { + status: "passed", + successfulRepetitions: 2, + repetitions: [ + createRepetitionResult(fallbackRunner, 1, { + inputTokens: 1700, + outputTokens: 250, + reasoningTokens: 150, + cacheTokens: 0, + totalTokens: 2100, + source: { input: "provider", output: "provider", reasoning: "provider" }, + }), + createRepetitionResult(fallbackRunner, 2, { + inputTokens: 1700, + outputTokens: 250, + reasoningTokens: 150, + cacheTokens: 0, + totalTokens: 2100, + source: { input: "provider", output: "provider", reasoning: "provider" }, + }), + ], + usage: { + inputTokens: 1700, + outputTokens: 250, + reasoningTokens: 150, + cacheTokens: 0, + totalTokens: 2100, + source: { input: "provider", output: "provider", reasoning: "provider" }, + }, + }), + ], + }, + { + caseId: "case-b", + tags: [], + passed: false, + runnerResults: [ + createRunnerResult(mainRunner, { + status: "failed", 
+ failureOrigin: "assertion", + failureClass: { id: "missing-rule", label: "Missing rule" }, + error: { name: "AssertionError", message: "expected prompt to keep critical rule" }, + usage: { + inputTokens: 900, + outputTokens: 200, + reasoningTokens: 100, + cacheTokens: 0, + totalTokens: 1200, + source: { input: "provider", output: "provider", reasoning: "provider" }, + }, + }), + createRunnerResult(fallbackRunner, { + status: "passed", + usage: { + inputTokens: 600, + outputTokens: 150, + reasoningTokens: 40, + cacheTokens: 0, + totalTokens: 790, + source: { input: "derived", output: "derived", reasoning: "derived" }, + }, + }), + ], + }, + { + caseId: "case-c", + tags: [], + passed: true, + runnerResults: [ + createRunnerResult(mainRunner, { + status: "passed", + usage: { + inputTokens: undefined, + outputTokens: undefined, + reasoningTokens: undefined, + cacheTokens: undefined, + totalTokens: undefined, + source: { input: "chars", output: "chars", reasoning: "chars" }, + }, + }), + ], + }, + ], + runners: [ + { + runner: mainRunner, + totalCases: 3, + passedCases: 2, + successRate: 2 / 3, + averageDurationMs: 1000, + averageTotalTokens: 1200, + }, + { + runner: fallbackRunner, + totalCases: 2, + passedCases: 2, + successRate: 1, + averageDurationMs: 1000, + averageTotalTokens: 1445, + }, + ], + }; + + const context = { + isInteractive: false, + cwd: "/workspace", + workspaceMode: "shared" as const, + suitePath: result.suitePath, + suiteRunArtifactDir: result.suiteRunArtifactDir, + selectedCaseCount: 3, + selectedRunnerCount: 2, + selectedExecutionCount: 5, + scheduleMode: "serial" as const, + maxParallel: 1, + declaredTags: [], + }; + + await reporter.onSuiteFinish?.({ context, result }); + + expect(writes).toHaveLength(1); + expect(() => JSON.parse(writes[0]!)).not.toThrow(); + + const output = JSON.parse(writes[0]!) 
as { + passed: boolean; + billable: { sum: number; avg: number } | null; + artifacts: string; + rows: Array<{ + case: string; + runner: string; + passed: boolean; + status: string; + usage: string; + billable: { sum: number; avg: number } | null; + failureOrigin?: string; + failureClass?: { id: string; label?: string }; + error?: { name: string; message: string }; + artifactDir?: string; + }>; + }; + + expect(output.passed).toBe(false); + expect(output.billable).toEqual({ sum: 5400, avg: 1650 }); + expect(output.artifacts).toBe(".skillgym-results/run-1"); + expect(output.rows).toEqual([ + { + case: "case-a", + runner: "open-main", + passed: true, + status: "passed", + usage: "provider", + billable: { sum: 1200, avg: 1200 }, + }, + { + case: "case-a", + runner: "open-fallback", + passed: true, + status: "passed", + usage: "provider", + billable: { sum: 4200, avg: 2100 }, + }, + { + case: "case-b", + runner: "open-main", + passed: false, + status: "failed", + usage: "provider", + billable: null, + failureOrigin: "assertion", + failureClass: { id: "missing-rule", label: "Missing rule" }, + error: { name: "AssertionError", message: "expected prompt to keep critical rule" }, + }, + { + case: "case-b", + runner: "open-fallback", + passed: true, + status: "passed", + usage: "derived", + billable: null, + }, + { + case: "case-c", + runner: "open-main", + passed: true, + status: "passed", + usage: "unavailable", + billable: null, + }, + ]); + + for (const row of output.rows) { + expect(row.artifactDir).toBeUndefined(); + } +}); + +test("token-usage reporter is silent until suite finishes", async () => { + const writes: string[] = []; + const reporter = createTokenUsageReporter({ + stdout: { + write(chunk: string) { + writes.push(chunk); + return true; + }, + }, + }); + + const runner = createRunnerInfo("open-main", { type: "opencode", model: "openai/gpt-5" }); + const context = { + isInteractive: false, + cwd: "/workspace", + workspaceMode: "shared" as const, + suitePath: 
"suite.ts", + suiteRunArtifactDir: ".skillgym-results/run-1", + selectedCaseCount: 1, + selectedRunnerCount: 1, + selectedExecutionCount: 1, + scheduleMode: "serial" as const, + maxParallel: 1, + declaredTags: [], + }; + + await reporter.onSuiteStart?.({ + context, + cases: [], + runners: [runner], + startedAt: "2026-04-02T12:00:00.000Z", + }); + await reporter.onRunnerStart?.({ + context, + case: { id: "case-a", prompt: "", assert() {} }, + runner, + caseIndex: 1, + totalCases: 1, + }); + + expect(writes).toHaveLength(0); +}); + +function createRunnerResult( + runner: ReturnType, + options: { + status: RunnerResult["status"]; + usage: { + inputTokens?: number; + outputTokens?: number; + reasoningTokens?: number; + cacheTokens?: number; + totalTokens?: number; + source: { + input: "provider" | "derived" | "chars"; + output: "provider" | "derived" | "chars"; + reasoning: "provider" | "derived" | "chars"; + }; + }; + successfulRepetitions?: number; + repetitions?: RunnerResult["repetitions"]; + error?: RunnerResult["error"]; + failureOrigin?: RunnerResult["failureOrigin"]; + failureClass?: RunnerResult["failureClass"]; + }, +): RunnerResult { + return { + runner, + passed: options.status === "passed", + status: options.status, + durationMs: 1000, + executionArtifactDir: ".skillgym-results/run-1/execution", + artifactDir: ".skillgym-results/run-1/execution", + report: createSessionReport({ + runner, + usage: { + inputTokens: options.usage.inputTokens, + outputTokens: options.usage.outputTokens, + reasoningTokens: options.usage.reasoningTokens, + cacheTokens: options.usage.cacheTokens, + totalTokens: options.usage.totalTokens, + inputChars: 100, + outputChars: 40, + reasoningChars: 20, + source: options.usage.source, + }, + }), + successfulRepetitions: options.successfulRepetitions, + repetitions: options.repetitions, + error: options.error, + failureOrigin: options.failureOrigin, + failureClass: options.failureClass, + }; +} + +function createRepetitionResult( + 
runner: ReturnType, + repetition: number, + usage: { + inputTokens?: number; + outputTokens?: number; + reasoningTokens?: number; + cacheTokens?: number; + totalTokens?: number; + source: { + input: "provider" | "derived" | "chars"; + output: "provider" | "derived" | "chars"; + reasoning: "provider" | "derived" | "chars"; + }; + }, +): NonNullable[number] { + return { + ...createRunnerResult(runner, { status: "passed", usage }), + repetition, + }; +} diff --git a/test/runner/execute-suite.reporter.test.ts b/test/runner/execute-suite.reporter.test.ts index 79eb23e..dffdd32 100644 --- a/test/runner/execute-suite.reporter.test.ts +++ b/test/runner/execute-suite.reporter.test.ts @@ -1483,6 +1483,8 @@ test("executeSuite raises process max listeners for parallel runs and restores i passed: true, durationMs: 10, executionArtifactDir: options.artifactDir, + outputTokens: 0, + observedReads: 0, }); }, }); diff --git a/test/runner/model-rejection.test.ts b/test/runner/model-rejection.test.ts index d3c6200..7371896 100644 --- a/test/runner/model-rejection.test.ts +++ b/test/runner/model-rejection.test.ts @@ -102,6 +102,7 @@ async function createResultWithLogs(options: { passed: false, status: "failed", durationMs: 1, + executionArtifactDir: artifactDir, artifactDir, error: { name: "Error", From 5865536ef26de26f47a4e5036ef631c2346a4a85 Mon Sep 17 00:00:00 2001 From: Szymon Chmal Date: Tue, 19 May 2026 09:16:27 +0200 Subject: [PATCH 2/2] Document token optimization workflow --- DICTIONARY.md | 2 ++ docs/reporters.md | 15 +++++++- skills/core.md | 3 ++ skills/reporters.md | 8 +++++ skills/token-optimization.md | 68 ++++++++++++++++++++++++++++++++++++ test/cli.test.ts | 11 ++++++ 6 files changed, 106 insertions(+), 1 deletion(-) create mode 100644 skills/token-optimization.md diff --git a/DICTIONARY.md b/DICTIONARY.md index af5c711..43973be 100644 --- a/DICTIONARY.md +++ b/DICTIONARY.md @@ -36,6 +36,8 @@ Before adding or changing a term, check this file and ask the user for 
approval. - `workspace bootstrap`: Command run in a provisioned workspace before the agent starts. - `schedule`: Execution ordering and concurrency policy. - `reporter`: Component rendering suite-run progress and results. +- `token-usage`: Built-in reporter that emits compact JSON billable-token summaries for agent consumption. +- `token-optimization`: Bundled skill that guides an agent through baseline, minimal edits, and before/after token comparison. - `skill detection`: Evidence that a skill was used, with confidence and evidence. - `session event`: Normalized event observed during a session. - `snapshot baseline`: Stored token baseline for a benchmark case x runner pair. diff --git a/docs/reporters.md b/docs/reporters.md index fa05668..b099aec 100644 --- a/docs/reporters.md +++ b/docs/reporters.md @@ -10,6 +10,7 @@ Execution, aggregation, and `results.json` writing stay in the runner. Reporters skillgym run --reporter standard skillgym run --reporter json skillgym run --reporter json-summary +skillgym run --reporter token-usage skillgym run --reporter github-actions skillgym run --reporter html skillgym run --reporter ./examples/custom-reporter.ts @@ -17,7 +18,7 @@ skillgym run --schedule isolated-by-runner --max-parallel 4 ``` - Omitting `--reporter` uses the built-in `standard` reporter. -- Built-in reporters are `standard`, `json`, `json-summary`, `github-actions`, and `html`. +- Built-in reporters are `standard`, `json`, `json-summary`, `token-usage`, `github-actions`, and `html`. - Relative paths resolve from `process.cwd()`. ## Config @@ -138,6 +139,18 @@ The built-in `json-summary` reporter writes a trimmed JSON summary to stdout — - Per-runner results include `failureClass` when present so downstream tooling can keep grouped-failure semantics. - It is useful for post-run analysis steps or feeding results to an LLM. +## Token usage reporter + +The built-in `token-usage` reporter writes one compact JSON object to stdout at `onSuiteFinish`. 
+ +- It emits top-level `passed`, `billable`, `artifacts`, and `rows` fields. +- Each row represents one `case x runner` result. +- Passed rows with provider-backed normalized totals expose `billable: { sum, avg }`, where `avg` is the row's normalized total tokens and `sum` is `avg` multiplied by the number of successful repetitions (1 when the row was not repeated). +- Failed rows and rows with derived or unavailable totals stay in the output with `billable: null`. +- Top-level `billable` aggregates only passed comparable rows, so failed or non-comparable rows do not distort optimization comparisons. +- No extra token-report artifact is written; use the listed artifact directory and the normal suite-run artifacts for debugging. +- It is intended for agent loops such as prompt or skill minimization where stdout must stay strict and machine-readable. + ## HTML reporter The built-in `html` reporter writes a self-contained `report.html` file to the suite run artifact directory. diff --git a/skills/core.md b/skills/core.md index 1d98660..5675dbf 100644 --- a/skills/core.md +++ b/skills/core.md @@ -16,6 +16,7 @@ skillgym skills get core skillgym run skillgym run --case skillgym run --runner +skillgym skills get token-optimization ``` Typical agent loop: @@ -85,6 +86,8 @@ Read the focused skills only when the task needs them: Use when benchmarking token regressions or updating snapshot baselines. - `skillgym skills get reporters` Use when choosing built-in reporters or wiring a custom reporter. +- `skillgym skills get token-optimization` + Use when reducing billable token usage for one explicit prompt, suite, or skill target without broad refactors. 
## Suggested authoring order diff --git a/skills/reporters.md b/skills/reporters.md index 8a1c34e..793bbf1 100644 --- a/skills/reporters.md +++ b/skills/reporters.md @@ -12,7 +12,9 @@ Use this skill when choosing how benchmark results should be rendered or consume - `standard` - `json` - `json-summary` +- `token-usage` - `github-actions` +- `html` ## Main commands @@ -20,7 +22,9 @@ Use this skill when choosing how benchmark results should be rendered or consume skillgym run --reporter standard skillgym run --reporter json skillgym run --reporter json-summary +skillgym run --reporter token-usage skillgym run --reporter github-actions +skillgym run --reporter html skillgym run --reporter ./path/to/custom-reporter.ts ``` @@ -35,7 +39,9 @@ skillgym run --reporter ./path/to/custom-reporter.ts - `standard`: default interactive CLI output for humans - `json`: full aggregated result on stdout for machine consumers - `json-summary`: trimmed result for post-processing or LLM consumption +- `token-usage`: compact JSON billable summary for optimization loops and other agent consumers - `github-actions`: CI annotations and job summary output +- `html`: self-contained artifact for manual result review ## Custom reporter shape @@ -62,3 +68,5 @@ export default reporter; - `onError` Use `json-summary` when another agent or tool needs a smaller result than the full session report. + +Use `token-usage` when an agent needs strict compact JSON with one row per `case x runner`, comparable `billable` totals for provider-backed passed rows, and artifact paths for deeper debugging when a row fails. diff --git a/skills/token-optimization.md b/skills/token-optimization.md new file mode 100644 index 0000000..ba3219e --- /dev/null +++ b/skills/token-optimization.md @@ -0,0 +1,68 @@ +--- +name: token-optimization +description: Reduce billable token usage for one explicit Skillgym target. 
Covers baseline measurement with the token-usage reporter, minimal safe edits, verification loops, and when to fall back to artifacts. +--- + +# skillgym token-optimization + +Use this skill when the goal is to reduce billable token usage for one explicit target without breaking the benchmark. + +## Required input + +Start only when the optimization target is explicit. + +- valid targets: one prompt, one benchmark case, one suite slice, or one bundled skill/workflow file +- if the target is missing, ask one short clarification question and wait + +## Optimization loop + +1. Identify the smallest protecting suite or case slice that proves the target still works. +2. If none exists, create the smallest safe suite coverage first. +3. Run a passing baseline with the compact reporter. +4. Read the baseline JSON and note only comparable passed rows. +5. Make the smallest safe metadata edit to the explicit target. +6. Re-run the same slice. +7. Compare before and after billable totals only on passed comparable rows. +8. Stop when you hit the budget, reduction goal, or iteration limit. 
+ +## Main commands + +```bash +skillgym run --reporter token-usage +skillgym run --case --reporter token-usage +skillgym run --runner --reporter token-usage +``` + +## Rules + +- require a passing baseline before editing +- keep stdout parsing on the `token-usage` reporter only +- do not create a second detailed token report; use the normal artifact directory for debugging +- failed rows still matter for diagnosis, but do not count lower token usage on failed rows as a token win +- derived or unavailable token rows are not comparable; treat `billable: null` as non-comparable +- prefer one safe minimization pass plus one verification run by default +- re-run after every change instead of batching edits +- keep edits scoped to the named target; avoid unrelated cleanup + +## How to compare runs + +- compare `rows[*].billable` for passed rows only +- use top-level `billable` only when the compared run covers the same comparable rows +- if a row fails, inspect the listed `artifacts` path and the standard run output before deciding what changed +- if the baseline does not pass, fix benchmark stability first instead of claiming a token win + +## Good targets + +- tighten one prompt that causes repeated tool churn +- shorten one bundled skill section that the agent reads every run +- remove redundant instructions from one stable workflow + +## Do not use this skill for + +- broad benchmark rewrites +- multi-target refactors with unclear attribution +- unstable suites that are still failing for functional reasons + +## After stabilization + +If the behavior is stable and you want regression protection, add or refresh snapshots after the optimization work. Snapshots are optional follow-up protection, not part of the optimization loop itself. 
diff --git a/test/cli.test.ts b/test/cli.test.ts index e938991..cee83cf 100644 --- a/test/cli.test.ts +++ b/test/cli.test.ts @@ -63,6 +63,7 @@ test("cli skills list prints bundled skill names", async () => { expect(result.stdout).toContain("core"); expect(result.stdout).toContain("reporters"); expect(result.stdout).toContain("snapshots"); + expect(result.stdout).toContain("token-optimization"); expect(result.stdout).toContain("cases"); expect(result.stdout).toContain("workspaces"); }); @@ -77,6 +78,16 @@ test("cli skills get core prints the bundled core skill", async () => { expect(result.stdout).toContain("skillgym run "); }); +test("cli skills get token-optimization prints the bundled optimization skill", async () => { + const result = await execCli(["skills", "get", "token-optimization"]); + + expect(result.exitCode).toBe(0); + expect(result.stderr).toBe(""); + expect(result.stdout).toContain("# skillgym token-optimization"); + expect(result.stdout).toContain("skillgym run --reporter token-usage"); + expect(result.stdout).toContain("require a passing baseline before editing"); +}); + test("cli skills get reports missing skill name without printing MOTD banner", async () => { const result = await execCli(["skills", "get"]);