From 425549291b18bb7185e8a9bde0ae8c4743502691 Mon Sep 17 00:00:00 2001 From: kimizuka Date: Sun, 21 Jun 2026 02:21:27 +0000 Subject: [PATCH 1/4] feat(hooks): expose per-turn usage and cost in after_llm_call Forward the per-call token usage and computed USD cost to the after_llm_call hook payload so sidecar cost ledgers can record per-call spend from the payload alone, without subscribing to the runtime event channel. Cost is a *float64 so the wire contract can distinguish an unpriced model (nil, key absent) from a priced free call (pointer to 0). The per-turn cost is computed once in computeMessageCost and threaded into both the hook payload and the recorded assistant message, so the two can never disagree. For harness agents the cost is surfaced only when the harness reported a non-zero value, avoiding reporting a billed turn as free when a harness omits its cost (e.g. codex). Signed-off-by: kimizuka --- pkg/hooks/types.go | 36 ++++++++++++++++++ pkg/runtime/cost_test.go | 57 ++--------------------------- pkg/runtime/harness.go | 15 +++++++- pkg/runtime/hooks.go | 13 +++++-- pkg/runtime/loop.go | 79 +++++++++++++++++++++++++--------------- 5 files changed, 113 insertions(+), 87 deletions(-) diff --git a/pkg/hooks/types.go b/pkg/hooks/types.go index 54d2d5e36..02ab8b22a 100644 --- a/pkg/hooks/types.go +++ b/pkg/hooks/types.go @@ -90,6 +90,12 @@ const ( EventBeforeLLMCall EventType = "before_llm_call" // EventAfterLLMCall fires immediately after a successful model call, // before the response is recorded. Failed calls fire EventOnError. + // The Input carries the response text in [Input.StopResponse] + // (matching the stop event), the model that produced it in + // [Input.ModelID], and per-turn billing data in [Input.Usage] and + // [Input.Cost] so sidecar cost ledgers can record per-call spend + // from the payload alone, without subscribing to the runtime event + // channel. 
EventAfterLLMCall EventType = "after_llm_call" // EventSessionEnd fires when a session terminates. EventSessionEnd EventType = "session_end" @@ -319,6 +325,36 @@ type Input struct { ApprovalDecision string `json:"approval_decision,omitempty"` ApprovalSource string `json:"approval_source,omitempty"` + // AfterLLMCall specific: per-turn token usage and the computed USD + // cost of the model response the runtime just received. Both are + // populated only for [EventAfterLLMCall] and are nil for every + // other event. They are the hook-side counterpart of the runtime's + // internal TokenUsageEvent and let sidecar cost ledgers record + // per-call spend from the payload alone. + // + // Usage is a pointer so a handler can distinguish "the provider + // reported no usage" (nil) from "usage was zero". + // + // Cost is a *float64 with three meaningful states, mirroring the + // runtime's own pricing gate (usage present AND a model definition + // with a pricing table): + // - nil → unpriced: the model has no pricing data on file + // (unknown model ID, custom endpoint without cost + // config) or the provider reported no usage. With + // omitempty the "cost" key is absent on the wire. + // - 0 → a priced model whose computed cost is genuinely zero + // (a free call). Emitted as "cost": 0, NOT elided — + // omitempty on a pointer drops only nil, never a + // non-nil pointer to the zero value. + // - non-0 → the priced USD cost of this single response. + // A handler therefore reads a present "cost" as authoritative and + // an absent one as "unpriced", with no need to cross-check usage. + // (This is deliberately a *float64, unlike [chat.Message.Cost], + // which is a plain float64 with omitempty and so cannot distinguish + // a free priced call from an unpriced one on the wire.) + Usage *chat.Usage `json:"usage,omitempty"` + Cost *float64 `json:"cost,omitempty"` + // Compaction fields (BeforeCompaction, AfterCompaction). 
InputTokens int64 `json:"input_tokens,omitempty"` OutputTokens int64 `json:"output_tokens,omitempty"` diff --git a/pkg/runtime/cost_test.go b/pkg/runtime/cost_test.go index 134e48241..41fb10819 100644 --- a/pkg/runtime/cost_test.go +++ b/pkg/runtime/cost_test.go @@ -6,61 +6,12 @@ import ( "github.com/stretchr/testify/assert" "github.com/docker/docker-agent/pkg/chat" - "github.com/docker/docker-agent/pkg/modelsdev" ) -// TestComputeMessageCost_UncataloguedModelIsUnpriced is the regression test for -// the silent "$0 cost despite token usage" leak: when a model is absent from -// the pricing catalogue (m == nil) or carries no price table (m.Cost == nil), -// the per-message cost is 0 even though real tokens were spent. Before the fix -// the caller could not tell this apart from a genuinely free turn, so a spend -// guardrail built on the cost would never trip. computeMessageCost now reports -// priced=false for exactly these cases. -func TestComputeMessageCost_UncataloguedModelIsUnpriced(t *testing.T) { - usage := &chat.Usage{InputTokens: 1000, OutputTokens: 500} - - t.Run("model missing from catalogue", func(t *testing.T) { - cost, priced := computeMessageCost(usage, nil) - assert.Zero(t, cost) - assert.False(t, priced, "an uncatalogued model must be reported as unpriced") - }) - - t.Run("model present but no price table", func(t *testing.T) { - cost, priced := computeMessageCost(usage, &modelsdev.Model{}) - assert.Zero(t, cost) - assert.False(t, priced, "a model with no Cost table must be reported as unpriced") - }) - - t.Run("nil usage", func(t *testing.T) { - cost, priced := computeMessageCost(nil, &modelsdev.Model{Cost: &modelsdev.Cost{Input: 1}}) - assert.Zero(t, cost) - assert.False(t, priced) - }) -} - -// TestComputeMessageCost_PricedModel verifies the cost arithmetic is unchanged -// from the original inline formula and that a catalogued model reports priced. 
-func TestComputeMessageCost_PricedModel(t *testing.T) { - usage := &chat.Usage{ - InputTokens: 1_000_000, - OutputTokens: 2_000_000, - CachedInputTokens: 3_000_000, - CacheWriteTokens: 4_000_000, - } - m := &modelsdev.Model{Cost: &modelsdev.Cost{ - Input: 1.0, - Output: 2.0, - CacheRead: 0.5, - CacheWrite: 3.0, - }} - - cost, priced := computeMessageCost(usage, m) - - assert.True(t, priced, "a catalogued model with a price table must be reported as priced") - // (1e6*1 + 2e6*2 + 3e6*0.5 + 4e6*3) / 1e6 = 1 + 4 + 1.5 + 12 = 18.5 - assert.InDelta(t, 18.5, cost, 1e-9) -} - +// TestUsageHasTokens covers the helper that suppresses the missing-price +// warning for empty/no-op turns. The per-message cost arithmetic and its +// nil/unpriced branches are exercised by TestComputeMessageCost in +// after_llm_call_test.go, which shares the same computeMessageCost source. func TestUsageHasTokens(t *testing.T) { assert.False(t, usageHasTokens(nil), "nil usage has no tokens") assert.False(t, usageHasTokens(&chat.Usage{}), "zero usage has no tokens") diff --git a/pkg/runtime/harness.go b/pkg/runtime/harness.go index c48b380f8..f2de19cb0 100644 --- a/pkg/runtime/harness.go +++ b/pkg/runtime/harness.go @@ -189,7 +189,20 @@ func (r *LocalRuntime) runHarnessAgent(ctx context.Context, sess *session.Sessio content = strings.TrimSpace(finalResult) } - r.executeAfterLLMCallHooks(ctx, sess, a, modelID, content) + // A harness reports its own TotalCostUSD, which the harness + // library defaults to 0 whenever the harness output omits a cost + // (e.g. the codex harness never reports one). That 0 is + // indistinguishable from a genuinely free call, so — to avoid + // telling a cost ledger that a billed turn was free — surface cost + // only when the harness reported a non-zero value and leave it nil + // (unpriced) otherwise. This keeps the wire contract honest: a + // present cost is always a real reported figure. 
+ var hookCost *float64 + if cost != 0 { + c := cost + hookCost = &c + } + r.executeAfterLLMCallHooks(ctx, sess, a, modelID, content, usage, hookCost) r.recordHarnessAssistantMessage(sess, a, content, modelID, usage, cost, events) r.executeStopHooks(ctx, sess, a, content, events) diff --git a/pkg/runtime/hooks.go b/pkg/runtime/hooks.go index 4b9b02455..dc53da57a 100644 --- a/pkg/runtime/hooks.go +++ b/pkg/runtime/hooks.go @@ -453,15 +453,22 @@ func (r *LocalRuntime) executeBeforeLLMCallHooks( // model call, before the response is recorded into the session and // tool calls are dispatched. The assistant text content is passed via // stop_response (matching the stop event), so handlers can reuse the -// same parsing logic. Failed model calls fire on_error instead and -// skip this event. -func (r *LocalRuntime) executeAfterLLMCallHooks(ctx context.Context, sess *session.Session, a *agent.Agent, modelID, responseContent string) { +// same parsing logic. The per-turn token usage and computed USD cost +// are forwarded via [hooks.Input.Usage] and [hooks.Input.Cost] so +// sidecar cost ledgers can record per-call spend from the payload +// alone. cost is a *float64 so an unpriced model (nil) is distinct on +// the wire from a priced free call (a pointer to 0); the caller owns +// that distinction. Failed model calls fire on_error instead and skip +// this event. 
+func (r *LocalRuntime) executeAfterLLMCallHooks(ctx context.Context, sess *session.Session, a *agent.Agent, modelID, responseContent string, usage *chat.Usage, cost *float64) { r.dispatchHook(ctx, a, hooks.EventAfterLLMCall, &hooks.Input{ SessionID: sess.ID, AgentName: a.Name(), ModelID: modelID, StopResponse: responseContent, LastUserMessage: sess.GetLastUserMessageContent(), + Usage: usage, + Cost: cost, }, nil) } diff --git a/pkg/runtime/loop.go b/pkg/runtime/loop.go index e8ab1fa06..9281f1f83 100644 --- a/pkg/runtime/loop.go +++ b/pkg/runtime/loop.go @@ -412,8 +412,8 @@ func (r *LocalRuntime) runStreamLoop(ctx context.Context, sess *session.Session, // the actual inference context), then falls back to the models.dev // catalogue. The lookup above is reused inside resolveContextLimit // only when context_size isn't supplied; we keep the explicit call - // here because m is also threaded into [recordAssistantMessage] for - // per-message cost computation. + // here because m is also passed to [computeMessageCost] for + // per-turn cost computation. contextLimit := r.resolveContextLimit(ctx, model, modelID) if contextLimit > 0 && r.sessionCompaction && compaction.ShouldCompact(sess.InputTokens, sess.OutputTokens, 0, contextLimit) { r.compactWithReason(ctx, sess, "", compactionReasonThreshold, sink) @@ -611,11 +611,20 @@ func (r *LocalRuntime) runTurn( // A successful model call resets the overflow compaction counter. ls.overflowCompactions = 0 + // Compute the per-turn cost once, here, so the exact same value + // reaches both the after_llm_call hook payload and the recorded + // assistant message — the hook's cost is therefore guaranteed to + // equal the cost the session bills for this turn. It is nil when + // the turn cannot be priced (no usage, or a model with no pricing + // table); see computeMessageCost. + msgCost := computeMessageCost(res.Usage, m) + // after_llm_call hooks fire on success only; failed calls // fire on_error above. 
The assistant text content is passed // via stop_response, matching the stop event's payload, so - // handlers can reuse the same parsing. - r.executeAfterLLMCallHooks(ctx, sess, a, modelID.String(), res.Content) + // handlers can reuse the same parsing. Usage and Cost carry the + // per-turn billing data for sidecar cost ledgers. + r.executeAfterLLMCallHooks(ctx, sess, a, modelID.String(), res.Content, res.Usage, msgCost) if usedModel != nil && usedModel.ID() != model.ID() { slog.InfoContext(ctx, "Used fallback model", "agent", a.Name(), "primary", model.ID().String(), "used", usedModel.ID().String()) @@ -637,7 +646,7 @@ func (r *LocalRuntime) runTurn( events.Emit(Warning(fmt.Sprintf("Model %s refused to respond (stop reason: refusal).", modelID.String()), a.Name())) } - msgUsage := r.recordAssistantMessage(sess, a, res, agentTools, modelID.String(), m, events) + msgUsage := r.recordAssistantMessage(sess, a, res, agentTools, modelID.String(), msgCost, events) usage := SessionUsage(sess, contextLimit) usage.LastMessage = msgUsage @@ -793,16 +802,39 @@ func (r *LocalRuntime) Run(ctx context.Context, sess *session.Session) ([]sessio return sess.GetAllMessages(), nil } +// computeMessageCost returns the USD cost of a single model response, +// or nil when the response cannot be priced. It is nil when there is +// no usage to price (usage == nil) or the model has no pricing table +// (m == nil — e.g. an unknown model ID or a custom endpoint without +// cost config — or m.Cost == nil). A non-nil result of 0 therefore +// means "priced, but this call was free", distinct from "unpriced" +// (nil). This single arithmetic source feeds both the persisted +// assistant message (dereferenced to 0 when nil) and the +// after_llm_call hook payload (which keeps the nil/0 distinction), so +// the two can never disagree. 
+func computeMessageCost(usage *chat.Usage, m *modelsdev.Model) *float64 { + if usage == nil || m == nil || m.Cost == nil { + return nil + } + cost := (float64(usage.InputTokens)*m.Cost.Input + + float64(usage.OutputTokens)*m.Cost.Output + + float64(usage.CachedInputTokens)*m.Cost.CacheRead + + float64(usage.CacheWriteTokens)*m.Cost.CacheWrite) / 1e6 + return &cost +} + // recordAssistantMessage adds the model's response to the session and returns // per-message usage information for the token-usage event. Empty responses // (no text and no tool calls) are silently skipped since providers reject them. +// cost is the precomputed per-turn cost (see computeMessageCost); nil records +// as 0, matching the previous "no pricing data" behaviour. func (r *LocalRuntime) recordAssistantMessage( sess *session.Session, a *agent.Agent, res streamResult, agentTools []tools.Tool, modelID string, - m *modelsdev.Model, + cost *float64, events EventSink, ) *MessageUsage { if strings.TrimSpace(res.Content) == "" && len(res.Calls) == 0 { @@ -824,13 +856,17 @@ func (r *LocalRuntime) recordAssistantMessage( } } - // Calculate per-message cost when pricing information is available. - // When the model is absent from the catalogue (or carries no price - // table) the cost is silently 0 even though tokens were spent; warn so - // the otherwise-invisible "uncatalogued model bills $0" leak is at least - // observable in logs and any spend guardrail built on top of it. - messageCost, priced := computeMessageCost(res.Usage, m) - if !priced && usageHasTokens(res.Usage) { + // The per-turn cost was computed once in runTurn and threaded in; + // nil means the response could not be priced and records as 0, + // preserving the previous "no pricing data" behaviour. 
When the model + // is absent from the catalogue (or carries no price table) the cost is + // silently 0 even though tokens were spent; warn so the otherwise- + // invisible "uncatalogued model bills $0" leak is at least observable + // in logs and any spend guardrail built on top of it. + var messageCost float64 + if cost != nil { + messageCost = *cost + } else if usageHasTokens(res.Usage) { slog.Warn("Model is missing from the pricing catalogue; recording $0 cost despite token usage", "agent", a.Name(), "model", modelID, @@ -873,23 +909,6 @@ func (r *LocalRuntime) recordAssistantMessage( return msgUsage } -// computeMessageCost returns the dollar cost of a single assistant message -// and whether pricing information was actually available. priced is false -// when usage is nil, the model is unknown to the catalogue, or it carries no -// price table; callers use that signal to distinguish a genuine $0 turn from -// an uncatalogued-model turn whose real cost is unknown. The arithmetic is -// unchanged from the original inline computation. -func computeMessageCost(usage *chat.Usage, m *modelsdev.Model) (cost float64, priced bool) { - if usage == nil || m == nil || m.Cost == nil { - return 0, false - } - cost = (float64(usage.InputTokens)*m.Cost.Input + - float64(usage.OutputTokens)*m.Cost.Output + - float64(usage.CachedInputTokens)*m.Cost.CacheRead + - float64(usage.CacheWriteTokens)*m.Cost.CacheWrite) / 1e6 - return cost, true -} - // usageHasTokens reports whether any billable tokens were recorded for a turn. // Used to suppress the missing-price warning for empty/no-op turns. 
func usageHasTokens(usage *chat.Usage) bool { From e78f3b54dca79a66964521a7f5fba02814136273 Mon Sep 17 00:00:00 2001 From: kimizuka Date: Sun, 21 Jun 2026 02:21:27 +0000 Subject: [PATCH 2/4] test(runtime): cover after_llm_call usage and cost payload Verify that after_llm_call populates usage and cost, that cost is nil when the model is unpriced, the nil-vs-zero JSON contract, harness usage with no cost surfacing as unpriced, and computeMessageCost. Signed-off-by: kimizuka --- pkg/runtime/after_llm_call_test.go | 260 +++++++++++++++++++++++++++++ 1 file changed, 260 insertions(+) diff --git a/pkg/runtime/after_llm_call_test.go b/pkg/runtime/after_llm_call_test.go index 2f0f519d8..7a76994ad 100644 --- a/pkg/runtime/after_llm_call_test.go +++ b/pkg/runtime/after_llm_call_test.go @@ -2,6 +2,9 @@ package runtime import ( "context" + "encoding/json" + "os" + stdruntime "runtime" "sync/atomic" "testing" @@ -9,12 +12,28 @@ import ( "github.com/stretchr/testify/require" "github.com/docker/docker-agent/pkg/agent" + "github.com/docker/docker-agent/pkg/chat" "github.com/docker/docker-agent/pkg/config/latest" "github.com/docker/docker-agent/pkg/hooks" + "github.com/docker/docker-agent/pkg/modelsdev" "github.com/docker/docker-agent/pkg/session" "github.com/docker/docker-agent/pkg/team" ) +// mockModelStoreWithCost returns a model carrying a fixed pricing +// table so after_llm_call can compute a non-nil per-turn cost. The +// zero mockModelStore returns a nil model, which exercises the +// unpriced (nil cost) path instead. 
+type mockModelStoreWithCost struct { + ModelStore + cost modelsdev.Cost +} + +func (m mockModelStoreWithCost) GetModel(_ context.Context, _ modelsdev.ID) (*modelsdev.Model, error) { + c := m.cost + return &modelsdev.Model{Cost: &c}, nil +} + // TestAfterLLMCallHook_PopulatesModelID is a regression test for the // doc/impl mismatch where [hooks.Input.ModelID] is documented as // populated for after_llm_call but executeAfterLLMCallHooks never @@ -74,3 +93,244 @@ func TestAfterLLMCallHook_PopulatesModelID(t *testing.T) { "after_llm_call payload must include the canonical model id; "+ "see pkg/hooks/types.go:177-186 for the documented contract") } + +// captureAfterLLMCall runs a single successful turn against the given +// model store and returns the after_llm_call payload the runtime +// dispatched, together with the session so callers can cross-check the +// hook cost against what the session recorded. Usage is fixed at 10 +// input / 5 output tokens so callers can assert an exact computed cost. +func captureAfterLLMCall(t *testing.T, store ModelStore) (*hooks.Input, *session.Session) { + t.Helper() + + const hookName = "test-after-llm-usage-cost" + + var captured atomic.Pointer[hooks.Input] + + stream := newStreamBuilder(). + AddContent("ok"). + AddStopWithUsage(10, 5). 
+ Build() + prov := &mockProvider{id: "test/mock-model", stream: stream} + + root := agent.New("root", "test agent", + agent.WithModel(prov), + agent.WithHooks(&latest.HooksConfig{ + AfterLLMCall: []latest.HookDefinition{ + {Type: "builtin", Command: hookName}, + }, + }), + ) + tm := team.New(team.WithAgents(root)) + + rt, err := NewLocalRuntime(tm, + WithSessionCompaction(false), + WithModelStore(store), + ) + require.NoError(t, err) + + require.NoError(t, rt.hooksRegistry.RegisterBuiltin( + hookName, + func(_ context.Context, in *hooks.Input, _ []string) (*hooks.Output, error) { + snap := *in + captured.Store(&snap) + return nil, nil + }, + )) + + sess := session.New(session.WithUserMessage("hi")) + sess.Title = "Unit Test" + + for range rt.RunStream(t.Context(), sess) { + } + + got := captured.Load() + require.NotNil(t, got, "after_llm_call hook must fire on a successful turn") + return got, sess +} + +// TestAfterLLMCallHook_PopulatesUsageAndCost pins the priced-call +// contract: when the model has a pricing table, after_llm_call carries +// the provider's token usage and a non-nil Cost equal to the value the +// runtime records on the assistant message (same computeMessageCost +// call, threaded to both). +func TestAfterLLMCallHook_PopulatesUsageAndCost(t *testing.T) { + t.Parallel() + + rate := modelsdev.Cost{Input: 2.0, Output: 4.0} + in, sess := captureAfterLLMCall(t, mockModelStoreWithCost{cost: rate}) + + require.NotNil(t, in.Usage, "Usage must be populated on after_llm_call") + assert.Equal(t, int64(10), in.Usage.InputTokens) + assert.Equal(t, int64(5), in.Usage.OutputTokens) + + // Same arithmetic as computeMessageCost; inputs chosen for exact + // float64 representation so equality is reliable. 
+ expected := (float64(10)*rate.Input + float64(5)*rate.Output) / 1e6 + require.NotNil(t, in.Cost, "Cost must be non-nil for a priced model") + assert.Equal(t, expected, *in.Cost, + "hook Cost must equal computeMessageCost(usage, model)") + + // The headline guarantee: the cost the hook reports is the same + // cost the session bills for the turn. OwnCost sums the recorded + // assistant message's Cost, set from the same computeMessageCost + // value threaded into recordAssistantMessage. + assert.Equal(t, *in.Cost, sess.OwnCost(), + "hook Cost must equal the cost the session recorded for the turn") +} + +// TestAfterLLMCallHook_CostNilWhenUnpriced pins the unpriced contract: +// when the model has no pricing data (the zero mockModelStore returns a +// nil model), Usage is still populated but Cost is nil — the signal a +// sidecar reads as "this model is unpriced", distinct from a priced +// free call (a non-nil pointer to 0). +func TestAfterLLMCallHook_CostNilWhenUnpriced(t *testing.T) { + t.Parallel() + + in, _ := captureAfterLLMCall(t, mockModelStore{}) + + require.NotNil(t, in.Usage, + "Usage must still be populated even when the model is unpriced") + assert.Equal(t, int64(10), in.Usage.InputTokens) + assert.Nil(t, in.Cost, + "Cost must be nil for an unpriced model so handlers can "+ + "distinguish it from a priced free call (pointer to 0)") +} + +// TestAfterLLMCallInput_CostJSONContract pins the wire format sidecar +// scripts depend on. With Cost as a *float64 + omitempty: +// - nil → the "cost" key is absent (unpriced), +// - &0 → "cost": 0 is present, NOT elided (priced free call — +// omitempty drops only nil pointers, never a pointer to 0), +// - &N → "cost": N. +// +// The same nil-omitted rule applies to Usage, keeping every non- +// after_llm_call event's payload free of spurious cost/usage keys. 
+func TestAfterLLMCallInput_CostJSONContract(t *testing.T) { + t.Parallel() + + marshalKeys := func(in *hooks.Input) map[string]any { + b, err := json.Marshal(in) + require.NoError(t, err) + var m map[string]any + require.NoError(t, json.Unmarshal(b, &m)) + return m + } + + t.Run("unpriced omits cost and usage", func(t *testing.T) { + t.Parallel() + m := marshalKeys(&hooks.Input{HookEventName: hooks.EventAfterLLMCall}) + _, hasCost := m["cost"] + _, hasUsage := m["usage"] + assert.False(t, hasCost, "nil Cost must be omitted, not emitted as null") + assert.False(t, hasUsage, "nil Usage must be omitted") + }) + + t.Run("priced free call emits explicit zero", func(t *testing.T) { + t.Parallel() + zero := 0.0 + m := marshalKeys(&hooks.Input{ + HookEventName: hooks.EventAfterLLMCall, + Usage: &chat.Usage{InputTokens: 1, OutputTokens: 1}, + Cost: &zero, + }) + raw, hasCost := m["cost"] + require.True(t, hasCost, + "a non-nil pointer to 0 must emit \"cost\": 0, not be elided — "+ + "this is what distinguishes a free priced call from an unpriced model") + assert.Equal(t, float64(0), raw) + _, hasUsage := m["usage"] + assert.True(t, hasUsage, "Usage must be present when set") + }) + + t.Run("priced call emits the value", func(t *testing.T) { + t.Parallel() + v := 0.0125 + m := marshalKeys(&hooks.Input{HookEventName: hooks.EventAfterLLMCall, Cost: &v}) + assert.Equal(t, 0.0125, m["cost"]) + }) +} + +// TestAfterLLMCallHook_HarnessUsageWithoutCostIsUnpriced pins the +// harness cost gate. The codex harness reports token counts via +// turn.completed but never a cost, so the harness library's +// TotalCostUSD defaults to 0. That 0 must be treated as unpriced (nil +// cost on the hook), NOT as a free priced call (cost 0) — otherwise a +// cost ledger would record a real, billed harness turn as $0. 
+func TestAfterLLMCallHook_HarnessUsageWithoutCostIsUnpriced(t *testing.T) { + if stdruntime.GOOS == "windows" { + t.Skip("shell script shim test") + } + + const hookName = "test-after-llm-harness-cost" + + binDir := t.TempDir() + writeHarnessScript(t, binDir, "codex", `#!/bin/sh +printf '%s\n' '{"type":"item.completed","item":{"type":"agent_message","text":"harness done"}}' +printf '%s\n' '{"type":"turn.completed","usage":{"input_tokens":120,"output_tokens":30}}' +`) + t.Setenv("PATH", binDir+string(os.PathListSeparator)+os.Getenv("PATH")) + + var captured atomic.Pointer[hooks.Input] + + root := agent.New("root", "You are an external coder.", + agent.WithHarness(&latest.HarnessConfig{Type: "codex"}), + agent.WithHooks(&latest.HooksConfig{ + AfterLLMCall: []latest.HookDefinition{{Type: "builtin", Command: hookName}}, + }), + ) + rt, err := NewLocalRuntime(team.New(team.WithAgents(root)), + WithSessionCompaction(false), WithModelStore(mockModelStore{})) + require.NoError(t, err) + + require.NoError(t, rt.hooksRegistry.RegisterBuiltin( + hookName, + func(_ context.Context, in *hooks.Input, _ []string) (*hooks.Output, error) { + snap := *in + captured.Store(&snap) + return nil, nil + }, + )) + + sess := session.New(session.WithUserMessage("do the task")) + sess.Title = "Harness Unit Test" + for range rt.RunStream(t.Context(), sess) { + } + + in := captured.Load() + require.NotNil(t, in, "after_llm_call must fire for a harness turn") + require.NotNil(t, in.Usage, "harness usage must be forwarded to the hook") + assert.Equal(t, int64(120), in.Usage.InputTokens) + assert.Equal(t, int64(30), in.Usage.OutputTokens) + assert.Nil(t, in.Cost, + "a harness that reports no cost must yield nil cost (unpriced), not 0 (free)") +} + +// TestComputeMessageCost unit-tests the single cost-arithmetic source +// shared by the persisted message and the after_llm_call payload, +// including every branch that yields nil (unpriced). 
+func TestComputeMessageCost(t *testing.T) { + t.Parallel() + + rate := &modelsdev.Cost{Input: 2.0, Output: 4.0, CacheRead: 1.0, CacheWrite: 5.0} + + t.Run("nil usage is unpriced", func(t *testing.T) { + t.Parallel() + assert.Nil(t, computeMessageCost(nil, &modelsdev.Model{Cost: rate})) + }) + t.Run("nil model is unpriced", func(t *testing.T) { + t.Parallel() + assert.Nil(t, computeMessageCost(&chat.Usage{InputTokens: 1}, nil)) + }) + t.Run("model without pricing table is unpriced", func(t *testing.T) { + t.Parallel() + assert.Nil(t, computeMessageCost(&chat.Usage{InputTokens: 1}, &modelsdev.Model{})) + }) + t.Run("priced computes from all token classes", func(t *testing.T) { + t.Parallel() + usage := &chat.Usage{InputTokens: 10, OutputTokens: 5, CachedInputTokens: 4, CacheWriteTokens: 2} + got := computeMessageCost(usage, &modelsdev.Model{Cost: rate}) + require.NotNil(t, got) + expected := (10*rate.Input + 5*rate.Output + 4*rate.CacheRead + 2*rate.CacheWrite) / 1e6 + assert.Equal(t, expected, *got) + }) +} From d7c8d29438583495eb9ba52b929fc1a68e981233 Mon Sep 17 00:00:00 2001 From: kimizuka Date: Sun, 21 Jun 2026 02:21:27 +0000 Subject: [PATCH 3/4] docs(hooks): document after_llm_call usage and cost fields Describe the new usage and cost fields, the priced/unpriced/free semantics and the harness caveat, and add a per-call cost-ledger example to examples/hooks.yaml. 
Signed-off-by: kimizuka --- docs/configuration/hooks/index.md | 7 +++++-- examples/hooks.yaml | 15 +++++++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/docs/configuration/hooks/index.md b/docs/configuration/hooks/index.md index 37ef7f2bc..bf44cf02c 100644 --- a/docs/configuration/hooks/index.md +++ b/docs/configuration/hooks/index.md @@ -263,7 +263,7 @@ In addition to the common fields, each event ships its own payload: | `turn_start` | _none_ (just the common fields) | | `turn_end` | `agent_name`, `reason` — one of `normal`, `continue`, `steered`, `error`, `canceled`, `hook_blocked`, `loop_detected` | | `before_llm_call` | `iteration` — 1-based run-loop iteration counter (the model call this hook is gating), `model_id` | -| `after_llm_call` | `agent_name`, `stop_response`, `last_user_message`, `model_id` | +| `after_llm_call` | `agent_name`, `stop_response`, `last_user_message`, `model_id`, `usage`, `cost` | | `session_end` | `reason` — one of `clear`, `logout`, `prompt_input_exit`, `other` | | `pre_compact` | `source` — one of `manual`, `auto`, `overflow`, `tool_overflow` | | `before_compaction` | `input_tokens`, `output_tokens`, `context_limit`, `compaction_reason` (one of `threshold`/`overflow`/`manual`) | @@ -288,6 +288,9 @@ Notes: - `prompt` is also populated for `user_followup_submit`, carrying the text of the dequeued follow-up message (a user message queued for end-of-turn processing via the FollowUp API / queue, as opposed to mid-turn steering). - `stop_response` carries the model's final assistant text for `stop`, `after_llm_call`, and `subagent_stop`. `last_user_message` carries the latest user message at dispatch time. - `model_id` is populated for `after_llm_call` (and `before_llm_call`) in the canonical `/` form (e.g. `anthropic/claude-sonnet-4-5`). For harness agents, `model_id` is the harness label (e.g. `claude-code`) rather than a canonical model name — see [Coding Harnesses]({{ '/features/harnesses/' | relative_url }}). 
+- `usage` and `cost` are populated for `after_llm_call` only. `usage` is the per-call token usage object (`input_tokens`, `output_tokens`, `cached_input_tokens`, `cached_write_tokens`, and `reasoning_tokens` — the last is itself omitted for non-reasoning models); the whole object is absent when the provider reported no usage. `cost` is the USD price of that one model response. For a **native model call** it is the price computed from `usage` and the model's pricing table, and equals the cost the session records for the turn: it is **absent** when the response is unpriced (no pricing data on file, or no usage) and an explicit `0` for a priced call that was free — so a present `cost` is authoritative and an absent one means "unpriced", with no need to cross-check `usage`. (For harness agents the meaning differs — see the next note.) A cost ledger can therefore record per-call spend from the payload alone, without subscribing to the runtime event channel. +- For [harness agents]({{ '/features/harnesses/' | relative_url }}), `cost` is the harness's own reported total for the call rather than a computed price, and is present only when the harness reported a non-zero cost (some harnesses, e.g. `codex`, report token counts but no cost — those turns carry `usage` with `cost` absent, even though the recorded message stores `0`). +- `after_llm_call` fires for **every** model call, including calls made inside sub-sessions (transferred tasks, background agents, skills). For those, `session_id` is the sub-session's id. Summing `cost` across `after_llm_call` events therefore captures **all** priced spend, including sub-sessions (and even sub-sessions that error before their cost is persisted); calls whose `cost` is absent (unpriced models, harnesses that report no cost) contribute nothing to the sum, so count them separately if you need to flag untracked spend. Do **not** add a separately-queried session cost total on top: the runtime's own total already recurses into and includes completed sub-session spend, so combining the two double-counts. Pick one source — the summed hook costs — as the authoritative ledger. 
- `context_limit` is `0` when the model definition is unavailable (treat `0` as "unknown", not as a real limit). - `approval_decision` is one of `allow`, `deny`, `canceled`. `approval_source` is a stable classifier of which step decided (e.g. `yolo`, `session_permissions_allow`, `session_permissions_deny`, `team_permissions_allow`, `team_permissions_deny`, `pre_tool_use_hook_allow`, `pre_tool_use_hook_deny`, `readonly_hint`, `user_approved`, `user_approved_session`, `user_approved_tool`, `user_rejected`, `context_canceled`). @@ -559,7 +562,7 @@ The `reason` field classifies the exit: `before_llm_call` fires immediately before every model call (after `turn_start` has assembled the messages). It cannot contribute context — use `turn_start` for that — but it can **stop the run** by returning `decision: block` (or exit code 2). The built-in `max_iterations` hook implements a hard cap on top of this event. -`after_llm_call` fires immediately after each successful model call, before the response is recorded into the session and tool calls are dispatched. The assistant text is in `stop_response`. Use it for response auditing, redaction logging, or quality metrics. Failed model calls fire `on_error` instead. +`after_llm_call` fires immediately after each successful model call, before the response is recorded into the session and tool calls are dispatched. The assistant text is in `stop_response`, and the call's `usage` and `cost` carry the per-turn token usage and computed USD spend (see the field notes above). Use it for response auditing, redaction logging, quality metrics, or a sidecar cost ledger that records per-call spend without subscribing to the runtime event channel. Failed model calls fire `on_error` instead. 
### Before/After-Compaction: structured compaction control diff --git a/examples/hooks.yaml b/examples/hooks.yaml index 812357084..b9d10a584 100644 --- a/examples/hooks.yaml +++ b/examples/hooks.yaml @@ -74,6 +74,7 @@ # /tmp/agent-steering.log (user_steering_messages_submit) # /tmp/agent-followups.log (user_followup_submit) # /tmp/agent-llm-calls.log (before_llm_call, after_llm_call) +# /tmp/agent-cost-ledger.csv (after_llm_call: per-call token usage + cost) # /tmp/agent-turns.log (turn_end) # /tmp/agent-tool-results.log (post_tool_use) # /tmp/agent-permissions.log (permission_request) @@ -331,6 +332,14 @@ agents: # assistant text content arrives via stop_response (matching the # stop event's payload). Failed calls fire on_error instead and # skip this event. + # + # The payload also carries this call's token usage in .usage and its + # computed USD cost in .cost. .cost is ABSENT for an unpriced model + # (test with `has("cost")`) and an explicit 0 for a priced free call, + # so a present cost is authoritative without checking usage. That is + # everything a sidecar cost ledger needs — no event-channel wiring. + # after_llm_call also fires for sub-session turns (each with its own + # session_id), so summing .cost is the full spend for the run. # ==================================================================== after_llm_call: - type: command @@ -340,6 +349,12 @@ agents: SESSION_ID=$(echo "$INPUT" | jq -r '.session_id // "unknown"') LEN=$(echo "$INPUT" | jq -r '.stop_response // ""' | wc -c | tr -d ' ') echo "[$(date)] [←] $SESSION_ID llm call complete, content=$LEN chars" >> /tmp/agent-llm-calls.log + # Per-call cost ledger: timestamp, session, model, tokens, cost. 
+ echo "$INPUT" | jq -r '[ + (now | todateiso8601), .session_id, .model_id, + (.usage.input_tokens // 0), (.usage.output_tokens // 0), + (if has("cost") then (.cost | tostring) else "unpriced" end) + ] | @csv' >> /tmp/agent-cost-ledger.csv # ==================================================================== # SESSION-END - cleanup when the session terminates. From f69f17c41890b5d9a2b84d5c5f903e7c513f46c9 Mon Sep 17 00:00:00 2001 From: kimizuka Date: Sun, 21 Jun 2026 02:21:27 +0000 Subject: [PATCH 4/4] test(runtime): satisfy golangci-lint in after_llm_call test Add the empty line embeddedstructfieldcheck wants between the embedded ModelStore and the cost field, and switch the float equality assertions to assert.InDelta to satisfy testifylint's float-compare rule. --- pkg/runtime/after_llm_call_test.go | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pkg/runtime/after_llm_call_test.go b/pkg/runtime/after_llm_call_test.go index 7a76994ad..4eb8c6045 100644 --- a/pkg/runtime/after_llm_call_test.go +++ b/pkg/runtime/after_llm_call_test.go @@ -26,6 +26,7 @@ import ( // unpriced (nil cost) path instead. type mockModelStoreWithCost struct { ModelStore + cost modelsdev.Cost } @@ -167,14 +168,14 @@ func TestAfterLLMCallHook_PopulatesUsageAndCost(t *testing.T) { // float64 representation so equality is reliable. expected := (float64(10)*rate.Input + float64(5)*rate.Output) / 1e6 require.NotNil(t, in.Cost, "Cost must be non-nil for a priced model") - assert.Equal(t, expected, *in.Cost, + assert.InDelta(t, expected, *in.Cost, 1e-9, "hook Cost must equal computeMessageCost(usage, model)") // The headline guarantee: the cost the hook reports is the same // cost the session bills for the turn. OwnCost sums the recorded // assistant message's Cost, set from the same computeMessageCost // value threaded into recordAssistantMessage. 
- assert.Equal(t, *in.Cost, sess.OwnCost(), + assert.InDelta(t, *in.Cost, sess.OwnCost(), 1e-9, "hook Cost must equal the cost the session recorded for the turn") } @@ -237,7 +238,7 @@ func TestAfterLLMCallInput_CostJSONContract(t *testing.T) { require.True(t, hasCost, "a non-nil pointer to 0 must emit \"cost\": 0, not be elided — "+ "this is what distinguishes a free priced call from an unpriced model") - assert.Equal(t, float64(0), raw) + assert.InDelta(t, float64(0), raw, 1e-9) _, hasUsage := m["usage"] assert.True(t, hasUsage, "Usage must be present when set") }) @@ -246,7 +247,7 @@ func TestAfterLLMCallInput_CostJSONContract(t *testing.T) { t.Parallel() v := 0.0125 m := marshalKeys(&hooks.Input{HookEventName: hooks.EventAfterLLMCall, Cost: &v}) - assert.Equal(t, 0.0125, m["cost"]) + assert.InDelta(t, 0.0125, m["cost"], 1e-9) }) } @@ -331,6 +332,6 @@ func TestComputeMessageCost(t *testing.T) { got := computeMessageCost(usage, &modelsdev.Model{Cost: rate}) require.NotNil(t, got) expected := (10*rate.Input + 5*rate.Output + 4*rate.CacheRead + 2*rate.CacheWrite) / 1e6 - assert.Equal(t, expected, *got) + assert.InDelta(t, expected, *got, 1e-9) }) }