Merge pull request #774 from nonoqing/yuyiqing/dev

nonoqing · web-flow · commit 9bfb4b67d33c · 2026-05-18T21:46:59.000+08:00
feat(token usage)
diff --git a/src/apps/desktop/src/api/agentic_api.rs b/src/apps/desktop/src/api/agentic_api.rs
@@ -1469,6 +1469,7 @@ mod tests {
             session_id: "session-1".to_string(),
             timestamp: 1,
             kind: Default::default(),
+            agent_type: None,
             user_message: UserMessageData {
                 id: "user-1".to_string(),
                 content: "hello".to_string(),
@@ -1529,6 +1530,7 @@ mod tests {
             session_id: "session-1".to_string(),
             timestamp: 1,
             kind: Default::default(),
+            agent_type: None,
             user_message: UserMessageData {
                 id: "user-1".to_string(),
                 content: "hello".to_string(),
@@ -1586,6 +1588,7 @@ mod tests {
             session_id: "session-1".to_string(),
             timestamp: 1,
             kind: Default::default(),
+            agent_type: None,
             user_message: UserMessageData {
                 id: "user-1".to_string(),
                 content: "hello".to_string(),
diff --git a/src/crates/agent-stream/src/lib.rs b/src/crates/agent-stream/src/lib.rs
@@ -1071,6 +1071,7 @@ mod tests {
             total_token_count: total_tokens,
             reasoning_token_count: None,
             cached_content_token_count: None,
+            cache_creation_token_count: None,
         }
     }
 
diff --git a/src/crates/ai-adapters/src/stream/types/anthropic.rs b/src/crates/ai-adapters/src/stream/types/anthropic.rs
@@ -45,22 +45,27 @@ impl Usage {
 
 impl From<Usage> for UnifiedTokenUsage {
     fn from(value: Usage) -> Self {
-        let cache_read = value.cache_read_input_tokens.unwrap_or(0);
-        let cache_creation = value.cache_creation_input_tokens.unwrap_or(0);
-        let prompt_token_count = value.input_tokens.unwrap_or(0) + cache_read + cache_creation;
+        let cache_read = value.cache_read_input_tokens;
+        let cache_creation = value.cache_creation_input_tokens;
+
+        // prompt_token_count = total context tokens occupied (industry-standard
+        // "input tokens" metric). For Anthropic that is uncached input + cache
+        // reads + cache writes; other providers report the total directly.
+        let prompt_token_count = value.input_tokens.unwrap_or(0)
+            + cache_read.unwrap_or(0)
+            + cache_creation.unwrap_or(0);
         let candidates_token_count = value.output_tokens.unwrap_or(0);
+
         Self {
             prompt_token_count,
             candidates_token_count,
             total_token_count: prompt_token_count + candidates_token_count,
             reasoning_token_count: None,
-            cached_content_token_count: match (
-                value.cache_read_input_tokens,
-                value.cache_creation_input_tokens,
-            ) {
-                (None, None) => None,
-                (read, creation) => Some(read.unwrap_or(0) + creation.unwrap_or(0)),
-            },
+            // cached_content_token_count = cache READS only. This is the
+            // numerator for `cache hit rate = cached / prompt`. Writes go
+            // to cache_creation_token_count below.
+            cached_content_token_count: cache_read,
+            cache_creation_token_count: cache_creation,
         }
     }
 }
@@ -210,3 +215,76 @@ impl From<AnthropicSSEErrorDetails> for String {
         format!("{}: {}", value.error_type, value.message)
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::stream::types::unified::UnifiedTokenUsage;
+
+    #[test]
+    fn cached_content_token_count_is_reads_only_not_sum() {
+        let raw = r#"{
+            "input_tokens": 100,
+            "output_tokens": 50,
+            "cache_read_input_tokens": 30,
+            "cache_creation_input_tokens": 20
+        }"#;
+        let usage: Usage = serde_json::from_str(raw).expect("valid anthropic usage");
+        let unified: UnifiedTokenUsage = usage.into();
+
+        // cached_content_token_count must be reads only — NOT read + creation.
+        // This guarantees `cached_content / prompt` is a correct hit rate.
+        assert_eq!(unified.cached_content_token_count, Some(30));
+        assert_eq!(unified.cache_creation_token_count, Some(20));
+
+        // prompt_token_count keeps its "total context" semantics (matches the
+        // industry-standard "input tokens" metric across providers).
+        assert_eq!(unified.prompt_token_count, 150);
+        assert_eq!(unified.candidates_token_count, 50);
+        assert_eq!(unified.total_token_count, 200);
+
+        // Hit rate computed by downstream:
+        //   30 / 150 == 20% (correct: only reads count as hits)
+        // Pre-fix this would have been wrongly 50/150 == 33%.
+    }
+
+    #[test]
+    fn absent_cache_fields_stay_none() {
+        let raw = r#"{ "input_tokens": 100, "output_tokens": 50 }"#;
+        let usage: Usage = serde_json::from_str(raw).expect("valid anthropic usage");
+        let unified: UnifiedTokenUsage = usage.into();
+        assert_eq!(unified.cached_content_token_count, None);
+        assert_eq!(unified.cache_creation_token_count, None);
+    }
+
+    #[test]
+    fn zero_cache_fields_are_some_zero_not_none() {
+        // Cache fields that are reported as zero for this call must stay
+        // distinguishable from "provider did not report cache fields at all".
+        let raw = r#"{
+            "input_tokens": 100,
+            "output_tokens": 50,
+            "cache_read_input_tokens": 0,
+            "cache_creation_input_tokens": 0
+        }"#;
+        let usage: Usage = serde_json::from_str(raw).expect("valid anthropic usage");
+        let unified: UnifiedTokenUsage = usage.into();
+        assert_eq!(unified.cached_content_token_count, Some(0));
+        assert_eq!(unified.cache_creation_token_count, Some(0));
+    }
+
+    #[test]
+    fn only_read_present_no_creation() {
+        let raw = r#"{
+            "input_tokens": 100,
+            "output_tokens": 50,
+            "cache_read_input_tokens": 30
+        }"#;
+        let usage: Usage = serde_json::from_str(raw).expect("valid anthropic usage");
+        let unified: UnifiedTokenUsage = usage.into();
+        assert_eq!(unified.cached_content_token_count, Some(30));
+        assert_eq!(unified.cache_creation_token_count, None);
+        // prompt_token_count = input + read (no creation contribution)
+        assert_eq!(unified.prompt_token_count, 130);
+    }
+}
diff --git a/src/crates/ai-adapters/src/stream/types/gemini.rs b/src/crates/ai-adapters/src/stream/types/gemini.rs
@@ -104,6 +104,7 @@ impl From<GeminiUsageMetadata> for UnifiedTokenUsage {
             total_token_count: usage.total_token_count,
             reasoning_token_count,
             cached_content_token_count: usage.cached_content_token_count,
+            cache_creation_token_count: None,
         }
     }
 }
@@ -770,4 +771,21 @@ mod tests {
             .and_then(|metadata| metadata.get("promptFeedback"))
             .is_some());
     }
+
+    #[test]
+    fn gemini_cache_creation_is_always_none() {
+        let payload = serde_json::json!({
+            "candidates": [{ "content": { "parts": [{ "text": "answer" }] } }],
+            "usageMetadata": {
+                "promptTokenCount": 100,
+                "candidatesTokenCount": 20,
+                "totalTokenCount": 120,
+                "cachedContentTokenCount": 35
+            }
+        });
+        let data: GeminiSSEData = serde_json::from_value(payload).expect("gemini payload");
+        let usage = data.into_unified_responses()[0].usage.as_ref().expect("usage").clone();
+        assert_eq!(usage.cached_content_token_count, Some(35));
+        assert_eq!(usage.cache_creation_token_count, None);
+    }
 }
diff --git a/src/crates/ai-adapters/src/stream/types/openai.rs b/src/crates/ai-adapters/src/stream/types/openai.rs
@@ -15,18 +15,34 @@ struct OpenAIUsage {
     #[serde(default)]
     total_tokens: u32,
     prompt_tokens_details: Option<PromptTokensDetails>,
+    /// DeepSeek extension. Subset of `prompt_tokens`. Absent on non-DeepSeek
+    /// providers. Prefer this over `prompt_tokens_details.cached_tokens` when
+    /// both are present — DeepSeek-native is the authoritative source.
+    #[serde(default)]
+    prompt_cache_hit_tokens: Option<u32>,
+    /// DeepSeek extension. Equals `prompt_tokens - prompt_cache_hit_tokens`.
+    /// Declared so that enabling `#[serde(deny_unknown_fields)]` later cannot
+    /// reject the payload; not propagated (miss count is derivable from the other two).
+    #[serde(default)]
+    #[allow(dead_code)]
+    prompt_cache_miss_tokens: Option<u32>,
 }
 
 impl From<OpenAIUsage> for UnifiedTokenUsage {
     fn from(usage: OpenAIUsage) -> Self {
+        let standard_cached = usage
+            .prompt_tokens_details
+            .and_then(|details| details.cached_tokens);
+        // DeepSeek extension wins when both present.
+        let cache_read = usage.prompt_cache_hit_tokens.or(standard_cached);
+
         Self {
             prompt_token_count: usage.prompt_tokens,
             candidates_token_count: usage.completion_tokens,
             total_token_count: usage.total_tokens,
             reasoning_token_count: None,
-            cached_content_token_count: usage
-                .prompt_tokens_details
-                .and_then(|prompt_tokens_details| prompt_tokens_details.cached_tokens),
+            cached_content_token_count: cache_read,
+            cache_creation_token_count: None,
         }
     }
 }
@@ -692,4 +708,85 @@ mod tests {
         assert_eq!(responses.len(), 1);
         assert!(responses[0].tool_call.is_some());
     }
+
+    #[test]
+    fn standard_openai_cached_tokens_maps_through() {
+        let raw = r#"{
+            "id": "chatcmpl_test",
+            "created": 1,
+            "model": "gpt-test",
+            "choices": [],
+            "usage": {
+                "prompt_tokens": 100,
+                "completion_tokens": 20,
+                "total_tokens": 120,
+                "prompt_tokens_details": { "cached_tokens": 40 }
+            }
+        }"#;
+        let data: OpenAISSEData = serde_json::from_str(raw).expect("valid openai sse data");
+        let usage = data.into_unified_responses()[0].usage.as_ref().expect("usage").clone();
+        assert_eq!(usage.cached_content_token_count, Some(40));
+        assert_eq!(usage.cache_creation_token_count, None);
+    }
+
+    #[test]
+    fn deepseek_prompt_cache_hit_tokens_is_captured() {
+        // Pre-fix this field was silently dropped (serde ignores unknown keys by default).
+        let raw = r#"{
+            "id": "chatcmpl_test",
+            "created": 1,
+            "model": "deepseek-chat",
+            "choices": [],
+            "usage": {
+                "prompt_tokens": 100,
+                "completion_tokens": 20,
+                "total_tokens": 120,
+                "prompt_cache_hit_tokens": 64,
+                "prompt_cache_miss_tokens": 36
+            }
+        }"#;
+        let data: OpenAISSEData = serde_json::from_str(raw).expect("valid deepseek sse data");
+        let usage = data.into_unified_responses()[0].usage.as_ref().expect("usage").clone();
+        assert_eq!(usage.cached_content_token_count, Some(64));
+    }
+
+    #[test]
+    fn deepseek_extension_preferred_over_standard_cached_tokens_if_both() {
+        // Defensive: if a proxy forwards both, prefer the DeepSeek-native field.
+        let raw = r#"{
+            "id": "chatcmpl_test",
+            "created": 1,
+            "model": "deepseek-chat",
+            "choices": [],
+            "usage": {
+                "prompt_tokens": 100,
+                "completion_tokens": 20,
+                "total_tokens": 120,
+                "prompt_cache_hit_tokens": 64,
+                "prompt_tokens_details": { "cached_tokens": 0 }
+            }
+        }"#;
+        let data: OpenAISSEData = serde_json::from_str(raw).expect("valid proxy payload");
+        let usage = data.into_unified_responses()[0].usage.as_ref().expect("usage").clone();
+        assert_eq!(usage.cached_content_token_count, Some(64));
+    }
+
+    #[test]
+    fn openai_no_cache_fields_stays_none() {
+        let raw = r#"{
+            "id": "chatcmpl_test",
+            "created": 1,
+            "model": "gpt-test",
+            "choices": [],
+            "usage": {
+                "prompt_tokens": 100,
+                "completion_tokens": 20,
+                "total_tokens": 120
+            }
+        }"#;
+        let data: OpenAISSEData = serde_json::from_str(raw).expect("valid openai sse data");
+        let usage = data.into_unified_responses()[0].usage.as_ref().expect("usage").clone();
+        assert_eq!(usage.cached_content_token_count, None);
+        assert_eq!(usage.cache_creation_token_count, None);
+    }
 }
diff --git a/src/crates/ai-adapters/src/stream/types/responses.rs b/src/crates/ai-adapters/src/stream/types/responses.rs
@@ -62,6 +62,7 @@ impl From<ResponsesUsage> for UnifiedTokenUsage {
             cached_content_token_count: usage
                 .input_tokens_details
                 .map(|details| details.cached_tokens),
+            cache_creation_token_count: None,
         }
     }
 }
diff --git a/src/crates/ai-adapters/src/stream/types/unified.rs b/src/crates/ai-adapters/src/stream/types/unified.rs
@@ -66,6 +66,16 @@ pub struct UnifiedTokenUsage {
     pub total_token_count: u32,
     #[serde(skip_serializing_if = "Option::is_none")]
     pub reasoning_token_count: Option<u32>,
+    /// Cache READ tokens (i.e., served from cache this call). Universal across
+    /// providers: OpenAI `cached_tokens`, DeepSeek `prompt_cache_hit_tokens`,
+    /// Anthropic `cache_read_input_tokens`, Gemini `cachedContentTokenCount`.
+    /// Hit rate consumers must use this as numerator and `prompt_token_count`
+    /// as denominator.
     #[serde(skip_serializing_if = "Option::is_none")]
     pub cached_content_token_count: Option<u32>,
+    /// Cache WRITE tokens (only Anthropic reports this per-token; others either
+    /// have no creation concept or bill creation by storage time). Disjoint from
+    /// `cached_content_token_count`. Do NOT include in hit-rate numerator.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub cache_creation_token_count: Option<u32>,
 }

Original file line number	Diff line number	Diff line change
`@@ -1071,6 +1071,7 @@ mod tests {`
`1071`	`1071`	`total_token_count: total_tokens,`
`1072`	`1072`	`reasoning_token_count: None,`
`1073`	`1073`	`cached_content_token_count: None,`
	`1074`	`+ cache_creation_token_count: None,`
`1074`	`1075`	`}`
`1075`	`1076`	`}`
`1076`	`1077`
Original file line number	Diff line number	Diff line change
`@@ -104,6 +104,7 @@ impl From<GeminiUsageMetadata> for UnifiedTokenUsage {`
`104`	`104`	`total_token_count: usage.total_token_count,`
`105`	`105`	`reasoning_token_count,`
`106`	`106`	`cached_content_token_count: usage.cached_content_token_count,`
	`107`	`+ cache_creation_token_count: None,`
`107`	`108`	`}`
`108`	`109`	`}`
`109`	`110`	`}`
`@@ -770,4 +771,21 @@ mod tests {`
`770`	`771`	`.and_then(\|metadata\| metadata.get("promptFeedback"))`
`771`	`772`	`.is_some());`
`772`	`773`	`}`
	`774`	`+`
	`775`	`+ #[test]`
	`776`	`+ fn gemini_cache_creation_is_always_none() {`
	`777`	`+ let payload = serde_json::json!({`
	`778`	`+ "candidates": [{ "content": { "parts": [{ "text": "answer" }] } }],`
	`779`	`+ "usageMetadata": {`
	`780`	`+ "promptTokenCount": 100,`
	`781`	`+ "candidatesTokenCount": 20,`
	`782`	`+ "totalTokenCount": 120,`
	`783`	`+ "cachedContentTokenCount": 35`
	`784`	`+ }`
	`785`	`+ });`
	`786`	`+ let data: GeminiSSEData = serde_json::from_value(payload).expect("gemini payload");`
	`787`	`+ let usage = data.into_unified_responses()[0].usage.as_ref().expect("usage").clone();`
	`788`	`+ assert_eq!(usage.cached_content_token_count, Some(35));`
	`789`	`+ assert_eq!(usage.cache_creation_token_count, None);`
	`790`	`+ }`
`773`	`791`	`}`
Original file line number	Diff line number	Diff line change
`@@ -62,6 +62,7 @@ impl From<ResponsesUsage> for UnifiedTokenUsage {`
`62`	`62`	`cached_content_token_count: usage`
`63`	`63`	`.input_tokens_details`
`64`	`64`	`.map(\|details\| details.cached_tokens),`
	`65`	`+ cache_creation_token_count: None,`
`65`	`66`	`}`
`66`	`67`	`}`
`67`	`68`	`}`