
Commit f1787d9

evilsocket and claude committed
perf: revert F32 lm_head — F16 is correct after norm fixes
The F32 lm_head was added to compensate for logit-distribution errors caused by the GDN gated-norm bug (+1.0 applied to non-residual weights). Now that the norm is fixed, the F16 lm_head produces correct output at all temperatures.

Removing F32 saves:
- 1 GB of memory (no cached F32 weight)
- ~6 ms/token of memory-bandwidth time (reads 508 MB of weight data per token instead of 1 GB)

Benchmark (M3 Pro, Qwen3.5-0.8B, 50 tokens):
- F32 lm_head (cached): 36.7 tok/s, +1 GB memory
- F16 lm_head: 42.4 tok/s, no extra memory (+15.5%)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
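For context on the gated-norm bug the first paragraph refers to, here is a minimal sketch, not cake-core's actual norm code: the RMSNorm-style layer and the residual_weight flag are illustrative assumptions. Some checkpoints store norm weights to be used directly as w; others store w - 1 and expect the layer to apply 1.0 + w. Adding the +1.0 offset to a weight of the first kind rescales every normalized activation, producing the logit skew that the F32 lm_head had been masking:

// Illustrative sketch only (assumed API, not cake-core's real norm code).
// Applying the +1.0 residual offset to a weight that is NOT stored in
// (w - 1) form rescales every activation, skewing downstream logits.
fn rms_norm(x: &[f32], w: &[f32], eps: f32, residual_weight: bool) -> Vec<f32> {
    let mean_sq = x.iter().map(|v| v * v).sum::<f32>() / x.len() as f32;
    let inv_rms = 1.0 / (mean_sq + eps).sqrt();
    x.iter()
        .zip(w)
        .map(|(v, w)| {
            // The bug: treating a direct weight as if it were residual.
            let w = if residual_weight { 1.0 + *w } else { *w };
            v * inv_rms * w
        })
        .collect()
}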
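The 508 MB and ~6 ms/token figures can be sanity-checked with simple arithmetic. The sketch below assumes a ~248k-entry vocab and a 1024-wide hidden state (both inferred from the 508 MB figure, not read from the model config) plus ~150 GB/s of effective M3 Pro memory bandwidth (a rough assumption):

fn main() {
    // Assumed model shape (inferred from the 508 MB figure, not from config).
    let vocab: u64 = 248_000;
    let hidden: u64 = 1_024;

    let f16_bytes = vocab * hidden * 2; // ~508 MB read per token (F16 lm_head)
    let f32_bytes = vocab * hidden * 4; // ~1 GB read per token (cached F32 weight)

    // At an assumed ~150 GB/s effective bandwidth, the extra F32 traffic alone
    // costs ~3.4 ms/token; the rest of the ~6 ms/token the commit reports
    // plausibly comes from the per-token x.to_dtype(F32) copy it also removes.
    let extra_ms = (f32_bytes - f16_bytes) as f64 / 150e9 * 1e3;
    println!("F16 weight: {} MB", f16_bytes / 1_000_000);
    println!("F32 weight: {} MB", f32_bytes / 1_000_000);
    println!("extra bandwidth cost: {:.1} ms/token", extra_ms);
}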
1 parent: 24f1f1a · commit: f1787d9

1 file changed: cake-core/src/models/common/text_model.rs (1 addition, 12 deletions)
@@ -137,8 +137,6 @@ pub struct TextModelBase {
     pub ln_f_weight: Tensor,
     pub ln_f_eps: f32,
     pub lm_head_weight: Tensor,
-    /// Cached F32 lm_head weight for logit precision (avoids 1 GB allocation per token).
-    pub lm_head_weight_f32: Tensor,

     pub logits_processor: LogitsProcessor,

@@ -247,9 +245,6 @@ impl TextModelBase {

         let generated = 0;

-        // Pre-cache F32 lm_head weight to avoid 1 GB allocation+copy per token.
-        let lm_head_weight_f32 = lm_head_weight.to_dtype(candle_core::DType::F32)?;
-
         Ok(Self {
             tokenizer,
             tokens,
@@ -263,7 +258,6 @@ impl TextModelBase {
             ln_f_weight,
             ln_f_eps,
             lm_head_weight,
-            lm_head_weight_f32,
             logits_processor,
         })
     }
@@ -351,15 +345,10 @@ impl TextModelBase {
             .contiguous()
             .map_err(|e| anyhow!("error in x.i.contiguous: {e}"))?;

-        // lm_head in F32 for logit precision — F16 matmul over 248k vocab
-        // amplifies accumulated errors, shifting the sampling distribution.
-        // Uses pre-cached F32 weight to avoid 1 GB allocation per token.
-        let x_f32 = x.to_dtype(candle_core::DType::F32)
-            .map_err(|e| anyhow!("error in lm_head x to_f32: {e}"))?;
         let logits = self
             .ctx
             .backend
-            .linear_forward(&x_f32, &self.lm_head_weight_f32, None)
+            .linear_forward(&x, &self.lm_head_weight, None)
             .map_err(|e| anyhow!("error in lm_head.forward: {e}"))?;
         // Note: no explicit sync needed here — the CPU-side logits sampling
         // (to_vec1 in LogitsProcessor) implicitly synchronizes the Metal command buffer.
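The trailing comment also explains why no extra precision step is needed after the F16 matmul: sampling happens on the CPU, where the logits are read back as F32. A minimal sketch of that pattern (assuming a candle-style sampler; cake-core's actual LogitsProcessor internals are not shown in this diff):

use candle_core::{DType, Result, Tensor};

// Sketch of the CPU-side sampling step (an assumption, not cake-core's
// real LogitsProcessor): reading ~248k logits back to the host as F32
// both upcasts them and implicitly waits for the Metal command buffer.
fn sample_greedy(logits: &Tensor) -> Result<u32> {
    let logits: Vec<f32> = logits.to_dtype(DType::F32)?.to_vec1()?;
    let next = logits
        .iter()
        .enumerate()
        .max_by(|a, b| a.1.total_cmp(b.1))
        .map(|(i, _)| i as u32)
        .unwrap_or(0);
    Ok(next)
}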
