
Commit c04420f

evilsocket and claude committed
fix: correct residual RMS norm for Qwen3.5 — fixes garbage output
The root cause of garbage output (on both CPU and Metal) was incorrect handling of residual RMS norm weights. Qwen3.5 uses (1 + weight) * rms_norm(x) for all norm layers, meaning stored weights are deltas from 0 that need +1.0 added at load time.

Two bugs:

1. load_rms_norm_weight had an auto-detection heuristic (threshold 0.5) that incorrectly skipped the +1.0 for weights that had drifted above 0.5 during training (e.g., linear_attn.norm, later layer input norms). Removed the heuristic — always add 1.0 when residual_rms_norm is set.

2. RmsNormGated in linear_attention.rs loaded its weight directly without adding +1.0, unlike all other norms that go through load_rms_norm_weight.

Also reverted unnecessary .contiguous() calls added during debugging.

Verified against the HuggingFace reference implementation (transformers 5.3.0), which confirms: forward = output * (1.0 + self.weight.float()).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
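To make bug 1 concrete, here is a small standalone sketch (plain Rust, invented values, not code from this repo) of what the old heuristic did to a weight whose residual deltas drifted above 0.5: the effective scale should be (1 + w), but skipping the +1.0 scales the normalized activations by w alone.

// Hypothetical standalone sketch of the failure mode; all values are made up.
fn rms_norm(x: &[f32], eps: f32) -> Vec<f32> {
    let mean_sq = x.iter().map(|v| v * v).sum::<f32>() / x.len() as f32;
    let inv_rms = 1.0 / (mean_sq + eps).sqrt();
    x.iter().map(|v| v * inv_rms).collect()
}

fn main() {
    let x = [0.5f32, -1.0, 2.0, 0.25];
    // Stored residual deltas that drifted above the old 0.5 threshold during training.
    let w = [0.60f32, 0.55, 0.62, 0.58];
    let normed = rms_norm(&x, 1e-6);

    // Correct: effective scale is (1 + w), i.e. the +1.0 folded in at load time.
    let correct: Vec<f32> = normed.iter().zip(&w).map(|(n, wi)| (1.0 + wi) * n).collect();
    // Old heuristic: mean(w) >= 0.5, so the +1.0 was skipped and the
    // activations come out roughly 2.5-3x too small.
    let buggy: Vec<f32> = normed.iter().zip(&w).map(|(n, wi)| wi * n).collect();

    println!("correct: {correct:?}");
    println!("buggy:   {buggy:?}");
}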
1 parent cc511ce commit c04420f

2 files changed

Lines changed: 6 additions & 16 deletions


cake-core/src/models/common/config.rs

Lines changed: 3 additions & 14 deletions
@@ -159,20 +159,9 @@ pub fn load_rms_norm_weight(
 ) -> candle_core::Result<candle_core::Tensor> {
     let weight = vb.get(size, "weight")?;
     if residual {
-        // Auto-detect: some quantized model variants (e.g., MLX-quantized) already apply
-        // the (1+w) transformation during quantization, storing the final weight directly.
-        // Detect by checking if the mean is closer to 0 (residual) or 1 (already transformed).
-        let mean: f32 = weight.to_dtype(candle_core::DType::F32)?
-            .mean_all()?
-            .to_scalar()?;
-        if mean.abs() < 0.5 {
-            // Weights near 0: residual pattern, add 1.0
-            Ok((weight + 1.0)?)
-        } else {
-            // Weights near 1: already transformed (e.g., MLX quantized), use as-is
-            log::debug!("rms_norm weight mean={mean:.3}: skipping residual +1.0");
-            Ok(weight)
-        }
+        // Residual RMS norm: forward = (1 + weight) * rms_norm(x).
+        // Weights are stored as deltas from 0, so add 1.0 at load time.
+        Ok((weight + 1.0)?)
     } else {
         Ok(weight)
     }
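A minimal candle sketch of the convention the fixed loader implements (hypothetical values and helper function, not this repo's code): the +1.0 is folded into the weight once at load, and the forward pass is then a plain weighted RMS norm, equivalent to (1 + w) * rms_norm(x).

use candle_core::{Device, Result, Tensor, D};

// Plain RMS norm over the last dimension: x / sqrt(mean(x^2) + eps).
fn rms_norm(x: &Tensor, eps: f64) -> Result<Tensor> {
    let mean_sq = x.sqr()?.mean_keepdim(D::Minus1)?;
    x.broadcast_div(&(mean_sq + eps)?.sqrt()?)
}

fn main() -> Result<()> {
    let dev = Device::Cpu;
    // Hypothetical stored weight: small deltas around 0, per the residual convention.
    let stored = Tensor::new(&[0.01f32, -0.02, 0.03, 0.00], &dev)?;
    // Fold the +1.0 in once at load time, as the patched loader now always does.
    let weight = (stored + 1.0)?;

    let x = Tensor::new(&[[0.5f32, -1.0, 2.0, 0.25]], &dev)?;
    // Forward is then just rms_norm(x) * weight.
    let y = rms_norm(&x, 1e-6)?.broadcast_mul(&weight)?;
    println!("{y}");
    Ok(())
}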

cake-core/src/models/qwen3_5/linear_attention.rs

Lines changed: 3 additions & 2 deletions
@@ -26,8 +26,9 @@ struct RmsNormGated {
 
 impl RmsNormGated {
     fn load(size: usize, eps: f64, vb: VarBuilder, backend: Arc<dyn ComputeBackend>) -> Result<Self> {
-        // Store weight as F32 to match the recurrent step's F32 output.
-        let weight = vb.get(size, "weight")?.to_dtype(DType::F32)?;
+        // Residual RMS norm: forward = (1 + weight) * rms_norm(x) * silu(z).
+        // Store as F32 to match the recurrent step's F32 output.
+        let weight = (vb.get(size, "weight")?.to_dtype(DType::F32)? + 1.0)?;
         Ok(Self { weight, eps: eps as f32, backend })
     }
 
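RmsNormGated's actual forward pass goes through the project's ComputeBackend and is not shown in this diff. As a hedged sketch of the convention named in the new comment: with the +1.0 folded in at load, the gated output reduces to rms_norm(x) * weight * silu(z). A simplified, self-contained candle version (hypothetical, assuming x and z share a shape; not the real implementation):

use candle_core::{Result, Tensor, D};

// Hypothetical, simplified gated residual RMS norm. Assumes the +1.0 is already
// folded into `weight` at load time, so the forward is rms_norm(x) * weight * silu(z).
fn rms_norm_gated(x: &Tensor, z: &Tensor, weight: &Tensor, eps: f64) -> Result<Tensor> {
    // Plain RMS norm over the last dimension.
    let mean_sq = x.sqr()?.mean_keepdim(D::Minus1)?;
    let normed = x.broadcast_div(&(mean_sq + eps)?.sqrt()?)?;
    // silu(z) = z * sigmoid(z), built from primitive ops.
    let sigmoid = (z.neg()?.exp()? + 1.0)?.recip()?;
    let silu_z = (z * sigmoid)?;
    normed.broadcast_mul(weight)?.mul(&silu_z)
}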