Commit 225396a

Merge branch 'main' of github.com:evilsocket/cake

2 parents 7f2d640 + 316a0de

7 files changed: 64 additions & 48 deletions

CLAUDE.md

Lines changed: 16 additions & 0 deletions

````diff
@@ -24,6 +24,22 @@ cargo build --release --features metal
 cargo build --release --features vulkan
 ```
 
+## Acceleration Features
+
+| Feature | Platform | Backend | Best For | Notes |
+|---------|----------|---------|----------|-------|
+| `metal` | macOS (Apple Silicon) | GPU via MPS + custom MSL kernels | Primary inference on Mac | Fastest option on Apple Silicon (~42 tok/s on M3 Pro) |
+| `cuda` | Linux (NVIDIA GPU) | GPU via cuBLAS/cuDNN | Primary inference on Linux | Requires CUDA toolkit matching driver version |
+| `accelerate` | macOS | CPU via Apple Accelerate (AMX) | CPU-only F32 inference on Mac | 2.7x faster than pure-Rust for F32 matmul; no F16 support |
+| `vulkan` | Any (Vulkan 1.3+) | GPU via Vulkan compute shaders | Steam Deck, AMD GPUs | Portable but less optimized than Metal/CUDA |
+| (none) | Any | CPU via pure-Rust `gemm` | Portable CPU fallback | F16 weights stay F16, avoids bandwidth doubling |
+
+**When to use which:**
+- **Apple Silicon (stevie.local):** Use `--features metal`. Metal is 1.6x faster than CPU F16 (42 vs 26 tok/s). The `accelerate` feature doesn't help with Metal and doesn't support F16 matmul, so CPU F16 (default, no features) is actually faster than `accelerate` with F32 (26 vs 23 tok/s).
+- **NVIDIA GPU (blade/bahamut):** Use `--features cuda`. Add `flash-attn` for flash attention support.
+- **CPU-only with F32 models:** Use `--features accelerate` on macOS for 2.7x faster F32 matmul. On Linux, consider linking against MKL or OpenBLAS.
+- **CPU-only with F16 models:** Use no features — pure-Rust `gemm` with F16 avoids the 2x memory bandwidth penalty of converting to F32.
+
 ## Interactive Chat
 
 ```bash
````
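These feature flags map directly onto candle's device backends (see the cake-core/Cargo.toml diff below). As a minimal sketch of how such flags typically gate device selection at runtime, assuming candle_core; `pick_device` and its fallback order are illustrative, not cake's actual code:

```rust
// Hypothetical sketch of feature-gated device selection with candle_core.
// The function name and fallback order are assumptions, not cake's API.
use candle_core::Device;

fn pick_device() -> Device {
    // Compiled in only with `--features metal`.
    #[cfg(feature = "metal")]
    if let Ok(dev) = Device::new_metal(0) {
        return dev;
    }
    // Compiled in only with `--features cuda`.
    #[cfg(feature = "cuda")]
    if let Ok(dev) = Device::new_cuda(0) {
        return dev;
    }
    // CPU fallback: pure-Rust `gemm`, or Apple Accelerate when that
    // feature is enabled (the dtype caveats above still apply).
    Device::Cpu
}
```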

README.md

Lines changed: 5 additions & 4 deletions

````diff
@@ -32,10 +32,11 @@ Cake is a **multimodal AI inference server** written in Rust that can run models
 ### Build
 
 ```sh
-cargo build --release --features cuda    # Linux (NVIDIA)
-cargo build --release --features metal   # macOS (Apple Silicon)
-cargo build --release --features vulkan  # Linux (AMD/Intel/Steam Deck)
-cargo build --release                    # CPU only
+cargo build --release --features cuda        # Linux (NVIDIA)
+cargo build --release --features metal       # macOS (Apple Silicon GPU)
+cargo build --release --features accelerate  # macOS (Apple Silicon CPU, F32 models)
+cargo build --release --features vulkan      # Linux (AMD/Intel/Steam Deck)
+cargo build --release                        # CPU only (portable)
 ```
 
 ### Models
````

cake-core/Cargo.toml

Lines changed: 1 addition & 0 deletions

````diff
@@ -70,6 +70,7 @@ base64 = "0.22.1"
 default = ["master", "llama", "qwen2", "qwen3_5", "qwen3", "qwen3_moe", "qwen3_5_moe", "phi4", "mistral", "gemma3", "falcon3", "olmo2", "exaone4", "flux", "vibevoice", "luxtts"]
 
 metal = ["candle-core/metal", "candle-nn/metal", "candle-transformers/metal", "dep:candle-metal-kernels", "dep:objc2-metal"]
+accelerate = ["candle-core/accelerate", "candle-nn/accelerate", "candle-transformers/accelerate"]
 cuda = ["candle-core/cuda", "candle-nn/cuda", "candle-transformers/cuda", "dep:bindgen_cuda"]
 flash-attn = ["cuda", "dep:candle-flash-attn"]
 vulkan = ["dep:ash", "dep:gpu-allocator", "dep:bytemuck"]
````

cake-core/src/backends/metal/ops.msl

Lines changed: 1 addition & 0 deletions

````diff
@@ -838,3 +838,4 @@ kernel void fused_vector_attention_f32(
 
     output[bh * head_dim + d] = acc * (1.0f / sum_exp);
 }
+
````
cake-core/src/models/common/text_model.rs

Lines changed: 2 additions & 11 deletions

````diff
@@ -333,17 +333,10 @@ impl TextModelBase {
 
         let head_start = std::time::Instant::now();
 
-        // Final norm + lm_head in F32 for logit precision — F16 through 24 layers
-        // accumulates small errors that get amplified across 248k vocab entries,
-        // shifting the sampling distribution enough to cause wrong-language output.
-        let x_f32 = x.to_dtype(candle_core::DType::F32)
-            .map_err(|e| anyhow!("error in ln_f x to_f32: {e}"))?;
-        let w_f32 = self.ln_f_weight.to_dtype(candle_core::DType::F32)
-            .map_err(|e| anyhow!("error in ln_f w to_f32: {e}"))?;
         let x = self
             .ctx
             .backend
-            .rms_norm(&x_f32, &w_f32, self.ln_f_eps)
+            .rms_norm(&x, &self.ln_f_weight, self.ln_f_eps)
             .map_err(|e| anyhow!("error in ln_f.forward: {e}"))?;
 
         let x = x
@@ -352,12 +345,10 @@ impl TextModelBase {
             .contiguous()
             .map_err(|e| anyhow!("error in x.i.contiguous: {e}"))?;
 
-        let lm_w_f32 = self.lm_head_weight.to_dtype(candle_core::DType::F32)
-            .map_err(|e| anyhow!("error in lm_head w to_f32: {e}"))?;
         let logits = self
             .ctx
             .backend
-            .linear_forward(&x, &lm_w_f32, None)
+            .linear_forward(&x, &self.lm_head_weight, None)
             .map_err(|e| anyhow!("error in lm_head.forward: {e}"))?;
         // Note: no explicit sync needed here — the CPU-side logits sampling
         // (to_vec1 in LogitsProcessor) implicitly synchronizes the Metal command buffer.
````
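The closing comment leans on a candle behavior worth spelling out: copying a tensor back to the host blocks until all pending device work finishes. A minimal sketch of that pattern; `greedy_sample` is illustrative and not cake's `LogitsProcessor`:

```rust
// Illustrative sketch (not cake's LogitsProcessor): reading logits back to
// the host with to_vec1 forces candle to flush and wait on the Metal
// command buffer, so no explicit backend synchronize() call is needed.
use candle_core::{Result, Tensor};

fn greedy_sample(logits: &Tensor) -> Result<u32> {
    // Device-to-host copy; all queued GPU work completes before this returns.
    // Assumes rank-1 F32 logits, as candle's LogitsProcessor works with.
    let v: Vec<f32> = logits.to_vec1()?;
    let mut best = 0usize;
    for (i, &x) in v.iter().enumerate() {
        if x > v[best] {
            best = i;
        }
    }
    Ok(best as u32)
}
```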

cake-core/src/models/qwen3_5/full_attention.rs

Lines changed: 30 additions & 27 deletions

````diff
@@ -138,9 +138,11 @@ impl Qwen3_5FullAttention {
         let qkv = self.backend.linear_forward(x, &self.qkv_proj_weight, None)
             .map_err(|e| anyhow!("qkv_proj: {e}"))?;
 
-        // Flush GPU commands after QKV matmul (always needed — full attention
-        // accumulates ~24 commands between syncs, can't afford more)
-        let _ = self.backend.synchronize();
+        // Flush GPU commands after QKV matmul — needed for prefill where many
+        // operations follow. Generation (seq_len=1) uses fused SDPA with few commands.
+        if seq_len > 1 {
+            let _ = self.backend.synchronize();
+        }
 
         // Split: Q (doubled for gating), K, V
         let q_out = qkv.narrow(D::Minus1, 0, self.q_size)
@@ -206,45 +208,46 @@ impl Qwen3_5FullAttention {
                 ).map_err(|e| anyhow!("flash_attn: {e}"))?;
             }
 
-            // Metal: mixed-precision attention — F16 matmuls + F32 softmax.
-            // F16 SDPA causes garbage, F32 SDPA exceeds threadgroup memory.
+            // Metal path: fused SDPA for generation, mixed-precision for prefill.
             #[cfg(feature = "metal")]
             if matches!(q.device(), candle_core::Device::Metal(_)) {
+                // Generation (seq_len=1): fused kernel — single dispatch with native
+                // GQA (no repeat_kv), online softmax, no attention matrix materialization.
+                // Replaces 4+ separate dispatches (repeat_kv + 2 matmuls + softmax + dtype casts).
+                if seq_len == 1 {
+                    let scale = 1.0 / (self.head_dim as f32).sqrt();
+                    break 'attn self.backend.sdpa(&q, &k, &v, None, false, scale)
+                        .map_err(|e| anyhow!("sdpa: {e}"))?;
+                }
+
+                // Prefill (seq_len > 1): F16 matmuls + F32 softmax (F16 SDPA causes
+                // garbage, F32 SDPA exceeds threadgroup memory).
                 let k = self.repeat_kv(k).map_err(|e| anyhow!("repeat_kv k: {e}"))?;
                 let v = self.repeat_kv(v).map_err(|e| anyhow!("repeat_kv v: {e}"))?;
                 let att = (q.matmul(&k.t()?)? / (self.head_dim as f64).sqrt())?;
                 let att = att.to_dtype(candle_core::DType::F32)?;
-                let att = if seq_len == 1 {
-                    att
-                } else {
-                    let tril = Tensor::tril2(seq_len, candle_core::DType::F32, att.device())
-                        .map_err(|e| anyhow!("tril: {e}"))?;
-                    let mask = ((tril - 1.0)? * 1e9)?;
-                    let mask = mask.broadcast_as(att.shape())
-                        .map_err(|e| anyhow!("mask broadcast: {e}"))?;
-                    (att + mask).map_err(|e| anyhow!("mask add: {e}"))?
-                };
+                let tril = Tensor::tril2(seq_len, candle_core::DType::F32, att.device())
+                    .map_err(|e| anyhow!("tril: {e}"))?;
+                let mask = ((tril - 1.0)? * 1e9)?;
+                let mask = mask.broadcast_as(att.shape())
+                    .map_err(|e| anyhow!("mask broadcast: {e}"))?;
+                let att = (att + mask).map_err(|e| anyhow!("mask add: {e}"))?;
                 let att = self.backend.softmax(&att, att.rank() - 1)?;
                 let att = att.to_dtype(v.dtype())?;
                 break 'attn att.matmul(&v.contiguous()?)
                     .map_err(|e| anyhow!("att matmul v: {e}"))?;
             }
 
-            // Manual attention with GQA head expansion (CPU fallback)
+            // CPU: manual attention with GQA head expansion
            let k = self.repeat_kv(k).map_err(|e| anyhow!("repeat_kv k: {e}"))?;
            let v = self.repeat_kv(v).map_err(|e| anyhow!("repeat_kv v: {e}"))?;
-
            let att = (q.matmul(&k.t()?)? / (self.head_dim as f64).sqrt())?;
-            let att = if seq_len == 1 {
-                att
-            } else {
-                let mask = cache.mask(seq_len, att.device())
-                    .map_err(|e| anyhow!("mask: {e}"))?
-                    .broadcast_as(att.shape())
-                    .map_err(|e| anyhow!("mask broadcast: {e}"))?;
-                masked_fill(&att, &mask, f32::NEG_INFINITY)
-                    .map_err(|e| anyhow!("masked_fill: {e}"))?
-            };
+            let mask = cache.mask(seq_len, att.device())
+                .map_err(|e| anyhow!("mask: {e}"))?
+                .broadcast_as(att.shape())
+                .map_err(|e| anyhow!("mask broadcast: {e}"))?;
+            let att = masked_fill(&att, &mask, f32::NEG_INFINITY)
+                .map_err(|e| anyhow!("masked_fill: {e}"))?;
            let att = self.backend.softmax(&att, att.rank() - 1)?;
            att.matmul(&v.contiguous()?)?
        };
````
docs/install.md

Lines changed: 9 additions & 6 deletions

````diff
@@ -98,15 +98,18 @@ make mobile_ios
 
 By default, inference runs on CPU. Enable GPU acceleration with:
 
-| Feature | Backend | Platforms |
-|---------|---------|-----------|
-| `cuda` | NVIDIA CUDA (PTX kernels + flash-attn) | Linux, Windows |
-| `metal` | Apple Metal (MSL shaders + fused SDPA) | macOS, iOS |
-| `vulkan` | Vulkan via wgpu | Linux, Windows, Steam Deck |
-| `flash-attn` | Flash Attention 2 (implies `cuda`) | Linux, Windows |
+| Feature | Backend | Platforms | Notes |
+|---------|---------|-----------|-------|
+| `cuda` | NVIDIA CUDA (PTX kernels + flash-attn) | Linux, Windows | Best for NVIDIA GPUs |
+| `metal` | Apple Metal (MSL shaders + fused SDPA) | macOS, iOS | Best for Apple Silicon (~42 tok/s on M3 Pro with 0.8B model) |
+| `accelerate` | Apple Accelerate (AMX hardware) | macOS | CPU-only; 2.7x faster F32 matmul via Apple BLAS. No F16 support — use `metal` for F16 models |
+| `vulkan` | Vulkan via wgpu | Linux, Windows, Steam Deck | Portable GPU backend |
+| `flash-attn` | Flash Attention 2 (implies `cuda`) | Linux, Windows | Fused attention kernel for long sequences |
 
 Multiple backends can be compiled together — the runtime auto-selects based on available hardware.
 
+**Apple Silicon guidance:** Use `metal` for best performance. The `accelerate` feature only helps CPU inference with F32 models — for F16 models (default), CPU without `accelerate` is actually faster (26 vs 23 tok/s) because F16 halves memory bandwidth vs the F32 conversion Accelerate requires.
+
 ### Model Features
 
 By default, all text model architectures are compiled in. To build only for specific models:
````
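The bandwidth argument in that guidance checks out on paper: a memory-bound decoder streams roughly every weight once per token, so per-token traffic scales with bytes per weight. A back-of-envelope sketch; the ~0.8B parameter count comes from the table above, and treating decode as purely bandwidth-bound is an assumption:

```rust
// Back-of-envelope sketch: per-token weight traffic for a memory-bound
// decoder is roughly params * bytes_per_weight. The ~0.8B figure comes
// from the benchmark above; everything else here is an assumption.
fn weight_traffic_gb(params: u64, bytes_per_weight: u64) -> f64 {
    (params * bytes_per_weight) as f64 / 1e9
}

fn main() {
    let params = 800_000_000u64;
    println!("F16: {:.1} GB/token", weight_traffic_gb(params, 2)); // ~1.6 GB
    println!("F32: {:.1} GB/token", weight_traffic_gb(params, 4)); // ~3.2 GB
}
```

Doubling traffic would halve throughput if compute were free; Accelerate's 2.7x faster F32 matmul claws most of that back, which is consistent with the modest 26 vs 23 tok/s gap reported above.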
