
Commit 874db56

flash-moe: parallel expert warmup + dequant optimization — 8× faster loading
- Parallelize expert pre-warming with rayon (256 experts concurrently per layer)
- Optimize dequantize_packed_4bit: par_chunks_mut instead of flat_map (eliminates per-row Vec allocation)
- Remove redundant to_dtype(U32) for already-U32 tensors

Loading: ~34s (was ~4.5 min). Inference: 2.10 tok/s.
1 parent 7367dc9 commit 874db56
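
The second bullet in the commit message is the heart of the gptq change. As a reading aid, here is a minimal, self-contained sketch (toy dimensions, not the repository's code) contrasting the two rayon patterns and showing where the per-row allocation disappears:

    use rayon::prelude::*;

    fn main() {
        let (rows, cols) = (4usize, 8usize); // toy sizes for illustration

        // Before: each row closure allocates its own Vec and flat_map
        // concatenates them into the output buffer.
        let a: Vec<f32> = (0..rows)
            .into_par_iter()
            .flat_map(|i| {
                let mut row = vec![0f32; cols];
                for (j, v) in row.iter_mut().enumerate() {
                    *v = (i * cols + j) as f32;
                }
                row // one heap allocation per row
            })
            .collect();

        // After: allocate the output once; each worker fills its own
        // disjoint chunk in place, with no per-row Vec.
        let mut b = vec![0f32; rows * cols];
        b.par_chunks_mut(cols).enumerate().for_each(|(i, row)| {
            for (j, v) in row.iter_mut().enumerate() {
                *v = (i * cols + j) as f32;
            }
        });

        assert_eq!(a, b); // both produce the same row-major matrix
    }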

2 files changed: 22 additions & 26 deletions


cake-core/src/models/common/disk_expert_provider.rs

Lines changed: 12 additions & 7 deletions
@@ -306,15 +306,20 @@ impl DiskExpertProvider {
             },
         };
 
-        // Pre-warm: dequantize all experts at construction (moves cost from first token to loading)
+        // Pre-warm: dequantize all experts in parallel (moves cost from first token to loading)
         if provider.cache.is_some() {
             log::info!("pre-warming expert cache for {} experts...", num_experts);
-            for i in 0..num_experts {
-                if let Ok(ew) = provider.get_expert_uncached(i) {
-                    if let Some(ref cache) = provider.cache {
-                        if let Ok(mut entries) = cache.entries.write() {
-                            entries.insert(i, ew);
-                        }
+            use rayon::prelude::*;
+            let results: Vec<(usize, ExpertWeights)> = (0..num_experts)
+                .into_par_iter()
+                .filter_map(|i| {
+                    provider.get_expert_uncached(i).ok().map(|ew| (i, ew))
+                })
+                .collect();
+            if let Some(ref cache) = provider.cache {
+                if let Ok(mut entries) = cache.entries.write() {
+                    for (i, ew) in results {
+                        entries.insert(i, ew);
                     }
                 }
             }
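
A note on the shape of this change: the parallel phase only computes, and the cache's write lock is taken once after the collect rather than inside the loop. Below is a stripped-down sketch of that pattern, with a hypothetical RwLock<HashMap> cache standing in for the provider's cache type:

    use rayon::prelude::*;
    use std::collections::HashMap;
    use std::sync::RwLock;

    fn main() {
        let num_experts = 16usize; // toy count; the commit message cites 256 per layer
        let cache: RwLock<HashMap<usize, Vec<f32>>> = RwLock::new(HashMap::new());

        // Phase 1: do the expensive work (dequantization in the real code)
        // in parallel, without touching the lock.
        let results: Vec<(usize, Vec<f32>)> = (0..num_experts)
            .into_par_iter()
            .map(|i| (i, vec![i as f32; 4])) // stand-in for get_expert_uncached(i)
            .collect();

        // Phase 2: take the write lock once and insert everything.
        if let Ok(mut entries) = cache.write() {
            for (i, ew) in results {
                entries.insert(i, ew);
            }
        }

        assert_eq!(cache.read().unwrap().len(), num_experts);
    }

Keeping the lock out of the rayon closure avoids serializing the workers on the write guard during the parallel region.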

cake-core/src/utils/gptq.rs

Lines changed: 10 additions & 19 deletions
@@ -171,24 +171,17 @@ pub fn dequantize_packed_4bit(
     let cols = packed_cols * 8;
     let (_, groups) = scales.dims2()?;
 
-    let pw: Vec<u32> = packed
-        .to_dtype(DType::U32)?
-        .flatten_all()?
-        .to_vec1::<u32>()?;
-    let sc: Vec<f32> = scales
-        .to_dtype(DType::F32)?
-        .flatten_all()?
-        .to_vec1::<f32>()?;
-    let bi: Vec<f32> = biases
-        .to_dtype(DType::F32)?
-        .flatten_all()?
-        .to_vec1::<f32>()?;
+    // Extract raw data — avoid Tensor intermediates for the hot path
+    let pw: Vec<u32> = packed.flatten_all()?.to_vec1::<u32>()?;
+    let sc: Vec<f32> = scales.to_dtype(DType::F32)?.flatten_all()?.to_vec1::<f32>()?;
+    let bi: Vec<f32> = biases.to_dtype(DType::F32)?.flatten_all()?.to_vec1::<f32>()?;
 
     use rayon::prelude::*;
-    let weight: Vec<f32> = (0..rows)
-        .into_par_iter()
-        .flat_map(|i| {
-            let mut row = vec![0f32; cols];
+    let mut weight = vec![0f32; rows * cols];
+    weight
+        .par_chunks_mut(cols)
+        .enumerate()
+        .for_each(|(i, row)| {
             for pc in 0..packed_cols {
                 let packed_val = pw[i * packed_cols + pc];
                 for bit in 0..8u32 {
@@ -200,9 +193,7 @@ pub fn dequantize_packed_4bit(
                     row[j] = w4 * scale + bias;
                 }
             }
-            row
-        })
-        .collect();
+        });
 
     Tensor::from_vec(weight, (rows, cols), &Device::Cpu)
 }
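
The hunks above elide the nibble-extraction body of the inner loop. As a hypothetical stand-alone illustration (assuming low-nibble-first packing, which the diff does not show), one packed u32 expands to eight values via the same `w4 * scale + bias` formula visible in the context lines:

    // Hypothetical illustration, not the repository's code: unpack eight 4-bit
    // values from one u32 and dequantize each with a per-group scale and bias.
    fn dequant_one_word(packed_val: u32, scale: f32, bias: f32) -> [f32; 8] {
        let mut out = [0f32; 8];
        for bit in 0..8u32 {
            // Assumption: nibbles are stored lowest bits first within the word.
            let w4 = ((packed_val >> (bit * 4)) & 0xF) as f32;
            out[bit as usize] = w4 * scale + bias;
        }
        out
    }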
