File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -306,15 +306,20 @@ impl DiskExpertProvider {
306306 } ,
307307 } ;
308308
309- // Pre-warm: dequantize all experts at construction (moves cost from first token to loading)
309+ // Pre-warm: dequantize all experts in parallel (moves cost from first token to loading)
310310 if provider. cache . is_some ( ) {
311311 log:: info!( "pre-warming expert cache for {} experts..." , num_experts) ;
312- for i in 0 ..num_experts {
313- if let Ok ( ew) = provider. get_expert_uncached ( i) {
314- if let Some ( ref cache) = provider. cache {
315- if let Ok ( mut entries) = cache. entries . write ( ) {
316- entries. insert ( i, ew) ;
317- }
312+ use rayon:: prelude:: * ;
313+ let results: Vec < ( usize , ExpertWeights ) > = ( 0 ..num_experts)
314+ . into_par_iter ( )
315+ . filter_map ( |i| {
316+ provider. get_expert_uncached ( i) . ok ( ) . map ( |ew| ( i, ew) )
317+ } )
318+ . collect ( ) ;
319+ if let Some ( ref cache) = provider. cache {
320+ if let Ok ( mut entries) = cache. entries . write ( ) {
321+ for ( i, ew) in results {
322+ entries. insert ( i, ew) ;
318323 }
319324 }
320325 }
Original file line number Diff line number Diff line change @@ -171,24 +171,17 @@ pub fn dequantize_packed_4bit(
171171 let cols = packed_cols * 8 ;
172172 let ( _, groups) = scales. dims2 ( ) ?;
173173
174- let pw: Vec < u32 > = packed
175- . to_dtype ( DType :: U32 ) ?
176- . flatten_all ( ) ?
177- . to_vec1 :: < u32 > ( ) ?;
178- let sc: Vec < f32 > = scales
179- . to_dtype ( DType :: F32 ) ?
180- . flatten_all ( ) ?
181- . to_vec1 :: < f32 > ( ) ?;
182- let bi: Vec < f32 > = biases
183- . to_dtype ( DType :: F32 ) ?
184- . flatten_all ( ) ?
185- . to_vec1 :: < f32 > ( ) ?;
174+ // Extract raw data — avoid Tensor intermediates for the hot path
175+ let pw: Vec < u32 > = packed. flatten_all ( ) ?. to_vec1 :: < u32 > ( ) ?;
176+ let sc: Vec < f32 > = scales. to_dtype ( DType :: F32 ) ?. flatten_all ( ) ?. to_vec1 :: < f32 > ( ) ?;
177+ let bi: Vec < f32 > = biases. to_dtype ( DType :: F32 ) ?. flatten_all ( ) ?. to_vec1 :: < f32 > ( ) ?;
186178
187179 use rayon:: prelude:: * ;
188- let weight: Vec < f32 > = ( 0 ..rows)
189- . into_par_iter ( )
190- . flat_map ( |i| {
191- let mut row = vec ! [ 0f32 ; cols] ;
180+ let mut weight = vec ! [ 0f32 ; rows * cols] ;
181+ weight
182+ . par_chunks_mut ( cols)
183+ . enumerate ( )
184+ . for_each ( |( i, row) | {
192185 for pc in 0 ..packed_cols {
193186 let packed_val = pw[ i * packed_cols + pc] ;
194187 for bit in 0 ..8u32 {
@@ -200,9 +193,7 @@ pub fn dequantize_packed_4bit(
200193 row[ j] = w4 * scale + bias;
201194 }
202195 }
203- row
204- } )
205- . collect ( ) ;
196+ } ) ;
206197
207198 Tensor :: from_vec ( weight, ( rows, cols) , & Device :: Cpu )
208199}
You can’t perform that action at this time.
0 commit comments