From eb2180f8aeb2802402b09c29f902b7eff40a88fd Mon Sep 17 00:00:00 2001 From: Simba Zhang Date: Thu, 16 Apr 2026 11:09:47 -0700 Subject: [PATCH] feat: integrate dynamic speculative decoding profiling mode and Qwen 3.6 35B matrix --- README.md | 23 +++++++++++++++++++ .../profiling_results_simbas-MacBook-Pro.md | 19 +++++++++++++-- run_benchmark.sh | 2 +- scripts/profiling/profile_runner.py | 16 +++++++++++++ 4 files changed, 57 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 9bf4bacd..377965d9 100644 --- a/README.md +++ b/README.md @@ -51,6 +51,29 @@ Then start the server (models download automatically if not cached): *(Add `--stream-experts` when running oversized MoE models to bypass macOS virtual memory swapping and stream expert layers directly from NVMe SSD.)* +## 📊 Performance: Qwen3.6-35B on Apple Silicon + +Benchmark results for `Qwen3.6-35B-A3B-4bit` (35B Dense, 4-bit) on M5 Pro 64 GB, evaluating extreme context limits. + +### Headline Numbers + +| Configuration | 512 ctx | 40K ctx | 100K ctx | +|---|---|---|---| +| **Dense/Vanilla** | 32.1 tok/s · 33.6 GB | 24.0 tok/s · 64.2 GB | 18.6 tok/s · 63.9 GB | +| **SSD Stream** | 15.0 tok/s · **18.8 GB** | 5.1 tok/s · 51.7 GB | 4.1 tok/s · 63.9 GB | +| **TurboQuant** | 33.1 tok/s · 33.3 GB | 2.5 tok/s · 37.0 GB | 4.7 tok/s · 42.0 GB | +| **TurboQuant + SpecDecode (0.8B)**| 30.5 tok/s · 34.2 GB | 7.4 tok/s · 38.1 GB | 4.5 tok/s · 43.0 GB | +| **SSD + TurboQuant** | 14.5 tok/s · 19.3 GB | 5.4 tok/s · **23.2 GB** | 3.9 tok/s · **28.3 GB** | + +> Values shown as `generation speed · GPU memory allocated` + +**Key takeaways:** +- 📄 **40K context on 24 GB Mac**: SSD + TurboQuant effortlessly fits a whopping 35B model inside just **23.2 GB** of memory footprint. +- 🚀 **Speculative Decoding Rescue**: While TurboQuant compresses memory significantly, long contexts natively suffer from KV cache lookup latency (2.5 tok/s at 40K on TurboQuant alone). Firing up a tiny 0.8B Speculative Draft model reclaims operation speeds by 3× (**7.4 tok/s**) with nearly non-existent memory overhead. +- 📚 **100K context Memory Cliff**: Running a 35B model natively at 100K context locks the entire M5 Pro hardware memory limit (**63.9 GB** memory requested/swapped). Combining SSD weight streaming and KV TurboQuant drops this by ~60% down to a highly responsive **28.3 GB**. + +--- + ## 📊 Performance: Gemma 4-26B on Apple Silicon Benchmark results for `gemma-4-26b-a4b-it-4bit` (26B MoE, 4-bit) on M5 Pro 64 GB. diff --git a/docs/profiling/profiling_results_simbas-MacBook-Pro.md b/docs/profiling/profiling_results_simbas-MacBook-Pro.md index fe843469..cf574e13 100644 --- a/docs/profiling/profiling_results_simbas-MacBook-Pro.md +++ b/docs/profiling/profiling_results_simbas-MacBook-Pro.md @@ -1,9 +1,24 @@ -### `mlx-community/gemma-4-26b-a4b-it-4bit` — Context & Memory Profile +### `mlx-community/Qwen3.6-35B-A3B-4bit` — Context & Memory Profile -Context depths tested: 512 +Context depths tested: 512,40000,100000 | Configuration | Context Size | TTFT | Generation Speed | Model Size | Active RAM (Physical) | GPU Memory Allocated | |---|---|---|---|---|---|---| +| Dense/Vanilla | 512 | 4.01s | 32.10 tok/s | N/A | 18.9 GB | 33.6 GB | +| Dense/Vanilla | 40000 | 26.41s | 23.99 tok/s | N/A | 49.4 GB | 64.2 GB | +| Dense/Vanilla | 100000 | 151.76s | 18.64 tok/s | N/A | 49.3 GB | 63.9 GB | +| SSD Stream | 512 | 1.81s | 15.01 tok/s | N/A | 4.5 GB | 18.8 GB | +| SSD Stream | 40000 | 28.89s | 5.13 tok/s | N/A | 37.4 GB | 51.7 GB | +| SSD Stream | 100000 | 100.72s | 4.08 tok/s | N/A | 49.4 GB | 63.9 GB | +| TurboQuant | 512 | 0.44s | 33.14 tok/s | N/A | 18.9 GB | 33.3 GB | +| TurboQuant | 40000 | 20.90s | 2.54 tok/s | N/A | 22.7 GB | 37.0 GB | +| TurboQuant | 100000 | 60.30s | 4.73 tok/s | N/A | 27.7 GB | 42.0 GB | +| SSD + TurboQuant | 512 | 1.64s | 14.51 tok/s | N/A | 4.5 GB | 19.3 GB | +| SSD + TurboQuant | 40000 | 27.56s | 5.39 tok/s | N/A | 8.5 GB | 23.2 GB | +| SSD + TurboQuant | 100000 | 75.59s | 3.86 tok/s | N/A | 13.6 GB | 28.3 GB | +| SSD + 16-Worker Prefetch | 512 | 0.94s | 16.70 tok/s | N/A | 4.5 GB | 19.4 GB | +| SSD + 16-Worker Prefetch | 40000 | 28.88s | 5.17 tok/s | N/A | 37.4 GB | 51.9 GB | +| SSD + 16-Worker Prefetch | 100000 | 101.96s | 3.79 tok/s | N/A | 49.4 GB | 63.9 GB | > **Active RAM (Physical)**: Real memory wired into RAM by macOS (capped by device RAM). > **GPU Memory Allocated**: Total memory requested by the GPU — includes data swapped to SSD. This shows the TRUE memory demand and reveals TurboQuant compression benefits even when Active RAM is saturated. diff --git a/run_benchmark.sh b/run_benchmark.sh index b7c6bc54..fea39048 100755 --- a/run_benchmark.sh +++ b/run_benchmark.sh @@ -221,9 +221,9 @@ else "mlx-community/gemma-4-31b-it-8bit" "mlx-community/gemma-4-e4b-it-8bit" "mlx-community/gemma-4-26b-a4b-it-4bit" - "mlx-community/gemma-4-26b-a4b-it-4bit" "mlx-community/Qwen2.5-7B-Instruct-4bit" "mlx-community/Qwen2.5-14B-Instruct-4bit" + "mlx-community/Qwen3.6-35B-A3B-4bit" "mlx-community/phi-4-mlx-4bit" "baa-ai/GLM-5.1-RAM-270GB-MLX" "baa-ai/GLM-5.1-4bit" diff --git a/scripts/profiling/profile_runner.py b/scripts/profiling/profile_runner.py index 3aee6a66..0a729126 100755 --- a/scripts/profiling/profile_runner.py +++ b/scripts/profiling/profile_runner.py @@ -17,6 +17,16 @@ {"name": "SSD + 16-Worker Prefetch", "flags": ["--stream-experts", "--ssd-prefetch"]} ] +def get_draft_model(base_model: str) -> str: + m = base_model.lower() + if "qwen3" in m or "qwen2.5" in m or "qwen2" in m: + return "mlx-community/Qwen3.5-0.8B-MLX-4bit" + if "gemma" in m: + return "mlx-community/gemma-3-1b-it-4bit" + if "phi" in m: + return "mlx-community/phi-3-mini-4k-instruct-4bit" + return "" + SWIFTLM_PATH = ".build/arm64-apple-macosx/release/SwiftLM" def get_physical_ram_gb(): @@ -249,6 +259,12 @@ def main(): if args.ssd_only: CONFIGS = [c for c in CONFIGS if "--stream-experts" in c["flags"]] + # Speculative Decoding Mode Auto-Injection + draft_model = get_draft_model(args.model) + if draft_model: + draft_name = draft_model.split("/")[-1].split("-")[1] # extract roughly '0.8B' or '1b' + CONFIGS.append({"name": f"TurboQuant + Speculative ({draft_name})", "flags": ["--turbo-kv", "--draft-model", draft_model]}) + # SwiftLM handles model downloading natively via HubApi. # Just pass the model ID directly — prepend mlx-community/ if no org is specified. model_id = args.model if "/" in args.model else f"mlx-community/{args.model}"