
Commit e43437f

Enable GQA on voxtral_realtime

1 parent 4d908bf

5 files changed: 188 additions & 24 deletions


.claude/scheduled_tasks.lock

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+{"sessionId":"2321a402-406a-4717-8d8b-5d17b8c5210a","pid":921768,"acquiredAt":1774525472191}
backends/cuda/tests/bench_sdpa_gqa.py

Lines changed: 141 additions & 0 deletions
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Benchmark: Pack GQA SDPA vs repeat_interleave + MHA SDPA.
+
+Compares two approaches for GQA attention on consumer GPUs:
+1. repeat_interleave: expand K/V to H_q heads, then call SDPA with H_q==H_kv
+2. pack_gqa: call SDPA with enable_gqa=True (kernel handles head mapping)
+
+Usage:
+    LD_LIBRARY_PATH=/home/mnachin/local/miniconda3/envs/executorch/lib:$LD_LIBRARY_PATH \
+    python3 backends/cuda/tests/bench_sdpa_gqa.py
+"""
+
+import sys
+import os
+import time
+
+import torch
+import torch.nn.functional as F
+
+# Import Triton SDPA
+kernels_dir = os.path.join(os.path.dirname(__file__), "..", "triton", "kernels")
+sys.path.insert(0, os.path.abspath(kernels_dir))
+from sdpa import sdpa
+
+
+def _benchmark_fn(fn, warmup=10, repeats=100):
+    """Benchmark a function, return median time in microseconds."""
+    # Warmup
+    for _ in range(warmup):
+        fn()
+    torch.cuda.synchronize()
+
+    times = []
+    for _ in range(repeats):
+        torch.cuda.synchronize()
+        start = time.perf_counter()
+        fn()
+        torch.cuda.synchronize()
+        end = time.perf_counter()
+        times.append((end - start) * 1e6)  # microseconds
+
+    times.sort()
+    return times[len(times) // 2]  # median
+
+
+def bench_config(B, H_q, H_kv, L_q, L_kv, D, has_mask=False):
+    """Benchmark one configuration, return (repeat_interleave_us, pack_gqa_us)."""
+    num_groups = H_q // H_kv
+
+    torch.manual_seed(42)
+    q = torch.randn(B, H_q, L_q, D, dtype=torch.bfloat16, device="cuda")
+    k = torch.randn(B, H_kv, L_kv, D, dtype=torch.bfloat16, device="cuda")
+    v = torch.randn(B, H_kv, L_kv, D, dtype=torch.bfloat16, device="cuda")
+
+    if has_mask:
+        mask = torch.ones(B, 1, L_q, L_kv, dtype=torch.bool, device="cuda")
+    else:
+        mask = None
+
+    # Approach 1: repeat_interleave + MHA SDPA
+    def fn_repeat():
+        k_exp = k.repeat_interleave(num_groups, dim=1)
+        v_exp = v.repeat_interleave(num_groups, dim=1)
+        if mask is not None:
+            mask_exp = mask.expand(B, H_q, L_q, L_kv)
+            return sdpa(q, k_exp, v_exp, attn_mask=mask_exp)
+        return sdpa(q, k_exp, v_exp)
+
+    # Approach 2: pack GQA SDPA
+    def fn_pack_gqa():
+        return sdpa(q, k, v, attn_mask=mask, enable_gqa=True)
+
+    t_repeat = _benchmark_fn(fn_repeat)
+    t_pack = _benchmark_fn(fn_pack_gqa)
+
+    return t_repeat, t_pack
+
+
+def main():
+    if not torch.cuda.is_available():
+        print("CUDA not available")
+        return
+
+    gpu_name = torch.cuda.get_device_name(0)
+    print(f"GPU: {gpu_name}")
+    print()
+
+    configs = [
+        # Decode configs (L_q=1) — pack GQA should dominate
+        {"B": 1, "H_q": 16, "H_kv": 2, "L_q": 1, "L_kv": 128, "D": 256, "label": "Qwen3.5 decode, ctx=128"},
+        {"B": 1, "H_q": 16, "H_kv": 2, "L_q": 1, "L_kv": 512, "D": 256, "label": "Qwen3.5 decode, ctx=512"},
+        {"B": 1, "H_q": 16, "H_kv": 2, "L_q": 1, "L_kv": 1024, "D": 256, "label": "Qwen3.5 decode, ctx=1024"},
+        {"B": 1, "H_q": 16, "H_kv": 2, "L_q": 1, "L_kv": 2048, "D": 256, "label": "Qwen3.5 decode, ctx=2048"},
+        {"B": 1, "H_q": 16, "H_kv": 2, "L_q": 1, "L_kv": 4096, "D": 256, "label": "Qwen3.5 decode, ctx=4096"},
+
+        # Decode with mask
+        {"B": 1, "H_q": 16, "H_kv": 2, "L_q": 1, "L_kv": 1024, "D": 256, "label": "Qwen3.5 decode+mask, ctx=1024", "has_mask": True},
+
+        # Decode with different GQA ratios
+        {"B": 1, "H_q": 32, "H_kv": 8, "L_q": 1, "L_kv": 2048, "D": 128, "label": "Llama-style 4:1 decode, ctx=2048"},
+        {"B": 1, "H_q": 8, "H_kv": 1, "L_q": 1, "L_kv": 2048, "D": 128, "label": "MQA 8:1 decode, ctx=2048"},
+
+        # Short seqlen (pack GQA should help)
+        {"B": 1, "H_q": 16, "H_kv": 2, "L_q": 4, "L_kv": 1024, "D": 256, "label": "Qwen3.5 short L_q=4, ctx=1024"},
+        {"B": 1, "H_q": 16, "H_kv": 2, "L_q": 8, "L_kv": 1024, "D": 256, "label": "Qwen3.5 short L_q=8, ctx=1024"},
+
+        # Prefill configs (L_q=L_kv) — repeat_interleave should be comparable
+        {"B": 1, "H_q": 16, "H_kv": 2, "L_q": 128, "L_kv": 128, "D": 256, "label": "Qwen3.5 prefill, L=128"},
+        {"B": 1, "H_q": 16, "H_kv": 2, "L_q": 512, "L_kv": 512, "D": 256, "label": "Qwen3.5 prefill, L=512"},
+        {"B": 1, "H_q": 16, "H_kv": 2, "L_q": 1024, "L_kv": 1024, "D": 256, "label": "Qwen3.5 prefill, L=1024"},
+
+        # Batch > 1
+        {"B": 4, "H_q": 16, "H_kv": 2, "L_q": 1, "L_kv": 1024, "D": 256, "label": "Qwen3.5 B=4 decode, ctx=1024"},
+    ]
+
+    header = f"{'Config':<45} {'repeat_interleave':>18} {'pack_gqa':>12} {'Speedup':>10}"
+    print(header)
+    print("-" * len(header))
+
+    for cfg in configs:
+        label = cfg.pop("label")
+        has_mask = cfg.pop("has_mask", False)
+        t_repeat, t_pack = bench_config(**cfg, has_mask=has_mask)
+        speedup = t_repeat / t_pack
+        print(
+            f"{label:<45} {t_repeat:>14.1f} us {t_pack:>8.1f} us {speedup:>9.2f}x"
+        )
+
+    print()
+    print("Speedup > 1.0 means pack_gqa is faster.")
+    print("Speedup < 1.0 means repeat_interleave is faster.")
+
+
+if __name__ == "__main__":
+    main()
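
For context, the same comparison can be written against PyTorch's stock F.scaled_dot_product_attention, which has accepted an enable_gqa flag since PyTorch 2.5. A minimal sketch (CPU, float32; the shapes are illustrative, not taken from the benchmark):

    import torch
    import torch.nn.functional as F

    B, H_q, H_kv, L, D = 1, 16, 2, 128, 64
    q = torch.randn(B, H_q, L, D)
    k = torch.randn(B, H_kv, L, D)
    v = torch.randn(B, H_kv, L, D)

    # Approach 1: materialize expanded K/V (extra memory traffic).
    k_exp = k.repeat_interleave(H_q // H_kv, dim=1)
    v_exp = v.repeat_interleave(H_q // H_kv, dim=1)
    out_repeat = F.scaled_dot_product_attention(q, k_exp, v_exp)

    # Approach 2: let the kernel map query heads onto KV heads.
    out_gqa = F.scaled_dot_product_attention(q, k, v, enable_gqa=True)

    torch.testing.assert_close(out_repeat, out_gqa)

Both compute the same attention; the benchmark above measures whether skipping the K/V expansion pays off in the Triton kernel.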

backends/cuda/tests/test_triton_sdpa.py

Lines changed: 33 additions & 11 deletions
@@ -68,12 +68,6 @@ def _reference_sdpa(q, k, v, attn_mask=None, is_causal=False, scale=None):
     )
 
 
-def _max_relative_error(out, ref):
-    """Max absolute error normalized by reference magnitude."""
-    diff = (out.float() - ref.float()).abs()
-    return (diff / ref.float().abs().clamp(min=1e-6)).max().item()
-
-
 def _max_abs_error(out, ref):
     return (out.float() - ref.float()).abs().max().item()
 
@@ -102,11 +96,6 @@ def _max_abs_error(out, ref):
     (256, 256),
 ]
 
-SEQLEN_PAIRS_LONG = [
-    (512, 512),
-    (1024, 1024),
-]
-
 # GQA configurations: (H_q, H_kv, label)
 GQA_CONFIGS = [
     (4, 4, "mha"),  # MHA: 1:1
@@ -533,6 +522,16 @@ def test_gqa_validation_errors(self):
         with self.assertRaises(RuntimeError):
             self.sdpa(q, k, v, enable_gqa=False)
 
+    def test_per_head_mask_rejected(self):
+        """Per-head masks (H>1) should be rejected since the kernel broadcasts."""
+        B, H, Lq, Lk, D = 1, 4, 4, 64, 64
+        q = torch.randn(B, H, Lq, D, dtype=torch.bfloat16, device="cuda")
+        k = torch.randn(B, H, Lk, D, dtype=torch.bfloat16, device="cuda")
+        v = torch.randn(B, H, Lk, D, dtype=torch.bfloat16, device="cuda")
+        mask = torch.ones(B, H, Lq, Lk, dtype=torch.bool, device="cuda")
+        with self.assertRaises(RuntimeError):
+            self.sdpa(q, k, v, attn_mask=mask)
+
     def test_gqa_all_masked_decode(self):
         """GQA decode with all-masked block should not NaN."""
         B, H_q, H_kv, Lq, Lk, D = 1, 8, 2, 1, 128, 64
@@ -547,6 +546,29 @@ def test_gqa_all_masked_decode(self):
         self.assertFalse(torch.isnan(out).any())
         self.assertFalse(torch.isinf(out).any())
 
+    def test_causal_lq_ne_lkv_rejected(self):
+        """is_causal=True with L_q != L_kv should raise RuntimeError."""
+        B, H, D = 1, 4, 64
+        q = torch.randn(B, H, 1, D, dtype=torch.bfloat16, device="cuda")
+        k = torch.randn(B, H, 128, D, dtype=torch.bfloat16, device="cuda")
+        v = torch.randn(B, H, 128, D, dtype=torch.bfloat16, device="cuda")
+        with self.assertRaises(RuntimeError):
+            self.sdpa(q, k, v, is_causal=True)
+
+    def test_non_pow2_no_mask(self):
+        """Non-pow2 head dim without mask should work (mask_ptr=0 path)."""
+        B, H, Lq, Lk, D = 1, 4, 4, 64, 40  # D=40 is not pow2
+        torch.manual_seed(42)
+        q = torch.randn(B, H, Lq, D, dtype=torch.bfloat16, device="cuda")
+        k = torch.randn(B, H, Lk, D, dtype=torch.bfloat16, device="cuda")
+        v = torch.randn(B, H, Lk, D, dtype=torch.bfloat16, device="cuda")
+
+        out = self.sdpa(q, k, v)
+        ref = _reference_sdpa(q, k, v)
+
+        self.assertFalse(torch.isnan(out).any())
+        self.assertLess(_max_abs_error(out, ref), 0.05)
+
 
 if __name__ == "__main__":
     unittest.main()
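
The new mask test pins down the kernel's contract: boolean masks must broadcast over heads. A small sketch of the accepted versus rejected shapes (assuming the same `sdpa` import path as the benchmark above; requires a CUDA device):

    import torch
    from sdpa import sdpa  # backends/cuda/triton/kernels/sdpa.py

    B, H, Lq, Lk, D = 1, 4, 4, 64, 64
    q = torch.randn(B, H, Lq, D, dtype=torch.bfloat16, device="cuda")
    k = torch.randn(B, H, Lk, D, dtype=torch.bfloat16, device="cuda")
    v = torch.randn(B, H, Lk, D, dtype=torch.bfloat16, device="cuda")

    # Accepted: a head dim of 1 broadcasts across all H query heads.
    mask_ok = torch.ones(B, 1, Lq, Lk, dtype=torch.bool, device="cuda")
    out = sdpa(q, k, v, attn_mask=mask_ok)

    # Rejected: a true per-head mask raises RuntimeError (see test above).
    mask_bad = torch.ones(B, H, Lq, Lk, dtype=torch.bool, device="cuda")
    # sdpa(q, k, v, attn_mask=mask_bad)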

backends/cuda/triton/kernels/sdpa.py

Lines changed: 7 additions & 4 deletions
@@ -851,7 +851,7 @@ def _launch_non_pow2_kernel(
         stride_mlq_np2 = attn_mask.stride(2)
         stride_mlk_np2 = attn_mask.stride(3)
     else:
-        mask_ptr = torch.empty((1,), device=query.device, dtype=torch.bool)
+        mask_ptr = 0
         stride_mb_np2 = stride_mh_np2 = stride_mlq_np2 = stride_mlk_np2 = 0
 
     def grid_non_pow2(meta):
@@ -941,12 +941,15 @@ def sdpa(
 
     if is_causal and L_q != L_kv:
         raise RuntimeError(
-            f"Causal masking requires L_q == L_kv; got L_q={L_q}, L_kv={L_kv}."
+            f"Causal masking requires L_q == L_kv; got L_q={L_q}, L_kv={L_kv}. "
+            "For decode (L_q < L_kv), use an explicit bool mask instead."
         )
 
     # Decide whether to pack GQA based on tile utilization heuristic.
-    # Use 64 as the reference BLOCK_M for the heuristic (the common case).
-    pack_gqa = _should_pack_gqa(L_q, num_groups, 64)
+    # Mirror the kernel selection logic: M32 when CTAs are sparse, M64 otherwise.
+    total_ctas_m64 = ((L_q * num_groups + 63) // 64) * (B * H_kv)
+    block_m = 32 if total_ctas_m64 < 4 * 84 else 64
+    pack_gqa = _should_pack_gqa(L_q, num_groups, block_m)
 
     out = torch.empty((B, H_q, L_q, D), device=query.device, dtype=query.dtype)
     sm_scale = 1.0 / math.sqrt(D) if scale == 0.0 else scale
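
To make the new heuristic concrete, here is the arithmetic pulled out as a standalone sketch. The 4 * 84 threshold is taken from the diff; reading 84 as an approximate SM count is an assumption, not something the diff states:

    # Sketch of the BLOCK_M selection above: count how many CTAs a
    # BLOCK_M=64 launch would produce, and drop to BLOCK_M=32 when the
    # grid is too sparse to fill the GPU.
    def pick_block_m(B, H_kv, L_q, num_groups):
        total_ctas_m64 = ((L_q * num_groups + 63) // 64) * (B * H_kv)
        return 32 if total_ctas_m64 < 4 * 84 else 64

    # Qwen3.5-style decode: B=1, H_kv=2, L_q=1, num_groups=8.
    # ceil(1 * 8 / 64) * (1 * 2) = 2 CTAs, far below 4 * 84 = 336,
    # so the smaller tile wins and GQA packing sees BLOCK_M=32.
    print(pick_block_m(1, 2, 1, 8))  # -> 32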

examples/models/voxtral_realtime/model.py

Lines changed: 6 additions & 9 deletions
@@ -514,9 +514,10 @@ def forward(
 class StandardSDPA(nn.Module):
     """Scaled dot-product attention using F.scaled_dot_product_attention.
 
-    Supports GQA via repeat_interleave when n_heads != n_kv_heads.
-    Expects Q in [B, S, H, D]; K/V in [B, H, S, D] by default
-    (set transpose_kv=True if K/V arrive in [B, S, H, D]).
+    Supports GQA via enable_gqa=True — the kernel maps Q heads to KV heads
+    internally, avoiding redundant K/V memory expansion.
+    Expects Q in [B, S, H_q, D]; K/V in [B, H_kv, S, D] by default
+    (set transpose_kv=True if K/V arrive in [B, S, H_kv, D]).
     """
 
     def __init__(
@@ -525,7 +526,7 @@ def __init__(
         super().__init__()
         self.n_heads = n_heads
         self.n_kv_heads = n_kv_heads
-        self.n_rep = n_heads // n_kv_heads
+        self.enable_gqa = n_heads != n_kv_heads
         self.head_dim = head_dim
         self.dim = n_heads * head_dim
         self.transpose_kv = transpose_kv
@@ -545,15 +546,11 @@ def forward(
             k = k.transpose(1, 2)
             v = v.transpose(1, 2)
 
-        if self.n_rep > 1:
-            k = k.repeat_interleave(self.n_rep, dim=1)
-            v = v.repeat_interleave(self.n_rep, dim=1)
-
         if attn_mask is None:
             attn_mask = _build_causal_mask_bool(input_pos, k.shape[2], q.device)
 
         y = F.scaled_dot_product_attention(
-            q, k, v, attn_mask=attn_mask, is_causal=False
+            q, k, v, attn_mask=attn_mask, is_causal=False, enable_gqa=self.enable_gqa
         )
 
         y = y.transpose(1, 2).contiguous()
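
The net effect of the change is that forward no longer materializes expanded K/V. A hypothetical standalone version of the new path (module plumbing and _build_causal_mask_bool omitted; requires PyTorch 2.5+ for enable_gqa):

    import torch
    import torch.nn.functional as F

    def standard_sdpa(q, k, v, enable_gqa=True):
        # q: [B, S, H_q, D]; k/v: [B, H_kv, S, D], H_q a multiple of H_kv.
        q = q.transpose(1, 2)  # -> [B, H_q, S, D]
        y = F.scaled_dot_product_attention(
            q, k, v, is_causal=False, enable_gqa=enable_gqa
        )
        return y.transpose(1, 2).contiguous()  # -> [B, S, H_q, D]

    B, S, H_q, H_kv, D = 1, 8, 16, 2, 64
    out = standard_sdpa(
        torch.randn(B, S, H_q, D),
        torch.randn(B, H_kv, S, D),
        torch.randn(B, H_kv, S, D),
    )
    print(out.shape)  # torch.Size([1, 8, 16, 64])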
