Skip to content

Commit 3c4a653

Browse files
committed
Addressing digantdesai's feedback
1 parent 9ee3a11 commit 3c4a653

2 files changed

Lines changed: 15 additions & 72 deletions

File tree

backends/cuda/tests/test_triton_sdpa.py

Lines changed: 7 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,8 @@
77
"""Comprehensive tests for the Triton SDPA kernel.
88
99
Tests MHA, GQA, MQA with various head dims, sequence lengths, causal/non-causal,
10-
bool masks, and the pack_gqa optimization. Reference outputs are computed using
11-
torch SDPA with expanded KV heads (for GQA/MQA) in float32 for numerical
12-
stability.
10+
and bool masks. Reference outputs are computed using torch SDPA with expanded KV
11+
heads (for GQA/MQA) in float32 for numerical stability.
1312
1413
Test parametrization adapted from FlashAttention (tests/cute/test_flash_attn.py).
1514
"""
@@ -31,11 +30,7 @@ def _skip_if_no_cuda():
3130
def _import_sdpa():
3231
from executorch.backends.cuda.triton.kernels.sdpa import sdpa
3332

34-
try:
35-
from executorch.backends.cuda.triton.kernels.sdpa import _should_pack_gqa
36-
except ImportError:
37-
_should_pack_gqa = None
38-
return sdpa, _should_pack_gqa
33+
return sdpa
3934

4035

4136
def _reference_sdpa(q, k, v, attn_mask=None, is_causal=False, scale=None):
@@ -112,11 +107,7 @@ class TestTritonSdpa(unittest.TestCase):
112107
@classmethod
113108
def setUpClass(cls):
114109
_skip_if_no_cuda()
115-
cls.sdpa, cls.should_pack_gqa = _import_sdpa()
116-
117-
@staticmethod
118-
def _should_pack_gqa(L_q, num_groups, block_m):
119-
return TestTritonSdpa.should_pack_gqa(L_q, num_groups, block_m)
110+
cls.sdpa = _import_sdpa()
120111

121112
# ------------------------------------------------------------------
122113
# MHA tests (no GQA, backwards compatibility)
@@ -220,12 +211,7 @@ def test_mha_non_pow2_causal(self):
220211
# ------------------------------------------------------------------
221212

222213
def test_gqa_decode(self):
223-
"""GQA decode (seqlen_q=1): exercises pack_gqa path.
224-
225-
This is the critical test for the pack_gqa optimization. With
226-
seqlen_q=1, the heuristic should choose pack_gqa, folding all
227-
Q heads into a single tile.
228-
"""
214+
"""GQA decode (seqlen_q=1)."""
229215
for (H_q, H_kv, label), D, Lk in itertools.product(
230216
GQA_CONFIGS, [64, 128, 256], [64, 128, 512]
231217
):
@@ -238,13 +224,6 @@ def test_gqa_decode(self):
238224
k = torch.randn(B, H_kv, Lk, D, dtype=torch.bfloat16, device="cuda")
239225
v = torch.randn(B, H_kv, Lk, D, dtype=torch.bfloat16, device="cuda")
240226

241-
# Verify heuristic chooses pack_gqa for decode
242-
num_groups = H_q // H_kv
243-
self.assertTrue(
244-
self._should_pack_gqa(Lq, num_groups, 64),
245-
"Heuristic should choose pack_gqa for decode",
246-
)
247-
248227
out = self.sdpa(q, k, v, enable_gqa=True)
249228
ref = _reference_sdpa(q, k, v)
250229

@@ -277,49 +256,35 @@ def test_gqa_decode_with_mask(self):
277256
self.assertLess(_max_abs_error(out, ref), 0.05)
278257

279258
def test_gqa_short_seqlen(self):
280-
"""GQA with short seqlen_q (2-8): pack_gqa should still activate."""
259+
"""GQA with short seqlen_q (2-8)."""
281260
for Lq in [2, 4, 8]:
282261
for H_q, H_kv, label in [(8, 2, "gqa_4x"), (16, 2, "gqa_8x")]:
283262
with self.subTest(label=label, Lq=Lq):
284263
B, Lk, D = 1, 256, 128
285-
num_groups = H_q // H_kv
286264
torch.manual_seed(42)
287265
q = torch.randn(B, H_q, Lq, D, dtype=torch.bfloat16, device="cuda")
288266
k = torch.randn(B, H_kv, Lk, D, dtype=torch.bfloat16, device="cuda")
289267
v = torch.randn(B, H_kv, Lk, D, dtype=torch.bfloat16, device="cuda")
290268

291-
# Verify heuristic activates pack_gqa
292-
self.assertTrue(
293-
self._should_pack_gqa(Lq, num_groups, 64),
294-
f"Should pack for Lq={Lq}, groups={num_groups}",
295-
)
296-
297269
out = self.sdpa(q, k, v, enable_gqa=True)
298270
ref = _reference_sdpa(q, k, v)
299271

300272
self.assertFalse(torch.isnan(out).any())
301273
self.assertLess(_max_abs_error(out, ref), 0.05)
302274

303275
def test_gqa_prefill(self):
304-
"""GQA prefill (long seqlen_q): should NOT use pack_gqa."""
276+
"""GQA prefill (long seqlen_q)."""
305277
for (H_q, H_kv, label), L in itertools.product(
306278
[(8, 2, "gqa_4x"), (16, 2, "gqa_8x"), (6, 1, "mqa")],
307279
[64, 128, 256],
308280
):
309281
with self.subTest(label=label, L=L):
310282
B, D = 1, 128
311-
num_groups = H_q // H_kv
312283
torch.manual_seed(42)
313284
q = torch.randn(B, H_q, L, D, dtype=torch.bfloat16, device="cuda")
314285
k = torch.randn(B, H_kv, L, D, dtype=torch.bfloat16, device="cuda")
315286
v = torch.randn(B, H_kv, L, D, dtype=torch.bfloat16, device="cuda")
316287

317-
# Verify heuristic does NOT pack for long seqlen
318-
self.assertFalse(
319-
self._should_pack_gqa(L, num_groups, 64),
320-
f"Should NOT pack for L={L}",
321-
)
322-
323288
out = self.sdpa(q, k, v, is_causal=True, enable_gqa=True)
324289
ref = _reference_sdpa(q, k, v, is_causal=True)
325290

@@ -431,30 +396,6 @@ def test_qwen35_moe_config(self):
431396
_max_abs_error(out, ref), 0.05, f"Qwen config Lq={Lq} Lk={Lk}"
432397
)
433398

434-
# ------------------------------------------------------------------
435-
# Pack GQA heuristic tests
436-
# ------------------------------------------------------------------
437-
438-
def test_pack_gqa_heuristic(self):
439-
"""Verify _should_pack_gqa matches expected behavior."""
440-
# MHA: never pack
441-
self.assertFalse(self._should_pack_gqa(1, 1, 64))
442-
self.assertFalse(self._should_pack_gqa(128, 1, 64))
443-
444-
# GQA decode (seqlen=1): always pack
445-
self.assertTrue(self._should_pack_gqa(1, 8, 64))
446-
self.assertTrue(self._should_pack_gqa(1, 4, 64))
447-
self.assertTrue(self._should_pack_gqa(1, 2, 64))
448-
449-
# GQA short seqlen: pack when utilization improves
450-
self.assertTrue(self._should_pack_gqa(4, 8, 64))
451-
self.assertTrue(self._should_pack_gqa(8, 8, 64))
452-
453-
# GQA long seqlen: don't pack (tiles already full)
454-
self.assertFalse(self._should_pack_gqa(64, 8, 64))
455-
self.assertFalse(self._should_pack_gqa(128, 4, 64))
456-
self.assertFalse(self._should_pack_gqa(256, 2, 64))
457-
458399
# ------------------------------------------------------------------
459400
# Edge cases and validation
460401
# ------------------------------------------------------------------

backends/cuda/triton/kernels/sdpa.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -240,22 +240,24 @@ def _sdpa_fwd_kernel_non_pow2(
240240
NEG_INF: tl.constexpr = float("-inf")
241241

242242
for start_n in tl.range(0, LK, BLOCK_N, num_stages=2):
243-
kn = start_n + tl.arange(0, BLOCK_N)
244-
kv_col_mask = kn < LK
243+
offs_n = start_n + tl.arange(0, BLOCK_N)
244+
kv_col_mask = offs_n < LK
245245

246-
k_ptrs = k_base + (kn[:, None] * stride_kl + offs_d[None, :] * stride_kd)
246+
k_ptrs = k_base + (offs_n[:, None] * stride_kl + offs_d[None, :] * stride_kd)
247247
k = tl.load(k_ptrs, mask=kv_col_mask[:, None] & d_mask[None, :], other=0.0)
248248

249249
qk = tl.dot(q, tl.trans(k))
250250
qk = (qk * qk_scale_log2).to(tl.float32)
251251

252252
if IS_CAUSAL:
253-
causal_mask = kn[None, :] > seq_pos[:, None]
253+
causal_mask = offs_n[None, :] > seq_pos[:, None]
254254
qk = tl.where(causal_mask, tl.full(qk.shape, NEG_INF, dtype=tl.float32), qk)
255255

256256
if HAS_MASK:
257257
m_ptrs = (
258-
mask_b_base + seq_pos[:, None] * stride_mlq + kn[None, :] * stride_mlk
258+
mask_b_base
259+
+ seq_pos[:, None] * stride_mlq
260+
+ offs_n[None, :] * stride_mlk
259261
)
260262
tile_valid = row_valid[:, None] & kv_col_mask[None, :]
261263
keep = tl.load(m_ptrs, mask=tile_valid, other=False)
@@ -276,7 +278,7 @@ def _sdpa_fwd_kernel_non_pow2(
276278

277279
acc = (acc * alpha[:, None]).to(tl.float32)
278280

279-
v_ptrs = v_base + (kn[:, None] * stride_vl + offs_d[None, :] * stride_vd)
281+
v_ptrs = v_base + (offs_n[:, None] * stride_vl + offs_d[None, :] * stride_vd)
280282
v = tl.load(v_ptrs, mask=kv_col_mask[:, None] & d_mask[None, :], other=0.0)
281283

282284
acc = tl.dot(p.to(v.dtype), v, acc).to(tl.float32)

0 commit comments

Comments (0)