hw-native-sys · zhangstevenunity · May 22, 2026 · May 22, 2026 · gemini-code-assist · May 22, 2026
diff --git a/test/npu_validation/scripts/generate_testcase.py b/test/npu_validation/scripts/generate_testcase.py
@@ -86,7 +86,25 @@
     "xor",
 })
 
-CASE_INT_SCALAR_DEFAULTS = {}
+DEEPSEEK_V4_DIRECT_CASES = frozenset({  # testcase names that all receive the same scalar defaults below
+    "attention_csa_test_refresh_incore_81",
+    "attention_hca_test_incore_54",
+    "attention_swa_test_incore_40",
+    "decode_csa_test_incore_81",
+    "decode_hca_test_incore_54",
+    "decode_swa_test_incore_40",
+    "sparse_attn_test_incore_7",
+})
+
+CASE_INT_SCALAR_DEFAULTS = {  # integer scalar overrides, keyed by testcase name, then by parameter name
+    testcase: {
+        "v4": 0,  # NOTE(review): presumably the SPMD block-index argument of the kernel - confirm against the kernel signature
+        "v5": 32,  # NOTE(review): presumably the SPMD block-count argument of the kernel - confirm
+    }
+    for testcase in DEEPSEEK_V4_DIRECT_CASES
+}
+
+CASE_BOOL_SCALAR_DEFAULTS = {}  # bool scalar overrides (testcase -> parameter name -> value), read by _bool_scalar_default_value; no entries yet
 
 CASE_POINTER_COUNT_MINIMUMS = {
     "down_proj_residual": {
@@ -97,6 +115,14 @@
         "v1": 123648,
         "v2": 123648,
     },
+    **{
+        testcase: {
+            "v1": 1024 * 4096,
+            "v2": 8192 * 64,
+            "v3": 8192 * 64,
+        }
+        for testcase in DEEPSEEK_V4_DIRECT_CASES
+    },
 }
 
 
@@ -833,6 +859,13 @@ def _integer_scalar_default_value(testcase: str, name: str, host_type: str) -> O
     return None
 
 
+def _bool_scalar_default_value(testcase: str, name: str) -> Optional[bool]:  # look up the bool override for parameter `name` of `testcase`; None means no override
+    override = CASE_BOOL_SCALAR_DEFAULTS.get(testcase, {}).get(name)  # an unknown testcase and an unknown parameter name both yield None
+    if override is None:
+        return None
+    return bool(override)  # coerce whatever the table stores to a real bool
+
+
 def _derive_testcase_name(input_cpp: Path) -> str:
     name = input_cpp.stem
     if name.endswith("-pto"):
@@ -1704,9 +1737,11 @@ def generate_testcase(
             param_decls_lines.append(f"    {t} {p['name']}{{128, 128, 128, 128}};")
             continue
         if t == "bool":
-            value = "true"
+            bool_override = _bool_scalar_default_value(testcase, p["name"])
+            value = "true" if bool_override is None else ("true" if bool_override else "false")
-            value = "true" if bool_override is None else ("true" if bool_override else "false")
+            value = "false" if bool_override is False else "true"
-            value = "true" if bool_override is None else ("true" if bool_override else "false")
+            value = "false" if bool_override is False else "true"
         elif re.match(r"^(u?int)(8|16|32|64)_t$", t) or t in {"int", "unsigned", "size_t"}:
-            value = str(_integer_scalar_default_value(testcase, p["name"], t) or 1)
+            int_override = _integer_scalar_default_value(testcase, p["name"], t)
+            value = "1" if int_override is None else str(int_override)
         elif t in {"float"}:
             value = "1.0f"
         elif t in {"double"}:

diff --git a/test/samples/DeepseekV4DecodeA3/README.md b/test/samples/DeepseekV4DecodeA3/README.md
@@ -0,0 +1,18 @@
+DeepSeek V4 decode PTO kernels for A3, generated from `hw-native-sys/pypto-lib` `models/deepseek/v4` at commit `be3c7942420b48fbab4ab1150edbc4ca8a125b94`.
+
+Scope:
+- compile-regression inputs for `ptoas`
+- board-validation inputs for direct `.pto` kernels
+
+Notes:
+- This directory vendors the primary raw `.pto` fragments emitted from these source modules:
+  - `decode_attention_csa.py`
+  - `decode_attention_hca.py`
+  - `decode_attention_swa.py`
+  - `decode_csa.py`
+  - `decode_hca.py`
+  - `decode_sparse_attn.py`
+  - `decode_swa.py`
+- The `.pto` file contents are copied directly from PyPTO raw PTO backend output and are not hand-edited.
+- `runop.sh` defaults these cases to `--pto-level=level3`.
+- Board validation uses custom `*_golden.py` references for the standalone rope-pack kernel, together with the full-buffer sizing and default block arguments wired into `generate_testcase.py`.
diff --git a/test/samples/DeepseekV4DecodeA3/attention_csa_test_refresh_incore_81.pto b/test/samples/DeepseekV4DecodeA3/attention_csa_test_refresh_incore_81.pto
@@ -0,0 +1,53 @@
+module attributes {pto.target_arch = "a2a3"} {
+  func.func @attention_csa_test_refresh_incore_81(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %__pypto_spmd_block_idx: i32, %__pypto_spmd_block_num: i32) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+  %c0_i64 = arith.constant 0 : i64
+  %c2048_i64 = arith.constant 2048 : i64
+  %c4096_i64 = arith.constant 4096 : i64
+  %c1024_index = arith.constant 1024 : index
+  %c4096_index = arith.constant 4096 : index
+  %c1_index = arith.constant 1 : index
+  %c8192_index = arith.constant 8192 : index
+  %c64_index = arith.constant 64 : index
+  %c8_index = arith.constant 8 : index
+  %c32_index = arith.constant 32 : index
+  %c0_index = arith.constant 0 : index
+  %c128_index = arith.constant 128 : index
+  %c512_index = arith.constant 512 : index
+  %c448_index = arith.constant 448 : index
+  %o_packed_inline1734__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c1024_index, %c4096_index], strides = [%c4096_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %rope_even_interleave_buf_inline1700__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %rope_odd_interleave_buf_inline1743__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %rope_pack_block_inline1564__ssa_v0 = arith.index_cast %__pypto_spmd_block_idx : i32 to index
+  %1 = arith.divsi %rope_pack_block_inline1564__ssa_v0, %c8_index : index
+  %2 = arith.muli %1, %c8_index : index
+  %3 = arith.subi %rope_pack_block_inline1564__ssa_v0, %2 : index
+  %4 = arith.muli %1, %c32_index : index
+  scf.for %rope_combine_dt_inline1586__idx_v0 = %c0_index to %c32_index step %c1_index {
+    %5 = arith.addi %4, %rope_combine_dt_inline1586__idx_v0 : index
+    %6 = arith.muli %5, %c64_index : index
+    %7 = arith.muli %3, %c8_index : index
+    %8 = arith.addi %6, %7 : index
+    %rope_even_tile_inline1551__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %rope_even_interleave_buf_inline1700__rv_v2_pview = pto.partition_view %rope_even_interleave_buf_inline1700__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
+    pto.tload ins(%rope_even_interleave_buf_inline1700__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_even_tile_inline1551__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %rope_odd_tile_inline1580__tile = pto.alloc_tile addr = %c2048_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %rope_odd_interleave_buf_inline1743__rv_v2_pview = pto.partition_view %rope_odd_interleave_buf_inline1743__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
+    pto.tload ins(%rope_odd_interleave_buf_inline1743__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_odd_tile_inline1580__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %t__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tadd ins(%rope_even_tile_inline1551__tile, %rope_odd_tile_inline1580__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %rope_full_inline1610__tile = pto.alloc_tile addr = %c4096_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcvt ins(%t__tile {rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%rope_full_inline1610__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %9 = arith.muli %3, %c128_index : index
+    %10 = arith.addi %9, %5 : index
+    scf.for %rope_pack_hh_inline1664__idx_v0 = %c0_index to %c8_index step %c1_index {
+      %11 = arith.muli %rope_pack_hh_inline1664__idx_v0, %c512_index : index
+      %12 = arith.addi %11, %c448_index : index
+      %0 = pto.alloc_tile addr = %c4096_i64 valid_row = %c1_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      %slice_view = pto.subview %rope_full_inline1610__tile[%rope_pack_hh_inline1664__idx_v0, %c0_index] sizes [1, 64] : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0> -> !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      %o_packed_inline1734__iter_v12_pview = pto.partition_view %o_packed_inline1734__rv_v2_view, offsets = [%10, %12], sizes = [%c1_index, %c64_index] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x64xbf16>
+      pto.tstore ins(%slice_view : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%o_packed_inline1734__iter_v12_pview : !pto.partition_tensor_view<1x64xbf16>)
+    }
+  }
+  return
+  }
+}
diff --git a/test/samples/DeepseekV4DecodeA3/attention_csa_test_refresh_incore_81_golden.py b/test/samples/DeepseekV4DecodeA3/attention_csa_test_refresh_incore_81_golden.py
@@ -0,0 +1,14 @@
+#!/usr/bin/python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+from deepseek_v4_decode_golden_lib import run_case
+
+
+if __name__ == "__main__":  # run only when executed as a script, not when imported
+    run_case("attention_csa_test_refresh_incore_81")  # case name matches the sibling .pto kernel file; run_case is imported from deepseek_v4_decode_golden_lib (not shown here)
diff --git a/test/samples/DeepseekV4DecodeA3/attention_hca_test_incore_54.pto b/test/samples/DeepseekV4DecodeA3/attention_hca_test_incore_54.pto
@@ -0,0 +1,53 @@
+module attributes {pto.target_arch = "a2a3"} {
+  func.func @attention_hca_test_incore_54(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %__pypto_spmd_block_idx: i32, %__pypto_spmd_block_num: i32) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+  %c0_i64 = arith.constant 0 : i64
+  %c2048_i64 = arith.constant 2048 : i64
+  %c4096_i64 = arith.constant 4096 : i64
+  %c1024_index = arith.constant 1024 : index
+  %c4096_index = arith.constant 4096 : index
+  %c1_index = arith.constant 1 : index
+  %c8192_index = arith.constant 8192 : index
+  %c64_index = arith.constant 64 : index
+  %c8_index = arith.constant 8 : index
+  %c32_index = arith.constant 32 : index
+  %c0_index = arith.constant 0 : index
+  %c128_index = arith.constant 128 : index
+  %c512_index = arith.constant 512 : index
+  %c448_index = arith.constant 448 : index
+  %o_packed_inline2863__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c1024_index, %c4096_index], strides = [%c4096_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %rope_even_interleave_buf_inline2905__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %rope_odd_interleave_buf_inline2903__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %rope_pack_block_inline2884__ssa_v0 = arith.index_cast %__pypto_spmd_block_idx : i32 to index
+  %1 = arith.divsi %rope_pack_block_inline2884__ssa_v0, %c8_index : index
+  %2 = arith.muli %1, %c8_index : index
+  %3 = arith.subi %rope_pack_block_inline2884__ssa_v0, %2 : index
+  %4 = arith.muli %1, %c32_index : index
+  scf.for %rope_combine_dt_inline2763__idx_v0 = %c0_index to %c32_index step %c1_index {
+    %5 = arith.addi %4, %rope_combine_dt_inline2763__idx_v0 : index
+    %6 = arith.muli %5, %c64_index : index
+    %7 = arith.muli %3, %c8_index : index
+    %8 = arith.addi %6, %7 : index
+    %rope_even_tile_inline2886__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %rope_even_interleave_buf_inline2905__rv_v2_pview = pto.partition_view %rope_even_interleave_buf_inline2905__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
+    pto.tload ins(%rope_even_interleave_buf_inline2905__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_even_tile_inline2886__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %rope_odd_tile_inline2893__tile = pto.alloc_tile addr = %c2048_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %rope_odd_interleave_buf_inline2903__rv_v2_pview = pto.partition_view %rope_odd_interleave_buf_inline2903__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
+    pto.tload ins(%rope_odd_interleave_buf_inline2903__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_odd_tile_inline2893__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %t__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tadd ins(%rope_even_tile_inline2886__tile, %rope_odd_tile_inline2893__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %rope_full_inline2779__tile = pto.alloc_tile addr = %c4096_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcvt ins(%t__tile {rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%rope_full_inline2779__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %9 = arith.muli %3, %c128_index : index
+    %10 = arith.addi %9, %5 : index
+    scf.for %rope_pack_hh_inline2823__idx_v0 = %c0_index to %c8_index step %c1_index {
+      %11 = arith.muli %rope_pack_hh_inline2823__idx_v0, %c512_index : index
+      %12 = arith.addi %11, %c448_index : index
+      %0 = pto.alloc_tile addr = %c4096_i64 valid_row = %c1_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      %slice_view = pto.subview %rope_full_inline2779__tile[%rope_pack_hh_inline2823__idx_v0, %c0_index] sizes [1, 64] : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0> -> !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      %o_packed_inline2863__iter_v12_pview = pto.partition_view %o_packed_inline2863__rv_v2_view, offsets = [%10, %12], sizes = [%c1_index, %c64_index] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x64xbf16>
+      pto.tstore ins(%slice_view : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%o_packed_inline2863__iter_v12_pview : !pto.partition_tensor_view<1x64xbf16>)
+    }
+  }
+  return
+  }
+}
diff --git a/test/samples/DeepseekV4DecodeA3/attention_hca_test_incore_54_golden.py b/test/samples/DeepseekV4DecodeA3/attention_hca_test_incore_54_golden.py
@@ -0,0 +1,14 @@
+#!/usr/bin/python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+from deepseek_v4_decode_golden_lib import run_case
+
+
+if __name__ == "__main__":  # run only when executed as a script, not when imported
+    run_case("attention_hca_test_incore_54")  # case name matches the sibling .pto kernel file; run_case is imported from deepseek_v4_decode_golden_lib (not shown here)