Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 38 additions & 3 deletions test/npu_validation/scripts/generate_testcase.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,25 @@
"xor",
})

CASE_INT_SCALAR_DEFAULTS = {}
# Names of the DeepSeek V4 decode testcases that share the per-testcase
# defaults built from this set further down in the module.
DEEPSEEK_V4_DIRECT_CASES = frozenset({
"attention_csa_test_refresh_incore_81",
"attention_hca_test_incore_54",
"attention_swa_test_incore_40",
"decode_csa_test_incore_81",
"decode_hca_test_incore_54",
"decode_swa_test_incore_40",
"sparse_attn_test_incore_7",
})

# Integer scalar defaults, keyed by testcase name and then by parameter name.
# The comprehension builds a separate inner dict for every testcase, so the
# entries are not shared between cases.
# NOTE(review): "v4"/"v5" presumably name the kernels' trailing i32 block-index
# and block-count arguments - confirm against the generated parameter names.
CASE_INT_SCALAR_DEFAULTS = {
testcase: {
"v4": 0,
"v5": 32,
}
for testcase in DEEPSEEK_V4_DIRECT_CASES
}

# Bool scalar defaults, looked up by testcase name and then by parameter name.
# Empty here: no testcase registers a bool override.
CASE_BOOL_SCALAR_DEFAULTS = {}

CASE_POINTER_COUNT_MINIMUMS = {
"down_proj_residual": {
Expand All @@ -97,6 +115,14 @@
"v1": 123648,
"v2": 123648,
},
**{
testcase: {
"v1": 1024 * 4096,
"v2": 8192 * 64,
"v3": 8192 * 64,
}
for testcase in DEEPSEEK_V4_DIRECT_CASES
},
}


Expand Down Expand Up @@ -833,6 +859,13 @@ def _integer_scalar_default_value(testcase: str, name: str, host_type: str) -> O
return None


def _bool_scalar_default_value(testcase: str, name: str) -> Optional[bool]:
    """Return the configured default for a bool scalar parameter, if any.

    Args:
        testcase: Testcase name used as the outer key of
            ``CASE_BOOL_SCALAR_DEFAULTS``.
        name: Parameter name used as the inner key.

    Returns:
        ``None`` when the testcase has no entry for ``name`` (or the entry is
        ``None``); otherwise the stored value coerced with ``bool()``.
    """
    per_case_defaults = CASE_BOOL_SCALAR_DEFAULTS.get(testcase, {})
    configured = per_case_defaults.get(name)
    # A stored falsy value (e.g. False or 0) is a real override, so only None
    # counts as "not configured".
    return bool(configured) if configured is not None else None


def _derive_testcase_name(input_cpp: Path) -> str:
name = input_cpp.stem
if name.endswith("-pto"):
Expand Down Expand Up @@ -1704,9 +1737,11 @@ def generate_testcase(
param_decls_lines.append(f" {t} {p['name']}{{128, 128, 128, 128}};")
continue
if t == "bool":
value = "true"
bool_override = _bool_scalar_default_value(testcase, p["name"])
value = "true" if bool_override is None else ("true" if bool_override else "false")
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The boolean string assignment can be simplified. Since bool_override is Optional[bool], and the default behavior (when None) is true, you only need to check for the False case explicitly.

Suggested change
value = "true" if bool_override is None else ("true" if bool_override else "false")
value = "false" if bool_override is False else "true"

elif re.match(r"^(u?int)(8|16|32|64)_t$", t) or t in {"int", "unsigned", "size_t"}:
value = str(_integer_scalar_default_value(testcase, p["name"], t) or 1)
int_override = _integer_scalar_default_value(testcase, p["name"], t)
value = "1" if int_override is None else str(int_override)
elif t in {"float"}:
value = "1.0f"
elif t in {"double"}:
Expand Down
18 changes: 18 additions & 0 deletions test/samples/DeepseekV4DecodeA3/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
DeepSeek V4 decode PTO kernels for A3, generated from `hw-native-sys/pypto-lib` `models/deepseek/v4` at commit `be3c7942420b48fbab4ab1150edbc4ca8a125b94`.

Scope:
- compile-regression inputs for `ptoas`
- board-validation inputs for direct `.pto` kernels

Notes:
- This directory vendors the primary raw `.pto` fragments emitted from these source modules:
  - `decode_attention_csa.py`
  - `decode_attention_hca.py`
  - `decode_attention_swa.py`
  - `decode_csa.py`
  - `decode_hca.py`
  - `decode_sparse_attn.py`
  - `decode_swa.py`
- The `.pto` file contents are copied directly from PyPTO raw PTO backend output and are not hand-edited.
- `runop.sh` defaults these cases to `--pto-level=level3`.
- Board validation uses custom `*_golden.py` reference scripts for the standalone rope-pack kernel, together with the full-buffer sizing and default block arguments wired into `generate_testcase.py`.
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
// Kernel summary (derived from the ops below):
//   %arg0 is viewed as a 1024x4096 bf16 tensor; %arg1 and %arg2 are each viewed
//   as 8192x64 f32 tensors. The block index is split into a quotient by 8 and
//   the matching remainder. For each of 32 outer iterations the kernel loads
//   one 8x64 f32 tile from each of the two f32 views, adds them, converts the
//   sum to bf16, and stores the 8 tile rows as 1x64 slices into the bf16 view.
//   %__pypto_spmd_block_num is declared but not read by the body.
module attributes {pto.target_arch = "a2a3"} {
func.func @attention_csa_test_refresh_incore_81(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %__pypto_spmd_block_idx: i32, %__pypto_spmd_block_num: i32) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
%c0_i64 = arith.constant 0 : i64
%c2048_i64 = arith.constant 2048 : i64
%c4096_i64 = arith.constant 4096 : i64
%c1024_index = arith.constant 1024 : index
%c4096_index = arith.constant 4096 : index
%c1_index = arith.constant 1 : index
%c8192_index = arith.constant 8192 : index
%c64_index = arith.constant 64 : index
%c8_index = arith.constant 8 : index
%c32_index = arith.constant 32 : index
%c0_index = arith.constant 0 : index
%c128_index = arith.constant 128 : index
%c512_index = arith.constant 512 : index
%c448_index = arith.constant 448 : index
%o_packed_inline1734__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c1024_index, %c4096_index], strides = [%c4096_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
%rope_even_interleave_buf_inline1700__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
%rope_odd_interleave_buf_inline1743__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
%rope_pack_block_inline1564__ssa_v0 = arith.index_cast %__pypto_spmd_block_idx : i32 to index
// %1 = block_idx / 8, %3 = block_idx - 8 * %1 (the remainder), %4 = 32 * %1.
%1 = arith.divsi %rope_pack_block_inline1564__ssa_v0, %c8_index : index
%2 = arith.muli %1, %c8_index : index
%3 = arith.subi %rope_pack_block_inline1564__ssa_v0, %2 : index
%4 = arith.muli %1, %c32_index : index
scf.for %rope_combine_dt_inline1586__idx_v0 = %c0_index to %c32_index step %c1_index {
// %5 = %4 + dt; %8 = %5 * 64 + %3 * 8 is the first row of the 8x64 window
// read from both f32 views.
%5 = arith.addi %4, %rope_combine_dt_inline1586__idx_v0 : index
%6 = arith.muli %5, %c64_index : index
%7 = arith.muli %3, %c8_index : index
%8 = arith.addi %6, %7 : index
%rope_even_tile_inline1551__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%rope_even_interleave_buf_inline1700__rv_v2_pview = pto.partition_view %rope_even_interleave_buf_inline1700__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
pto.tload ins(%rope_even_interleave_buf_inline1700__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_even_tile_inline1551__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
%rope_odd_tile_inline1580__tile = pto.alloc_tile addr = %c2048_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%rope_odd_interleave_buf_inline1743__rv_v2_pview = pto.partition_view %rope_odd_interleave_buf_inline1743__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
pto.tload ins(%rope_odd_interleave_buf_inline1743__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_odd_tile_inline1580__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
// %t__tile uses the same addr (%c0_i64) as the even tile; presumably the sum
// overwrites the even tile in place - TODO confirm against alloc_tile semantics.
%t__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
pto.tadd ins(%rope_even_tile_inline1551__tile, %rope_odd_tile_inline1580__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
%rope_full_inline1610__tile = pto.alloc_tile addr = %c4096_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
pto.tcvt ins(%t__tile {rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%rope_full_inline1610__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
// %10 = %3 * 128 + %5 is the destination row in the bf16 output view.
%9 = arith.muli %3, %c128_index : index
%10 = arith.addi %9, %5 : index
scf.for %rope_pack_hh_inline1664__idx_v0 = %c0_index to %c8_index step %c1_index {
// %12 = hh * 512 + 448 is the destination column for tile row hh.
%11 = arith.muli %rope_pack_hh_inline1664__idx_v0, %c512_index : index
%12 = arith.addi %11, %c448_index : index
// NOTE(review): %0 is not referenced by any later op in this module.
%0 = pto.alloc_tile addr = %c4096_i64 valid_row = %c1_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%slice_view = pto.subview %rope_full_inline1610__tile[%rope_pack_hh_inline1664__idx_v0, %c0_index] sizes [1, 64] : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0> -> !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%o_packed_inline1734__iter_v12_pview = pto.partition_view %o_packed_inline1734__rv_v2_view, offsets = [%10, %12], sizes = [%c1_index, %c64_index] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x64xbf16>
pto.tstore ins(%slice_view : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%o_packed_inline1734__iter_v12_pview : !pto.partition_tensor_view<1x64xbf16>)
}
}
return
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/usr/bin/python3
# Copyright (c) 2026 Huawei Technologies Co., Ltd.
# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
# CANN Open Software License Agreement Version 2.0 (the "License").
# Please refer to the License for details. You may not use this file except in compliance with the License.
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
# See LICENSE in the root of the software repository for the full text of the License.

from deepseek_v4_decode_golden_lib import run_case


if __name__ == "__main__":
    # Run the shared golden driver for this testcase only when the file is
    # executed as a script, not when it is imported.
    case_name = "attention_csa_test_refresh_incore_81"
    run_case(case_name)
53 changes: 53 additions & 0 deletions test/samples/DeepseekV4DecodeA3/attention_hca_test_incore_54.pto
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
// Kernel summary (derived from the ops below):
//   %arg0 is viewed as a 1024x4096 bf16 tensor; %arg1 and %arg2 are each viewed
//   as 8192x64 f32 tensors. The block index is split into a quotient by 8 and
//   the matching remainder. For each of 32 outer iterations the kernel loads
//   one 8x64 f32 tile from each of the two f32 views, adds them, converts the
//   sum to bf16, and stores the 8 tile rows as 1x64 slices into the bf16 view.
//   %__pypto_spmd_block_num is declared but not read by the body.
module attributes {pto.target_arch = "a2a3"} {
func.func @attention_hca_test_incore_54(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %__pypto_spmd_block_idx: i32, %__pypto_spmd_block_num: i32) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
%c0_i64 = arith.constant 0 : i64
%c2048_i64 = arith.constant 2048 : i64
%c4096_i64 = arith.constant 4096 : i64
%c1024_index = arith.constant 1024 : index
%c4096_index = arith.constant 4096 : index
%c1_index = arith.constant 1 : index
%c8192_index = arith.constant 8192 : index
%c64_index = arith.constant 64 : index
%c8_index = arith.constant 8 : index
%c32_index = arith.constant 32 : index
%c0_index = arith.constant 0 : index
%c128_index = arith.constant 128 : index
%c512_index = arith.constant 512 : index
%c448_index = arith.constant 448 : index
%o_packed_inline2863__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c1024_index, %c4096_index], strides = [%c4096_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
%rope_even_interleave_buf_inline2905__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
%rope_odd_interleave_buf_inline2903__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
%rope_pack_block_inline2884__ssa_v0 = arith.index_cast %__pypto_spmd_block_idx : i32 to index
// %1 = block_idx / 8, %3 = block_idx - 8 * %1 (the remainder), %4 = 32 * %1.
%1 = arith.divsi %rope_pack_block_inline2884__ssa_v0, %c8_index : index
%2 = arith.muli %1, %c8_index : index
%3 = arith.subi %rope_pack_block_inline2884__ssa_v0, %2 : index
%4 = arith.muli %1, %c32_index : index
scf.for %rope_combine_dt_inline2763__idx_v0 = %c0_index to %c32_index step %c1_index {
// %5 = %4 + dt; %8 = %5 * 64 + %3 * 8 is the first row of the 8x64 window
// read from both f32 views.
%5 = arith.addi %4, %rope_combine_dt_inline2763__idx_v0 : index
%6 = arith.muli %5, %c64_index : index
%7 = arith.muli %3, %c8_index : index
%8 = arith.addi %6, %7 : index
%rope_even_tile_inline2886__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%rope_even_interleave_buf_inline2905__rv_v2_pview = pto.partition_view %rope_even_interleave_buf_inline2905__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
pto.tload ins(%rope_even_interleave_buf_inline2905__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_even_tile_inline2886__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
%rope_odd_tile_inline2893__tile = pto.alloc_tile addr = %c2048_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%rope_odd_interleave_buf_inline2903__rv_v2_pview = pto.partition_view %rope_odd_interleave_buf_inline2903__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
pto.tload ins(%rope_odd_interleave_buf_inline2903__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_odd_tile_inline2893__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
// %t__tile uses the same addr (%c0_i64) as the even tile; presumably the sum
// overwrites the even tile in place - TODO confirm against alloc_tile semantics.
%t__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
pto.tadd ins(%rope_even_tile_inline2886__tile, %rope_odd_tile_inline2893__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
%rope_full_inline2779__tile = pto.alloc_tile addr = %c4096_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
pto.tcvt ins(%t__tile {rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%rope_full_inline2779__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
// %10 = %3 * 128 + %5 is the destination row in the bf16 output view.
%9 = arith.muli %3, %c128_index : index
%10 = arith.addi %9, %5 : index
scf.for %rope_pack_hh_inline2823__idx_v0 = %c0_index to %c8_index step %c1_index {
// %12 = hh * 512 + 448 is the destination column for tile row hh.
%11 = arith.muli %rope_pack_hh_inline2823__idx_v0, %c512_index : index
%12 = arith.addi %11, %c448_index : index
// NOTE(review): %0 is not referenced by any later op in this module.
%0 = pto.alloc_tile addr = %c4096_i64 valid_row = %c1_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%slice_view = pto.subview %rope_full_inline2779__tile[%rope_pack_hh_inline2823__idx_v0, %c0_index] sizes [1, 64] : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0> -> !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%o_packed_inline2863__iter_v12_pview = pto.partition_view %o_packed_inline2863__rv_v2_view, offsets = [%10, %12], sizes = [%c1_index, %c64_index] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x64xbf16>
pto.tstore ins(%slice_view : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%o_packed_inline2863__iter_v12_pview : !pto.partition_tensor_view<1x64xbf16>)
}
}
return
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/usr/bin/python3
# Copyright (c) 2026 Huawei Technologies Co., Ltd.
# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
# CANN Open Software License Agreement Version 2.0 (the "License").
# Please refer to the License for details. You may not use this file except in compliance with the License.
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
# See LICENSE in the root of the software repository for the full text of the License.

from deepseek_v4_decode_golden_lib import run_case


if __name__ == "__main__":
    # Run the shared golden driver for this testcase only when the file is
    # executed as a script, not when it is imported.
    case_name = "attention_hca_test_incore_54"
    run_case(case_name)
Loading
Loading