diff --git a/test/npu_validation/scripts/generate_testcase.py b/test/npu_validation/scripts/generate_testcase.py index bc7f208a1..2a30e5db7 100644 --- a/test/npu_validation/scripts/generate_testcase.py +++ b/test/npu_validation/scripts/generate_testcase.py @@ -86,7 +86,25 @@ "xor", }) -CASE_INT_SCALAR_DEFAULTS = {} +DEEPSEEK_V4_DIRECT_CASES = frozenset({ + "attention_csa_test_refresh_incore_81", + "attention_hca_test_incore_54", + "attention_swa_test_incore_40", + "decode_csa_test_incore_81", + "decode_hca_test_incore_54", + "decode_swa_test_incore_40", + "sparse_attn_test_incore_7", +}) + +CASE_INT_SCALAR_DEFAULTS = { + testcase: { + "v4": 0, + "v5": 32, + } + for testcase in DEEPSEEK_V4_DIRECT_CASES +} + +CASE_BOOL_SCALAR_DEFAULTS = {} CASE_POINTER_COUNT_MINIMUMS = { "down_proj_residual": { @@ -97,6 +115,14 @@ "v1": 123648, "v2": 123648, }, + **{ + testcase: { + "v1": 1024 * 4096, + "v2": 8192 * 64, + "v3": 8192 * 64, + } + for testcase in DEEPSEEK_V4_DIRECT_CASES + }, } @@ -833,6 +859,13 @@ def _integer_scalar_default_value(testcase: str, name: str, host_type: str) -> O return None +def _bool_scalar_default_value(testcase: str, name: str) -> Optional[bool]: + override = CASE_BOOL_SCALAR_DEFAULTS.get(testcase, {}).get(name) + if override is None: + return None + return bool(override) + + def _derive_testcase_name(input_cpp: Path) -> str: name = input_cpp.stem if name.endswith("-pto"): @@ -1704,9 +1737,11 @@ def generate_testcase( param_decls_lines.append(f" {t} {p['name']}{{128, 128, 128, 128}};") continue if t == "bool": - value = "true" + bool_override = _bool_scalar_default_value(testcase, p["name"]) + value = "true" if bool_override is None else ("true" if bool_override else "false") elif re.match(r"^(u?int)(8|16|32|64)_t$", t) or t in {"int", "unsigned", "size_t"}: - value = str(_integer_scalar_default_value(testcase, p["name"], t) or 1) + int_override = _integer_scalar_default_value(testcase, p["name"], t) + value = "1" if int_override is None else 
str(int_override) elif t in {"float"}: value = "1.0f" elif t in {"double"}: diff --git a/test/samples/DeepseekV4DecodeA3/README.md b/test/samples/DeepseekV4DecodeA3/README.md new file mode 100644 index 000000000..9fe0a95fa --- /dev/null +++ b/test/samples/DeepseekV4DecodeA3/README.md @@ -0,0 +1,18 @@ +DeepSeek V4 decode PTO kernels for A3, generated from `hw-native-sys/pypto-lib` `models/deepseek/v4` at commit `be3c7942420b48fbab4ab1150edbc4ca8a125b94`. + +Scope: +- compile-regression inputs for `ptoas` +- board-validation inputs for direct `.pto` kernels + +Notes: +- This directory vendors the primary raw `.pto` fragments emitted from these source modules: + - `decode_attention_csa.py` + - `decode_attention_hca.py` + - `decode_attention_swa.py` + - `decode_csa.py` + - `decode_hca.py` + - `decode_sparse_attn.py` + - `decode_swa.py` +- The `.pto` file contents are copied directly from PyPTO raw PTO backend output and are not hand-edited. +- `runop.sh` defaults these cases to `--pto-level=level3`. +- Board validation relies on the custom `*_golden.py` references for the standalone rope-pack kernels, plus the full-buffer sizing and default block arguments wired into `generate_testcase.py`. 
diff --git a/test/samples/DeepseekV4DecodeA3/attention_csa_test_refresh_incore_81.pto b/test/samples/DeepseekV4DecodeA3/attention_csa_test_refresh_incore_81.pto new file mode 100644 index 000000000..316ff61b1 --- /dev/null +++ b/test/samples/DeepseekV4DecodeA3/attention_csa_test_refresh_incore_81.pto @@ -0,0 +1,53 @@ +module attributes {pto.target_arch = "a2a3"} { + func.func @attention_csa_test_refresh_incore_81(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %__pypto_spmd_block_idx: i32, %__pypto_spmd_block_num: i32) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0_i64 = arith.constant 0 : i64 + %c2048_i64 = arith.constant 2048 : i64 + %c4096_i64 = arith.constant 4096 : i64 + %c1024_index = arith.constant 1024 : index + %c4096_index = arith.constant 4096 : index + %c1_index = arith.constant 1 : index + %c8192_index = arith.constant 8192 : index + %c64_index = arith.constant 64 : index + %c8_index = arith.constant 8 : index + %c32_index = arith.constant 32 : index + %c0_index = arith.constant 0 : index + %c128_index = arith.constant 128 : index + %c512_index = arith.constant 512 : index + %c448_index = arith.constant 448 : index + %o_packed_inline1734__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c1024_index, %c4096_index], strides = [%c4096_index, %c1_index] {layout = #pto.layout}: !pto.tensor_view + %rope_even_interleave_buf_inline1700__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout}: !pto.tensor_view + %rope_odd_interleave_buf_inline1743__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout}: !pto.tensor_view + %rope_pack_block_inline1564__ssa_v0 = arith.index_cast %__pypto_spmd_block_idx : i32 to index + %1 = arith.divsi %rope_pack_block_inline1564__ssa_v0, %c8_index : index + %2 = arith.muli %1, %c8_index : index + %3 = arith.subi %rope_pack_block_inline1564__ssa_v0, %2 : index + 
%4 = arith.muli %1, %c32_index : index + scf.for %rope_combine_dt_inline1586__idx_v0 = %c0_index to %c32_index step %c1_index { + %5 = arith.addi %4, %rope_combine_dt_inline1586__idx_v0 : index + %6 = arith.muli %5, %c64_index : index + %7 = arith.muli %3, %c8_index : index + %8 = arith.addi %6, %7 : index + %rope_even_tile_inline1551__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + %rope_even_interleave_buf_inline1700__rv_v2_pview = pto.partition_view %rope_even_interleave_buf_inline1700__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> + pto.tload ins(%rope_even_interleave_buf_inline1700__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_even_tile_inline1551__tile : !pto.tile_buf) + %rope_odd_tile_inline1580__tile = pto.alloc_tile addr = %c2048_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + %rope_odd_interleave_buf_inline1743__rv_v2_pview = pto.partition_view %rope_odd_interleave_buf_inline1743__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> + pto.tload ins(%rope_odd_interleave_buf_inline1743__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_odd_tile_inline1580__tile : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + pto.tadd ins(%rope_even_tile_inline1551__tile, %rope_odd_tile_inline1580__tile : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) + %rope_full_inline1610__tile = pto.alloc_tile addr = %c4096_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + pto.tcvt ins(%t__tile {rmode = #pto} : !pto.tile_buf) outs(%rope_full_inline1610__tile : !pto.tile_buf) + %9 = arith.muli %3, %c128_index : index + %10 = arith.addi %9, %5 : index + scf.for %rope_pack_hh_inline1664__idx_v0 = %c0_index to %c8_index step 
%c1_index { + %11 = arith.muli %rope_pack_hh_inline1664__idx_v0, %c512_index : index + %12 = arith.addi %11, %c448_index : index + %0 = pto.alloc_tile addr = %c4096_i64 valid_row = %c1_index valid_col = %c64_index : !pto.tile_buf + %slice_view = pto.subview %rope_full_inline1610__tile[%rope_pack_hh_inline1664__idx_v0, %c0_index] sizes [1, 64] : !pto.tile_buf -> !pto.tile_buf + %o_packed_inline1734__iter_v12_pview = pto.partition_view %o_packed_inline1734__rv_v2_view, offsets = [%10, %12], sizes = [%c1_index, %c64_index] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xbf16> + pto.tstore ins(%slice_view : !pto.tile_buf) outs(%o_packed_inline1734__iter_v12_pview : !pto.partition_tensor_view<1x64xbf16>) + } + } + return + } +} diff --git a/test/samples/DeepseekV4DecodeA3/attention_csa_test_refresh_incore_81_golden.py b/test/samples/DeepseekV4DecodeA3/attention_csa_test_refresh_incore_81_golden.py new file mode 100644 index 000000000..be04f4648 --- /dev/null +++ b/test/samples/DeepseekV4DecodeA3/attention_csa_test_refresh_incore_81_golden.py @@ -0,0 +1,14 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +from deepseek_v4_decode_golden_lib import run_case + + +if __name__ == "__main__": + run_case("attention_csa_test_refresh_incore_81") diff --git a/test/samples/DeepseekV4DecodeA3/attention_hca_test_incore_54.pto b/test/samples/DeepseekV4DecodeA3/attention_hca_test_incore_54.pto new file mode 100644 index 000000000..532544884 --- /dev/null +++ b/test/samples/DeepseekV4DecodeA3/attention_hca_test_incore_54.pto @@ -0,0 +1,53 @@ +module attributes {pto.target_arch = "a2a3"} { + func.func @attention_hca_test_incore_54(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %__pypto_spmd_block_idx: i32, %__pypto_spmd_block_num: i32) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0_i64 = arith.constant 0 : i64 + %c2048_i64 = arith.constant 2048 : i64 + %c4096_i64 = arith.constant 4096 : i64 + %c1024_index = arith.constant 1024 : index + %c4096_index = arith.constant 4096 : index + %c1_index = arith.constant 1 : index + %c8192_index = arith.constant 8192 : index + %c64_index = arith.constant 64 : index + %c8_index = arith.constant 8 : index + %c32_index = arith.constant 32 : index + %c0_index = arith.constant 0 : index + %c128_index = arith.constant 128 : index + %c512_index = arith.constant 512 : index + %c448_index = arith.constant 448 : index + %o_packed_inline2863__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c1024_index, %c4096_index], strides = [%c4096_index, %c1_index] {layout = #pto.layout}: !pto.tensor_view + %rope_even_interleave_buf_inline2905__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout}: !pto.tensor_view + %rope_odd_interleave_buf_inline2903__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout}: !pto.tensor_view + %rope_pack_block_inline2884__ssa_v0 = arith.index_cast %__pypto_spmd_block_idx : i32 to index + %1 = arith.divsi %rope_pack_block_inline2884__ssa_v0, %c8_index : index + 
%2 = arith.muli %1, %c8_index : index + %3 = arith.subi %rope_pack_block_inline2884__ssa_v0, %2 : index + %4 = arith.muli %1, %c32_index : index + scf.for %rope_combine_dt_inline2763__idx_v0 = %c0_index to %c32_index step %c1_index { + %5 = arith.addi %4, %rope_combine_dt_inline2763__idx_v0 : index + %6 = arith.muli %5, %c64_index : index + %7 = arith.muli %3, %c8_index : index + %8 = arith.addi %6, %7 : index + %rope_even_tile_inline2886__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + %rope_even_interleave_buf_inline2905__rv_v2_pview = pto.partition_view %rope_even_interleave_buf_inline2905__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> + pto.tload ins(%rope_even_interleave_buf_inline2905__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_even_tile_inline2886__tile : !pto.tile_buf) + %rope_odd_tile_inline2893__tile = pto.alloc_tile addr = %c2048_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + %rope_odd_interleave_buf_inline2903__rv_v2_pview = pto.partition_view %rope_odd_interleave_buf_inline2903__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> + pto.tload ins(%rope_odd_interleave_buf_inline2903__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_odd_tile_inline2893__tile : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + pto.tadd ins(%rope_even_tile_inline2886__tile, %rope_odd_tile_inline2893__tile : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) + %rope_full_inline2779__tile = pto.alloc_tile addr = %c4096_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + pto.tcvt ins(%t__tile {rmode = #pto} : !pto.tile_buf) outs(%rope_full_inline2779__tile : !pto.tile_buf) + %9 = arith.muli %3, %c128_index : index + %10 
= arith.addi %9, %5 : index + scf.for %rope_pack_hh_inline2823__idx_v0 = %c0_index to %c8_index step %c1_index { + %11 = arith.muli %rope_pack_hh_inline2823__idx_v0, %c512_index : index + %12 = arith.addi %11, %c448_index : index + %0 = pto.alloc_tile addr = %c4096_i64 valid_row = %c1_index valid_col = %c64_index : !pto.tile_buf + %slice_view = pto.subview %rope_full_inline2779__tile[%rope_pack_hh_inline2823__idx_v0, %c0_index] sizes [1, 64] : !pto.tile_buf -> !pto.tile_buf + %o_packed_inline2863__iter_v12_pview = pto.partition_view %o_packed_inline2863__rv_v2_view, offsets = [%10, %12], sizes = [%c1_index, %c64_index] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xbf16> + pto.tstore ins(%slice_view : !pto.tile_buf) outs(%o_packed_inline2863__iter_v12_pview : !pto.partition_tensor_view<1x64xbf16>) + } + } + return + } +} diff --git a/test/samples/DeepseekV4DecodeA3/attention_hca_test_incore_54_golden.py b/test/samples/DeepseekV4DecodeA3/attention_hca_test_incore_54_golden.py new file mode 100644 index 000000000..c06cd7cad --- /dev/null +++ b/test/samples/DeepseekV4DecodeA3/attention_hca_test_incore_54_golden.py @@ -0,0 +1,14 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +from deepseek_v4_decode_golden_lib import run_case + + +if __name__ == "__main__": + run_case("attention_hca_test_incore_54") diff --git a/test/samples/DeepseekV4DecodeA3/attention_swa_test_incore_40.pto b/test/samples/DeepseekV4DecodeA3/attention_swa_test_incore_40.pto new file mode 100644 index 000000000..83ab158bc --- /dev/null +++ b/test/samples/DeepseekV4DecodeA3/attention_swa_test_incore_40.pto @@ -0,0 +1,53 @@ +module attributes {pto.target_arch = "a2a3"} { + func.func @attention_swa_test_incore_40(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %__pypto_spmd_block_idx: i32, %__pypto_spmd_block_num: i32) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0_i64 = arith.constant 0 : i64 + %c2048_i64 = arith.constant 2048 : i64 + %c4096_i64 = arith.constant 4096 : i64 + %c1024_index = arith.constant 1024 : index + %c4096_index = arith.constant 4096 : index + %c1_index = arith.constant 1 : index + %c8192_index = arith.constant 8192 : index + %c64_index = arith.constant 64 : index + %c8_index = arith.constant 8 : index + %c32_index = arith.constant 32 : index + %c0_index = arith.constant 0 : index + %c128_index = arith.constant 128 : index + %c512_index = arith.constant 512 : index + %c448_index = arith.constant 448 : index + %o_packed_inline3949__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c1024_index, %c4096_index], strides = [%c4096_index, %c1_index] {layout = #pto.layout}: !pto.tensor_view + %rope_even_interleave_buf_inline3805__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout}: !pto.tensor_view + %rope_odd_interleave_buf_inline3909__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout}: !pto.tensor_view + %rope_pack_block_inline3775__ssa_v0 = arith.index_cast %__pypto_spmd_block_idx : i32 to index + %1 = arith.divsi %rope_pack_block_inline3775__ssa_v0, %c8_index : index + %2 = 
arith.muli %1, %c8_index : index + %3 = arith.subi %rope_pack_block_inline3775__ssa_v0, %2 : index + %4 = arith.muli %1, %c32_index : index + scf.for %rope_combine_dt_inline3815__idx_v0 = %c0_index to %c32_index step %c1_index { + %5 = arith.addi %4, %rope_combine_dt_inline3815__idx_v0 : index + %6 = arith.muli %5, %c64_index : index + %7 = arith.muli %3, %c8_index : index + %8 = arith.addi %6, %7 : index + %rope_even_tile_inline3768__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + %rope_even_interleave_buf_inline3805__rv_v2_pview = pto.partition_view %rope_even_interleave_buf_inline3805__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> + pto.tload ins(%rope_even_interleave_buf_inline3805__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_even_tile_inline3768__tile : !pto.tile_buf) + %rope_odd_tile_inline3938__tile = pto.alloc_tile addr = %c2048_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + %rope_odd_interleave_buf_inline3909__rv_v2_pview = pto.partition_view %rope_odd_interleave_buf_inline3909__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> + pto.tload ins(%rope_odd_interleave_buf_inline3909__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_odd_tile_inline3938__tile : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + pto.tadd ins(%rope_even_tile_inline3768__tile, %rope_odd_tile_inline3938__tile : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) + %rope_full_inline3923__tile = pto.alloc_tile addr = %c4096_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + pto.tcvt ins(%t__tile {rmode = #pto} : !pto.tile_buf) outs(%rope_full_inline3923__tile : !pto.tile_buf) + %9 = arith.muli %3, %c128_index : index + %10 = 
arith.addi %9, %5 : index + scf.for %rope_pack_hh_inline3881__idx_v0 = %c0_index to %c8_index step %c1_index { + %11 = arith.muli %rope_pack_hh_inline3881__idx_v0, %c512_index : index + %12 = arith.addi %11, %c448_index : index + %0 = pto.alloc_tile addr = %c4096_i64 valid_row = %c1_index valid_col = %c64_index : !pto.tile_buf + %slice_view = pto.subview %rope_full_inline3923__tile[%rope_pack_hh_inline3881__idx_v0, %c0_index] sizes [1, 64] : !pto.tile_buf -> !pto.tile_buf + %o_packed_inline3949__iter_v12_pview = pto.partition_view %o_packed_inline3949__rv_v2_view, offsets = [%10, %12], sizes = [%c1_index, %c64_index] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xbf16> + pto.tstore ins(%slice_view : !pto.tile_buf) outs(%o_packed_inline3949__iter_v12_pview : !pto.partition_tensor_view<1x64xbf16>) + } + } + return + } +} diff --git a/test/samples/DeepseekV4DecodeA3/attention_swa_test_incore_40_golden.py b/test/samples/DeepseekV4DecodeA3/attention_swa_test_incore_40_golden.py new file mode 100644 index 000000000..d952e67d4 --- /dev/null +++ b/test/samples/DeepseekV4DecodeA3/attention_swa_test_incore_40_golden.py @@ -0,0 +1,14 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +from deepseek_v4_decode_golden_lib import run_case + + +if __name__ == "__main__": + run_case("attention_swa_test_incore_40") diff --git a/test/samples/DeepseekV4DecodeA3/decode_csa_test_incore_81.pto b/test/samples/DeepseekV4DecodeA3/decode_csa_test_incore_81.pto new file mode 100644 index 000000000..4a4814276 --- /dev/null +++ b/test/samples/DeepseekV4DecodeA3/decode_csa_test_incore_81.pto @@ -0,0 +1,53 @@ +module attributes {pto.target_arch = "a2a3"} { + func.func @decode_csa_test_incore_81(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %__pypto_spmd_block_idx: i32, %__pypto_spmd_block_num: i32) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0_i64 = arith.constant 0 : i64 + %c2048_i64 = arith.constant 2048 : i64 + %c4096_i64 = arith.constant 4096 : i64 + %c1024_index = arith.constant 1024 : index + %c4096_index = arith.constant 4096 : index + %c1_index = arith.constant 1 : index + %c8192_index = arith.constant 8192 : index + %c64_index = arith.constant 64 : index + %c8_index = arith.constant 8 : index + %c32_index = arith.constant 32 : index + %c0_index = arith.constant 0 : index + %c128_index = arith.constant 128 : index + %c512_index = arith.constant 512 : index + %c448_index = arith.constant 448 : index + %o_packed_inline5034_inline5466__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c1024_index, %c4096_index], strides = [%c4096_index, %c1_index] {layout = #pto.layout}: !pto.tensor_view + %rope_even_interleave_buf_inline5005_inline5933__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout}: !pto.tensor_view + %rope_odd_interleave_buf_inline5144_inline5315__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout}: !pto.tensor_view + %rope_pack_block_inline4996_inline5240__ssa_v0 = arith.index_cast %__pypto_spmd_block_idx : i32 to index + %1 = arith.divsi 
%rope_pack_block_inline4996_inline5240__ssa_v0, %c8_index : index + %2 = arith.muli %1, %c8_index : index + %3 = arith.subi %rope_pack_block_inline4996_inline5240__ssa_v0, %2 : index + %4 = arith.muli %1, %c32_index : index + scf.for %rope_combine_dt_inline5139_inline5238__idx_v0 = %c0_index to %c32_index step %c1_index { + %5 = arith.addi %4, %rope_combine_dt_inline5139_inline5238__idx_v0 : index + %6 = arith.muli %5, %c64_index : index + %7 = arith.muli %3, %c8_index : index + %8 = arith.addi %6, %7 : index + %rope_even_tile_inline5063_inline5236__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + %rope_even_interleave_buf_inline5005_inline5933__rv_v2_pview = pto.partition_view %rope_even_interleave_buf_inline5005_inline5933__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> + pto.tload ins(%rope_even_interleave_buf_inline5005_inline5933__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_even_tile_inline5063_inline5236__tile : !pto.tile_buf) + %rope_odd_tile_inline4993_inline5563__tile = pto.alloc_tile addr = %c2048_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + %rope_odd_interleave_buf_inline5144_inline5315__rv_v2_pview = pto.partition_view %rope_odd_interleave_buf_inline5144_inline5315__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> + pto.tload ins(%rope_odd_interleave_buf_inline5144_inline5315__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_odd_tile_inline4993_inline5563__tile : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + pto.tadd ins(%rope_even_tile_inline5063_inline5236__tile, %rope_odd_tile_inline4993_inline5563__tile : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) + %rope_full_inline5180_inline5657__tile = 
pto.alloc_tile addr = %c4096_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + pto.tcvt ins(%t__tile {rmode = #pto} : !pto.tile_buf) outs(%rope_full_inline5180_inline5657__tile : !pto.tile_buf) + %9 = arith.muli %3, %c128_index : index + %10 = arith.addi %9, %5 : index + scf.for %rope_pack_hh_inline5002_inline5496__idx_v0 = %c0_index to %c8_index step %c1_index { + %11 = arith.muli %rope_pack_hh_inline5002_inline5496__idx_v0, %c512_index : index + %12 = arith.addi %11, %c448_index : index + %0 = pto.alloc_tile addr = %c4096_i64 valid_row = %c1_index valid_col = %c64_index : !pto.tile_buf + %slice_view = pto.subview %rope_full_inline5180_inline5657__tile[%rope_pack_hh_inline5002_inline5496__idx_v0, %c0_index] sizes [1, 64] : !pto.tile_buf -> !pto.tile_buf + %o_packed_inline5034_inline5466__iter_v12_pview = pto.partition_view %o_packed_inline5034_inline5466__rv_v2_view, offsets = [%10, %12], sizes = [%c1_index, %c64_index] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xbf16> + pto.tstore ins(%slice_view : !pto.tile_buf) outs(%o_packed_inline5034_inline5466__iter_v12_pview : !pto.partition_tensor_view<1x64xbf16>) + } + } + return + } +} diff --git a/test/samples/DeepseekV4DecodeA3/decode_csa_test_incore_81_golden.py b/test/samples/DeepseekV4DecodeA3/decode_csa_test_incore_81_golden.py new file mode 100644 index 000000000..dd19fb8e8 --- /dev/null +++ b/test/samples/DeepseekV4DecodeA3/decode_csa_test_incore_81_golden.py @@ -0,0 +1,14 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +from deepseek_v4_decode_golden_lib import run_case + + +if __name__ == "__main__": + run_case("decode_csa_test_incore_81") diff --git a/test/samples/DeepseekV4DecodeA3/decode_hca_test_incore_54.pto b/test/samples/DeepseekV4DecodeA3/decode_hca_test_incore_54.pto new file mode 100644 index 000000000..97d2cc6e0 --- /dev/null +++ b/test/samples/DeepseekV4DecodeA3/decode_hca_test_incore_54.pto @@ -0,0 +1,53 @@ +module attributes {pto.target_arch = "a2a3"} { + func.func @decode_hca_test_incore_54(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %__pypto_spmd_block_idx: i32, %__pypto_spmd_block_num: i32) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0_i64 = arith.constant 0 : i64 + %c2048_i64 = arith.constant 2048 : i64 + %c4096_i64 = arith.constant 4096 : i64 + %c1024_index = arith.constant 1024 : index + %c4096_index = arith.constant 4096 : index + %c1_index = arith.constant 1 : index + %c8192_index = arith.constant 8192 : index + %c64_index = arith.constant 64 : index + %c8_index = arith.constant 8 : index + %c32_index = arith.constant 32 : index + %c0_index = arith.constant 0 : index + %c128_index = arith.constant 128 : index + %c512_index = arith.constant 512 : index + %c448_index = arith.constant 448 : index + %o_packed_inline8608_inline9732__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c1024_index, %c4096_index], strides = [%c4096_index, %c1_index] {layout = #pto.layout}: !pto.tensor_view + %rope_even_interleave_buf_inline8487_inline9733__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout}: !pto.tensor_view + %rope_odd_interleave_buf_inline8509_inline9849__rv_v2_view = 
pto.make_tensor_view %arg2, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout}: !pto.tensor_view + %rope_pack_block_inline8480_inline9901__ssa_v0 = arith.index_cast %__pypto_spmd_block_idx : i32 to index + %1 = arith.divsi %rope_pack_block_inline8480_inline9901__ssa_v0, %c8_index : index + %2 = arith.muli %1, %c8_index : index + %3 = arith.subi %rope_pack_block_inline8480_inline9901__ssa_v0, %2 : index + %4 = arith.muli %1, %c32_index : index + scf.for %rope_combine_dt_inline8470_inline9643__idx_v0 = %c0_index to %c32_index step %c1_index { + %5 = arith.addi %4, %rope_combine_dt_inline8470_inline9643__idx_v0 : index + %6 = arith.muli %5, %c64_index : index + %7 = arith.muli %3, %c8_index : index + %8 = arith.addi %6, %7 : index + %rope_even_tile_inline8559_inline9640__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + %rope_even_interleave_buf_inline8487_inline9733__rv_v2_pview = pto.partition_view %rope_even_interleave_buf_inline8487_inline9733__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> + pto.tload ins(%rope_even_interleave_buf_inline8487_inline9733__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_even_tile_inline8559_inline9640__tile : !pto.tile_buf) + %rope_odd_tile_inline8611_inline9639__tile = pto.alloc_tile addr = %c2048_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + %rope_odd_interleave_buf_inline8509_inline9849__rv_v2_pview = pto.partition_view %rope_odd_interleave_buf_inline8509_inline9849__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> + pto.tload ins(%rope_odd_interleave_buf_inline8509_inline9849__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_odd_tile_inline8611_inline9639__tile : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c0_i64 
valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + pto.tadd ins(%rope_even_tile_inline8559_inline9640__tile, %rope_odd_tile_inline8611_inline9639__tile : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) + %rope_full_inline8468_inline9638__tile = pto.alloc_tile addr = %c4096_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + pto.tcvt ins(%t__tile {rmode = #pto} : !pto.tile_buf) outs(%rope_full_inline8468_inline9638__tile : !pto.tile_buf) + %9 = arith.muli %3, %c128_index : index + %10 = arith.addi %9, %5 : index + scf.for %rope_pack_hh_inline8516_inline9636__idx_v0 = %c0_index to %c8_index step %c1_index { + %11 = arith.muli %rope_pack_hh_inline8516_inline9636__idx_v0, %c512_index : index + %12 = arith.addi %11, %c448_index : index + %0 = pto.alloc_tile addr = %c4096_i64 valid_row = %c1_index valid_col = %c64_index : !pto.tile_buf + %slice_view = pto.subview %rope_full_inline8468_inline9638__tile[%rope_pack_hh_inline8516_inline9636__idx_v0, %c0_index] sizes [1, 64] : !pto.tile_buf -> !pto.tile_buf + %o_packed_inline8608_inline9732__iter_v12_pview = pto.partition_view %o_packed_inline8608_inline9732__rv_v2_view, offsets = [%10, %12], sizes = [%c1_index, %c64_index] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xbf16> + pto.tstore ins(%slice_view : !pto.tile_buf) outs(%o_packed_inline8608_inline9732__iter_v12_pview : !pto.partition_tensor_view<1x64xbf16>) + } + } + return + } +} diff --git a/test/samples/DeepseekV4DecodeA3/decode_hca_test_incore_54_golden.py b/test/samples/DeepseekV4DecodeA3/decode_hca_test_incore_54_golden.py new file mode 100644 index 000000000..586712473 --- /dev/null +++ b/test/samples/DeepseekV4DecodeA3/decode_hca_test_incore_54_golden.py @@ -0,0 +1,14 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). 
+# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +from deepseek_v4_decode_golden_lib import run_case + + +if __name__ == "__main__": + run_case("decode_hca_test_incore_54") diff --git a/test/samples/DeepseekV4DecodeA3/decode_swa_test_incore_40.pto b/test/samples/DeepseekV4DecodeA3/decode_swa_test_incore_40.pto new file mode 100644 index 000000000..cd35a5270 --- /dev/null +++ b/test/samples/DeepseekV4DecodeA3/decode_swa_test_incore_40.pto @@ -0,0 +1,53 @@ +module attributes {pto.target_arch = "a2a3"} { + func.func @decode_swa_test_incore_40(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %__pypto_spmd_block_idx: i32, %__pypto_spmd_block_num: i32) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0_i64 = arith.constant 0 : i64 + %c2048_i64 = arith.constant 2048 : i64 + %c4096_i64 = arith.constant 4096 : i64 + %c1024_index = arith.constant 1024 : index + %c4096_index = arith.constant 4096 : index + %c1_index = arith.constant 1 : index + %c8192_index = arith.constant 8192 : index + %c64_index = arith.constant 64 : index + %c8_index = arith.constant 8 : index + %c32_index = arith.constant 32 : index + %c0_index = arith.constant 0 : index + %c128_index = arith.constant 128 : index + %c512_index = arith.constant 512 : index + %c448_index = arith.constant 448 : index + %o_packed_inline11580_inline11963__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c1024_index, %c4096_index], strides = [%c4096_index, %c1_index] {layout = #pto.layout}: !pto.tensor_view + %rope_even_interleave_buf_inline11549_inline12171__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c8192_index, %c64_index], strides = [%c64_index, 
%c1_index] {layout = #pto.layout}: !pto.tensor_view + %rope_odd_interleave_buf_inline11657_inline11858__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout}: !pto.tensor_view + %rope_pack_block_inline11502_inline11743__ssa_v0 = arith.index_cast %__pypto_spmd_block_idx : i32 to index + %1 = arith.divsi %rope_pack_block_inline11502_inline11743__ssa_v0, %c8_index : index + %2 = arith.muli %1, %c8_index : index + %3 = arith.subi %rope_pack_block_inline11502_inline11743__ssa_v0, %2 : index + %4 = arith.muli %1, %c32_index : index + scf.for %rope_combine_dt_inline11596_inline12136__idx_v0 = %c0_index to %c32_index step %c1_index { + %5 = arith.addi %4, %rope_combine_dt_inline11596_inline12136__idx_v0 : index + %6 = arith.muli %5, %c64_index : index + %7 = arith.muli %3, %c8_index : index + %8 = arith.addi %6, %7 : index + %rope_even_tile_inline11616_inline11737__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + %rope_even_interleave_buf_inline11549_inline12171__rv_v2_pview = pto.partition_view %rope_even_interleave_buf_inline11549_inline12171__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> + pto.tload ins(%rope_even_interleave_buf_inline11549_inline12171__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_even_tile_inline11616_inline11737__tile : !pto.tile_buf) + %rope_odd_tile_inline11499_inline11736__tile = pto.alloc_tile addr = %c2048_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + %rope_odd_interleave_buf_inline11657_inline11858__rv_v2_pview = pto.partition_view %rope_odd_interleave_buf_inline11657_inline11858__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> + pto.tload ins(%rope_odd_interleave_buf_inline11657_inline11858__rv_v2_pview : 
!pto.partition_tensor_view<8x64xf32>) outs(%rope_odd_tile_inline11499_inline11736__tile : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + pto.tadd ins(%rope_even_tile_inline11616_inline11737__tile, %rope_odd_tile_inline11499_inline11736__tile : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) + %rope_full_inline11567_inline11735__tile = pto.alloc_tile addr = %c4096_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + pto.tcvt ins(%t__tile {rmode = #pto} : !pto.tile_buf) outs(%rope_full_inline11567_inline11735__tile : !pto.tile_buf) + %9 = arith.muli %3, %c128_index : index + %10 = arith.addi %9, %5 : index + scf.for %rope_pack_hh_inline11511_inline11733__idx_v0 = %c0_index to %c8_index step %c1_index { + %11 = arith.muli %rope_pack_hh_inline11511_inline11733__idx_v0, %c512_index : index + %12 = arith.addi %11, %c448_index : index + %0 = pto.alloc_tile addr = %c4096_i64 valid_row = %c1_index valid_col = %c64_index : !pto.tile_buf + %slice_view = pto.subview %rope_full_inline11567_inline11735__tile[%rope_pack_hh_inline11511_inline11733__idx_v0, %c0_index] sizes [1, 64] : !pto.tile_buf -> !pto.tile_buf + %o_packed_inline11580_inline11963__iter_v12_pview = pto.partition_view %o_packed_inline11580_inline11963__rv_v2_view, offsets = [%10, %12], sizes = [%c1_index, %c64_index] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xbf16> + pto.tstore ins(%slice_view : !pto.tile_buf) outs(%o_packed_inline11580_inline11963__iter_v12_pview : !pto.partition_tensor_view<1x64xbf16>) + } + } + return + } +} diff --git a/test/samples/DeepseekV4DecodeA3/decode_swa_test_incore_40_golden.py b/test/samples/DeepseekV4DecodeA3/decode_swa_test_incore_40_golden.py new file mode 100644 index 000000000..9314c315d --- /dev/null +++ b/test/samples/DeepseekV4DecodeA3/decode_swa_test_incore_40_golden.py @@ -0,0 +1,14 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +from deepseek_v4_decode_golden_lib import run_case + + +if __name__ == "__main__": + run_case("decode_swa_test_incore_40") diff --git a/test/samples/DeepseekV4DecodeA3/deepseek_v4_decode_golden_lib.py b/test/samples/DeepseekV4DecodeA3/deepseek_v4_decode_golden_lib.py new file mode 100644 index 000000000..008d217c9 --- /dev/null +++ b/test/samples/DeepseekV4DecodeA3/deepseek_v4_decode_golden_lib.py @@ -0,0 +1,113 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +import numpy as np + +from validation_runtime import ( + float32_to_bf16, + load_case_meta, + load_int32_assignments, + rng, + write_buffers, + write_golden, +) + +SUPPORTED_CASES = frozenset({ + "attention_csa_test_refresh_incore_81", + "attention_hca_test_incore_54", + "attention_swa_test_incore_40", + "decode_csa_test_incore_81", + "decode_hca_test_incore_54", + "decode_swa_test_incore_40", + "sparse_attn_test_incore_7", +}) + +OUTPUT_ROWS = 1024 +OUTPUT_COLS = 4096 +INPUT_ROWS = 8192 +INPUT_COLS = 64 +BLOCK_GROUP = 8 +DT_PER_GROUP = 32 +HH_PER_TILE = 8 +OUTPUT_ROW_GROUP = 128 +OUTPUT_COL_STRIDE = 512 +OUTPUT_COL_BASE = 448 + + +def _require_count(meta, name: str, expected: int) -> None: + actual = int(meta.elem_counts[name]) + if actual != expected: + raise ValueError(f"{name}: expected {expected} elements, got {actual}") + + +def _make_bf16_zeros(meta, name: str, expected: int) -> np.ndarray: + _require_count(meta, name, expected) + return np.zeros(expected, dtype=meta.np_types[name]) + + +def _make_fp32_input(meta, name: str, generator, expected: int) -> np.ndarray: + _require_count(meta, name, expected) + values = generator.uniform(-0.5, 0.5, size=expected).astype(np.float32) + return values.astype(meta.np_types[name], copy=False) + + +def build_case(meta, generator, ints): + if meta.outputs != ["v1"]: + raise ValueError(f"unexpected outputs: {meta.outputs}") + if meta.read_order != ["v1", "v2", "v3"]: + raise ValueError(f"unexpected read order: {meta.read_order}") + if len(ints) < 2: + raise ValueError(f"expected block_idx/block_num int32 params, got {ints}") + + block_idx, block_num = ints[:2] + if block_num <= 0: + raise ValueError(f"invalid block_num={block_num}") + if block_idx < 0 or block_idx >= block_num: + raise ValueError(f"invalid block_idx={block_idx} for block_num={block_num}") + + output_elems = OUTPUT_ROWS * OUTPUT_COLS + input_elems = INPUT_ROWS * INPUT_COLS + buffers = { + "v1": _make_bf16_zeros(meta, "v1", output_elems), + "v2": 
_make_fp32_input(meta, "v2", generator, input_elems), + "v3": _make_fp32_input(meta, "v3", generator, input_elems), + } + + out = np.array(buffers["v1"], copy=True).reshape(OUTPUT_ROWS, OUTPUT_COLS) + rope_even = np.asarray(buffers["v2"], dtype=np.float32).reshape(INPUT_ROWS, INPUT_COLS) + rope_odd = np.asarray(buffers["v3"], dtype=np.float32).reshape(INPUT_ROWS, INPUT_COLS) + + group_idx = block_idx // BLOCK_GROUP + lane_idx = block_idx % BLOCK_GROUP + dt_base = group_idx * DT_PER_GROUP + out_row_base = lane_idx * OUTPUT_ROW_GROUP + src_row_lane_offset = lane_idx * HH_PER_TILE + + for dt in range(DT_PER_GROUP): + dt_idx = dt_base + dt + src_row = dt_idx * INPUT_COLS + src_row_lane_offset + tile = rope_even[src_row:src_row + HH_PER_TILE, :] + rope_odd[src_row:src_row + HH_PER_TILE, :] + tile_bf16 = float32_to_bf16(tile) + dst_row = out_row_base + dt_idx + for hh in range(HH_PER_TILE): + col0 = OUTPUT_COL_BASE + hh * OUTPUT_COL_STRIDE + out[dst_row, col0:col0 + INPUT_COLS] = tile_bf16[hh] + + return buffers, {"v1": out.reshape(-1)} + + +def run_case(case_name: str): + if case_name not in SUPPORTED_CASES: + raise KeyError(f"unsupported case: {case_name}") + meta = load_case_meta() + generator = rng() + ints = load_int32_assignments() + buffers, golden = build_case(meta, generator, ints) + write_buffers(meta, buffers) + write_golden(meta, golden) diff --git a/test/samples/DeepseekV4DecodeA3/sparse_attn_test_incore_7.pto b/test/samples/DeepseekV4DecodeA3/sparse_attn_test_incore_7.pto new file mode 100644 index 000000000..37788d06a --- /dev/null +++ b/test/samples/DeepseekV4DecodeA3/sparse_attn_test_incore_7.pto @@ -0,0 +1,53 @@ +module attributes {pto.target_arch = "a2a3"} { + func.func @sparse_attn_test_incore_7(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %__pypto_spmd_block_idx: i32, %__pypto_spmd_block_num: i32) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0_i64 = arith.constant 0 : i64 + %c2048_i64 = arith.constant 2048 : i64 + %c4096_i64 = 
arith.constant 4096 : i64 + %c1024_index = arith.constant 1024 : index + %c4096_index = arith.constant 4096 : index + %c1_index = arith.constant 1 : index + %c8192_index = arith.constant 8192 : index + %c64_index = arith.constant 64 : index + %c8_index = arith.constant 8 : index + %c32_index = arith.constant 32 : index + %c0_index = arith.constant 0 : index + %c128_index = arith.constant 128 : index + %c512_index = arith.constant 512 : index + %c448_index = arith.constant 448 : index + %o_packed_inline10659__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c1024_index, %c4096_index], strides = [%c4096_index, %c1_index] {layout = #pto.layout}: !pto.tensor_view + %rope_even_interleave_buf_inline10617__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout}: !pto.tensor_view + %rope_odd_interleave_buf_inline10794__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout}: !pto.tensor_view + %rope_pack_block_inline10638__ssa_v0 = arith.index_cast %__pypto_spmd_block_idx : i32 to index + %1 = arith.divsi %rope_pack_block_inline10638__ssa_v0, %c8_index : index + %2 = arith.muli %1, %c8_index : index + %3 = arith.subi %rope_pack_block_inline10638__ssa_v0, %2 : index + %4 = arith.muli %1, %c32_index : index + scf.for %rope_combine_dt_inline10624__idx_v0 = %c0_index to %c32_index step %c1_index { + %5 = arith.addi %4, %rope_combine_dt_inline10624__idx_v0 : index + %6 = arith.muli %5, %c64_index : index + %7 = arith.muli %3, %c8_index : index + %8 = arith.addi %6, %7 : index + %rope_even_tile_inline10786__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + %rope_even_interleave_buf_inline10617__rv_v2_pview = pto.partition_view %rope_even_interleave_buf_inline10617__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view -> 
!pto.partition_tensor_view<8x64xf32> + pto.tload ins(%rope_even_interleave_buf_inline10617__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_even_tile_inline10786__tile : !pto.tile_buf) + %rope_odd_tile_inline10683__tile = pto.alloc_tile addr = %c2048_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + %rope_odd_interleave_buf_inline10794__rv_v2_pview = pto.partition_view %rope_odd_interleave_buf_inline10794__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> + pto.tload ins(%rope_odd_interleave_buf_inline10794__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_odd_tile_inline10683__tile : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + pto.tadd ins(%rope_even_tile_inline10786__tile, %rope_odd_tile_inline10683__tile : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) + %rope_full_inline10618__tile = pto.alloc_tile addr = %c4096_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + pto.tcvt ins(%t__tile {rmode = #pto} : !pto.tile_buf) outs(%rope_full_inline10618__tile : !pto.tile_buf) + %9 = arith.muli %3, %c128_index : index + %10 = arith.addi %9, %5 : index + scf.for %rope_pack_hh_inline10802__idx_v0 = %c0_index to %c8_index step %c1_index { + %11 = arith.muli %rope_pack_hh_inline10802__idx_v0, %c512_index : index + %12 = arith.addi %11, %c448_index : index + %0 = pto.alloc_tile addr = %c4096_i64 valid_row = %c1_index valid_col = %c64_index : !pto.tile_buf + %slice_view = pto.subview %rope_full_inline10618__tile[%rope_pack_hh_inline10802__idx_v0, %c0_index] sizes [1, 64] : !pto.tile_buf -> !pto.tile_buf + %o_packed_inline10659__iter_v12_pview = pto.partition_view %o_packed_inline10659__rv_v2_view, offsets = [%10, %12], sizes = [%c1_index, %c64_index] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xbf16> + pto.tstore ins(%slice_view : 
!pto.tile_buf) outs(%o_packed_inline10659__iter_v12_pview : !pto.partition_tensor_view<1x64xbf16>) + } + } + return + } +} diff --git a/test/samples/DeepseekV4DecodeA3/sparse_attn_test_incore_7_golden.py b/test/samples/DeepseekV4DecodeA3/sparse_attn_test_incore_7_golden.py new file mode 100644 index 000000000..94772cb17 --- /dev/null +++ b/test/samples/DeepseekV4DecodeA3/sparse_attn_test_incore_7_golden.py @@ -0,0 +1,14 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +from deepseek_v4_decode_golden_lib import run_case + + +if __name__ == "__main__": + run_case("sparse_attn_test_incore_7") diff --git a/test/samples/DeepseekV4DecodeA5/README.md b/test/samples/DeepseekV4DecodeA5/README.md new file mode 100644 index 000000000..64aeaaf75 --- /dev/null +++ b/test/samples/DeepseekV4DecodeA5/README.md @@ -0,0 +1,18 @@ +DeepSeek V4 decode PTO kernels for A5, generated from `hw-native-sys/pypto-lib` `models/deepseek/v4` at commit `be3c7942420b48fbab4ab1150edbc4ca8a125b94`. 
+ +Scope: +- compile-regression inputs for `ptoas` +- board-validation inputs for direct `.pto` kernels + +Notes: +- This directory vendors the primary raw `.pto` fragments emitted from these source modules: + - `decode_attention_csa.py` + - `decode_attention_hca.py` + - `decode_attention_swa.py` + - `decode_csa.py` + - `decode_hca.py` + - `decode_sparse_attn.py` + - `decode_swa.py` +- The `.pto` file contents are copied directly from PyPTO raw PTO backend output and are not hand-edited. +- `runop.sh` defaults these cases to `--pto-arch=a5 --pto-level=level3`. +- Board-validation uses custom `*_golden.py` references for the standalone rope-pack kernel and full-buffer sizing/default block args wired in `generate_testcase.py`. diff --git a/test/samples/DeepseekV4DecodeA5/attention_csa_test_refresh_incore_81.pto b/test/samples/DeepseekV4DecodeA5/attention_csa_test_refresh_incore_81.pto new file mode 100644 index 000000000..f9e79f0d0 --- /dev/null +++ b/test/samples/DeepseekV4DecodeA5/attention_csa_test_refresh_incore_81.pto @@ -0,0 +1,53 @@ +module attributes {pto.target_arch = "a5"} { + func.func @attention_csa_test_refresh_incore_81(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %__pypto_spmd_block_idx: i32, %__pypto_spmd_block_num: i32) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0_i64 = arith.constant 0 : i64 + %c2048_i64 = arith.constant 2048 : i64 + %c4096_i64 = arith.constant 4096 : i64 + %c1024_index = arith.constant 1024 : index + %c4096_index = arith.constant 4096 : index + %c1_index = arith.constant 1 : index + %c8192_index = arith.constant 8192 : index + %c64_index = arith.constant 64 : index + %c8_index = arith.constant 8 : index + %c32_index = arith.constant 32 : index + %c0_index = arith.constant 0 : index + %c128_index = arith.constant 128 : index + %c512_index = arith.constant 512 : index + %c448_index = arith.constant 448 : index + %o_packed_inline1689__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c1024_index, %c4096_index], 
strides = [%c4096_index, %c1_index] {layout = #pto.layout}: !pto.tensor_view + %rope_even_interleave_buf_inline1729__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout}: !pto.tensor_view + %rope_odd_interleave_buf_inline1709__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout}: !pto.tensor_view + %rope_pack_block_inline1568__ssa_v0 = arith.index_cast %__pypto_spmd_block_idx : i32 to index + %1 = arith.divsi %rope_pack_block_inline1568__ssa_v0, %c8_index : index + %2 = arith.muli %1, %c8_index : index + %3 = arith.subi %rope_pack_block_inline1568__ssa_v0, %2 : index + %4 = arith.muli %1, %c32_index : index + scf.for %rope_combine_dt_inline1562__idx_v0 = %c0_index to %c32_index step %c1_index { + %5 = arith.addi %4, %rope_combine_dt_inline1562__idx_v0 : index + %6 = arith.muli %5, %c64_index : index + %7 = arith.muli %3, %c8_index : index + %8 = arith.addi %6, %7 : index + %rope_even_tile_inline1627__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + %rope_even_interleave_buf_inline1729__rv_v2_pview = pto.partition_view %rope_even_interleave_buf_inline1729__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> + pto.tload ins(%rope_even_interleave_buf_inline1729__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_even_tile_inline1627__tile : !pto.tile_buf) + %rope_odd_tile_inline1733__tile = pto.alloc_tile addr = %c2048_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + %rope_odd_interleave_buf_inline1709__rv_v2_pview = pto.partition_view %rope_odd_interleave_buf_inline1709__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> + pto.tload 
ins(%rope_odd_interleave_buf_inline1709__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_odd_tile_inline1733__tile : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + pto.tadd ins(%rope_even_tile_inline1627__tile, %rope_odd_tile_inline1733__tile : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) + %rope_full_inline1651__tile = pto.alloc_tile addr = %c4096_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + pto.tcvt ins(%t__tile {rmode = #pto} : !pto.tile_buf) outs(%rope_full_inline1651__tile : !pto.tile_buf) + %9 = arith.muli %3, %c128_index : index + %10 = arith.addi %9, %5 : index + scf.for %rope_pack_hh_inline1748__idx_v0 = %c0_index to %c8_index step %c1_index { + %11 = arith.muli %rope_pack_hh_inline1748__idx_v0, %c512_index : index + %12 = arith.addi %11, %c448_index : index + %0 = pto.alloc_tile addr = %c4096_i64 valid_row = %c1_index valid_col = %c64_index : !pto.tile_buf + %slice_view = pto.subview %rope_full_inline1651__tile[%rope_pack_hh_inline1748__idx_v0, %c0_index] sizes [1, 64] : !pto.tile_buf -> !pto.tile_buf + %o_packed_inline1689__iter_v12_pview = pto.partition_view %o_packed_inline1689__rv_v2_view, offsets = [%10, %12], sizes = [%c1_index, %c64_index] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xbf16> + pto.tstore ins(%slice_view : !pto.tile_buf) outs(%o_packed_inline1689__iter_v12_pview : !pto.partition_tensor_view<1x64xbf16>) + } + } + return + } +} diff --git a/test/samples/DeepseekV4DecodeA5/attention_csa_test_refresh_incore_81_golden.py b/test/samples/DeepseekV4DecodeA5/attention_csa_test_refresh_incore_81_golden.py new file mode 100644 index 000000000..be04f4648 --- /dev/null +++ b/test/samples/DeepseekV4DecodeA5/attention_csa_test_refresh_incore_81_golden.py @@ -0,0 +1,14 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +from deepseek_v4_decode_golden_lib import run_case + + +if __name__ == "__main__": + run_case("attention_csa_test_refresh_incore_81") diff --git a/test/samples/DeepseekV4DecodeA5/attention_hca_test_incore_54.pto b/test/samples/DeepseekV4DecodeA5/attention_hca_test_incore_54.pto new file mode 100644 index 000000000..92efddc85 --- /dev/null +++ b/test/samples/DeepseekV4DecodeA5/attention_hca_test_incore_54.pto @@ -0,0 +1,53 @@ +module attributes {pto.target_arch = "a5"} { + func.func @attention_hca_test_incore_54(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %__pypto_spmd_block_idx: i32, %__pypto_spmd_block_num: i32) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0_i64 = arith.constant 0 : i64 + %c2048_i64 = arith.constant 2048 : i64 + %c4096_i64 = arith.constant 4096 : i64 + %c1024_index = arith.constant 1024 : index + %c4096_index = arith.constant 4096 : index + %c1_index = arith.constant 1 : index + %c8192_index = arith.constant 8192 : index + %c64_index = arith.constant 64 : index + %c8_index = arith.constant 8 : index + %c32_index = arith.constant 32 : index + %c0_index = arith.constant 0 : index + %c128_index = arith.constant 128 : index + %c512_index = arith.constant 512 : index + %c448_index = arith.constant 448 : index + %o_packed_inline2857__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c1024_index, %c4096_index], strides = [%c4096_index, %c1_index] {layout = 
#pto.layout}: !pto.tensor_view + %rope_even_interleave_buf_inline2862__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout}: !pto.tensor_view + %rope_odd_interleave_buf_inline2930__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout}: !pto.tensor_view + %rope_pack_block_inline2876__ssa_v0 = arith.index_cast %__pypto_spmd_block_idx : i32 to index + %1 = arith.divsi %rope_pack_block_inline2876__ssa_v0, %c8_index : index + %2 = arith.muli %1, %c8_index : index + %3 = arith.subi %rope_pack_block_inline2876__ssa_v0, %2 : index + %4 = arith.muli %1, %c32_index : index + scf.for %rope_combine_dt_inline2873__idx_v0 = %c0_index to %c32_index step %c1_index { + %5 = arith.addi %4, %rope_combine_dt_inline2873__idx_v0 : index + %6 = arith.muli %5, %c64_index : index + %7 = arith.muli %3, %c8_index : index + %8 = arith.addi %6, %7 : index + %rope_even_tile_inline2893__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + %rope_even_interleave_buf_inline2862__rv_v2_pview = pto.partition_view %rope_even_interleave_buf_inline2862__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> + pto.tload ins(%rope_even_interleave_buf_inline2862__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_even_tile_inline2893__tile : !pto.tile_buf) + %rope_odd_tile_inline2867__tile = pto.alloc_tile addr = %c2048_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + %rope_odd_interleave_buf_inline2930__rv_v2_pview = pto.partition_view %rope_odd_interleave_buf_inline2930__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> + pto.tload ins(%rope_odd_interleave_buf_inline2930__rv_v2_pview : 
!pto.partition_tensor_view<8x64xf32>) outs(%rope_odd_tile_inline2867__tile : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + pto.tadd ins(%rope_even_tile_inline2893__tile, %rope_odd_tile_inline2867__tile : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) + %rope_full_inline2784__tile = pto.alloc_tile addr = %c4096_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + pto.tcvt ins(%t__tile {rmode = #pto} : !pto.tile_buf) outs(%rope_full_inline2784__tile : !pto.tile_buf) + %9 = arith.muli %3, %c128_index : index + %10 = arith.addi %9, %5 : index + scf.for %rope_pack_hh_inline2871__idx_v0 = %c0_index to %c8_index step %c1_index { + %11 = arith.muli %rope_pack_hh_inline2871__idx_v0, %c512_index : index + %12 = arith.addi %11, %c448_index : index + %0 = pto.alloc_tile addr = %c4096_i64 valid_row = %c1_index valid_col = %c64_index : !pto.tile_buf + %slice_view = pto.subview %rope_full_inline2784__tile[%rope_pack_hh_inline2871__idx_v0, %c0_index] sizes [1, 64] : !pto.tile_buf -> !pto.tile_buf + %o_packed_inline2857__iter_v12_pview = pto.partition_view %o_packed_inline2857__rv_v2_view, offsets = [%10, %12], sizes = [%c1_index, %c64_index] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xbf16> + pto.tstore ins(%slice_view : !pto.tile_buf) outs(%o_packed_inline2857__iter_v12_pview : !pto.partition_tensor_view<1x64xbf16>) + } + } + return + } +} diff --git a/test/samples/DeepseekV4DecodeA5/attention_hca_test_incore_54_golden.py b/test/samples/DeepseekV4DecodeA5/attention_hca_test_incore_54_golden.py new file mode 100644 index 000000000..c06cd7cad --- /dev/null +++ b/test/samples/DeepseekV4DecodeA5/attention_hca_test_incore_54_golden.py @@ -0,0 +1,14 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +from deepseek_v4_decode_golden_lib import run_case + + +if __name__ == "__main__": + run_case("attention_hca_test_incore_54") diff --git a/test/samples/DeepseekV4DecodeA5/attention_swa_test_incore_40.pto b/test/samples/DeepseekV4DecodeA5/attention_swa_test_incore_40.pto new file mode 100644 index 000000000..e8ba1da42 --- /dev/null +++ b/test/samples/DeepseekV4DecodeA5/attention_swa_test_incore_40.pto @@ -0,0 +1,53 @@ +module attributes {pto.target_arch = "a5"} { + func.func @attention_swa_test_incore_40(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %__pypto_spmd_block_idx: i32, %__pypto_spmd_block_num: i32) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0_i64 = arith.constant 0 : i64 + %c2048_i64 = arith.constant 2048 : i64 + %c4096_i64 = arith.constant 4096 : i64 + %c1024_index = arith.constant 1024 : index + %c4096_index = arith.constant 4096 : index + %c1_index = arith.constant 1 : index + %c8192_index = arith.constant 8192 : index + %c64_index = arith.constant 64 : index + %c8_index = arith.constant 8 : index + %c32_index = arith.constant 32 : index + %c0_index = arith.constant 0 : index + %c128_index = arith.constant 128 : index + %c512_index = arith.constant 512 : index + %c448_index = arith.constant 448 : index + %o_packed_inline3851__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c1024_index, %c4096_index], strides = [%c4096_index, %c1_index] {layout = 
#pto.layout}: !pto.tensor_view + %rope_even_interleave_buf_inline3880__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout}: !pto.tensor_view + %rope_odd_interleave_buf_inline3847__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout}: !pto.tensor_view + %rope_pack_block_inline3768__ssa_v0 = arith.index_cast %__pypto_spmd_block_idx : i32 to index + %1 = arith.divsi %rope_pack_block_inline3768__ssa_v0, %c8_index : index + %2 = arith.muli %1, %c8_index : index + %3 = arith.subi %rope_pack_block_inline3768__ssa_v0, %2 : index + %4 = arith.muli %1, %c32_index : index + scf.for %rope_combine_dt_inline3765__idx_v0 = %c0_index to %c32_index step %c1_index { + %5 = arith.addi %4, %rope_combine_dt_inline3765__idx_v0 : index + %6 = arith.muli %5, %c64_index : index + %7 = arith.muli %3, %c8_index : index + %8 = arith.addi %6, %7 : index + %rope_even_tile_inline3791__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + %rope_even_interleave_buf_inline3880__rv_v2_pview = pto.partition_view %rope_even_interleave_buf_inline3880__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> + pto.tload ins(%rope_even_interleave_buf_inline3880__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_even_tile_inline3791__tile : !pto.tile_buf) + %rope_odd_tile_inline3796__tile = pto.alloc_tile addr = %c2048_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + %rope_odd_interleave_buf_inline3847__rv_v2_pview = pto.partition_view %rope_odd_interleave_buf_inline3847__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> + pto.tload ins(%rope_odd_interleave_buf_inline3847__rv_v2_pview : 
!pto.partition_tensor_view<8x64xf32>) outs(%rope_odd_tile_inline3796__tile : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + pto.tadd ins(%rope_even_tile_inline3791__tile, %rope_odd_tile_inline3796__tile : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) + %rope_full_inline3924__tile = pto.alloc_tile addr = %c4096_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + pto.tcvt ins(%t__tile {rmode = #pto} : !pto.tile_buf) outs(%rope_full_inline3924__tile : !pto.tile_buf) + %9 = arith.muli %3, %c128_index : index + %10 = arith.addi %9, %5 : index + scf.for %rope_pack_hh_inline3762__idx_v0 = %c0_index to %c8_index step %c1_index { + %11 = arith.muli %rope_pack_hh_inline3762__idx_v0, %c512_index : index + %12 = arith.addi %11, %c448_index : index + %0 = pto.alloc_tile addr = %c4096_i64 valid_row = %c1_index valid_col = %c64_index : !pto.tile_buf + %slice_view = pto.subview %rope_full_inline3924__tile[%rope_pack_hh_inline3762__idx_v0, %c0_index] sizes [1, 64] : !pto.tile_buf -> !pto.tile_buf + %o_packed_inline3851__iter_v12_pview = pto.partition_view %o_packed_inline3851__rv_v2_view, offsets = [%10, %12], sizes = [%c1_index, %c64_index] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xbf16> + pto.tstore ins(%slice_view : !pto.tile_buf) outs(%o_packed_inline3851__iter_v12_pview : !pto.partition_tensor_view<1x64xbf16>) + } + } + return + } +} diff --git a/test/samples/DeepseekV4DecodeA5/attention_swa_test_incore_40_golden.py b/test/samples/DeepseekV4DecodeA5/attention_swa_test_incore_40_golden.py new file mode 100644 index 000000000..d952e67d4 --- /dev/null +++ b/test/samples/DeepseekV4DecodeA5/attention_swa_test_incore_40_golden.py @@ -0,0 +1,14 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +from deepseek_v4_decode_golden_lib import run_case + + +if __name__ == "__main__": + run_case("attention_swa_test_incore_40") diff --git a/test/samples/DeepseekV4DecodeA5/decode_csa_test_incore_81.pto b/test/samples/DeepseekV4DecodeA5/decode_csa_test_incore_81.pto new file mode 100644 index 000000000..6d90a8cc0 --- /dev/null +++ b/test/samples/DeepseekV4DecodeA5/decode_csa_test_incore_81.pto @@ -0,0 +1,53 @@ +module attributes {pto.target_arch = "a5"} { + func.func @decode_csa_test_incore_81(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %__pypto_spmd_block_idx: i32, %__pypto_spmd_block_num: i32) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0_i64 = arith.constant 0 : i64 + %c2048_i64 = arith.constant 2048 : i64 + %c4096_i64 = arith.constant 4096 : i64 + %c1024_index = arith.constant 1024 : index + %c4096_index = arith.constant 4096 : index + %c1_index = arith.constant 1 : index + %c8192_index = arith.constant 8192 : index + %c64_index = arith.constant 64 : index + %c8_index = arith.constant 8 : index + %c32_index = arith.constant 32 : index + %c0_index = arith.constant 0 : index + %c128_index = arith.constant 128 : index + %c512_index = arith.constant 512 : index + %c448_index = arith.constant 448 : index + %o_packed_inline5166_inline5302__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c1024_index, %c4096_index], strides = [%c4096_index, %c1_index] {layout = 
#pto.layout}: !pto.tensor_view + %rope_even_interleave_buf_inline5130_inline5946__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout}: !pto.tensor_view + %rope_odd_interleave_buf_inline5186_inline5504__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout}: !pto.tensor_view + %rope_pack_block_inline5024_inline5227__ssa_v0 = arith.index_cast %__pypto_spmd_block_idx : i32 to index + %1 = arith.divsi %rope_pack_block_inline5024_inline5227__ssa_v0, %c8_index : index + %2 = arith.muli %1, %c8_index : index + %3 = arith.subi %rope_pack_block_inline5024_inline5227__ssa_v0, %2 : index + %4 = arith.muli %1, %c32_index : index + scf.for %rope_combine_dt_inline5054_inline5659__idx_v0 = %c0_index to %c32_index step %c1_index { + %5 = arith.addi %4, %rope_combine_dt_inline5054_inline5659__idx_v0 : index + %6 = arith.muli %5, %c64_index : index + %7 = arith.muli %3, %c8_index : index + %8 = arith.addi %6, %7 : index + %rope_even_tile_inline4995_inline5303__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + %rope_even_interleave_buf_inline5130_inline5946__rv_v2_pview = pto.partition_view %rope_even_interleave_buf_inline5130_inline5946__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> + pto.tload ins(%rope_even_interleave_buf_inline5130_inline5946__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_even_tile_inline4995_inline5303__tile : !pto.tile_buf) + %rope_odd_tile_inline5019_inline5875__tile = pto.alloc_tile addr = %c2048_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + %rope_odd_interleave_buf_inline5186_inline5504__rv_v2_pview = pto.partition_view %rope_odd_interleave_buf_inline5186_inline5504__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, 
%c64_index] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> + pto.tload ins(%rope_odd_interleave_buf_inline5186_inline5504__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_odd_tile_inline5019_inline5875__tile : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + pto.tadd ins(%rope_even_tile_inline4995_inline5303__tile, %rope_odd_tile_inline5019_inline5875__tile : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) + %rope_full_inline4992_inline5443__tile = pto.alloc_tile addr = %c4096_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + pto.tcvt ins(%t__tile {rmode = #pto} : !pto.tile_buf) outs(%rope_full_inline4992_inline5443__tile : !pto.tile_buf) + %9 = arith.muli %3, %c128_index : index + %10 = arith.addi %9, %5 : index + scf.for %rope_pack_hh_inline4999_inline5764__idx_v0 = %c0_index to %c8_index step %c1_index { + %11 = arith.muli %rope_pack_hh_inline4999_inline5764__idx_v0, %c512_index : index + %12 = arith.addi %11, %c448_index : index + %0 = pto.alloc_tile addr = %c4096_i64 valid_row = %c1_index valid_col = %c64_index : !pto.tile_buf + %slice_view = pto.subview %rope_full_inline4992_inline5443__tile[%rope_pack_hh_inline4999_inline5764__idx_v0, %c0_index] sizes [1, 64] : !pto.tile_buf -> !pto.tile_buf + %o_packed_inline5166_inline5302__iter_v12_pview = pto.partition_view %o_packed_inline5166_inline5302__rv_v2_view, offsets = [%10, %12], sizes = [%c1_index, %c64_index] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xbf16> + pto.tstore ins(%slice_view : !pto.tile_buf) outs(%o_packed_inline5166_inline5302__iter_v12_pview : !pto.partition_tensor_view<1x64xbf16>) + } + } + return + } +} diff --git a/test/samples/DeepseekV4DecodeA5/decode_csa_test_incore_81_golden.py b/test/samples/DeepseekV4DecodeA5/decode_csa_test_incore_81_golden.py new file mode 100644 index 000000000..dd19fb8e8 --- /dev/null +++ 
b/test/samples/DeepseekV4DecodeA5/decode_csa_test_incore_81_golden.py @@ -0,0 +1,14 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +from deepseek_v4_decode_golden_lib import run_case + + +if __name__ == "__main__": + run_case("decode_csa_test_incore_81") diff --git a/test/samples/DeepseekV4DecodeA5/decode_hca_test_incore_54.pto b/test/samples/DeepseekV4DecodeA5/decode_hca_test_incore_54.pto new file mode 100644 index 000000000..86635e446 --- /dev/null +++ b/test/samples/DeepseekV4DecodeA5/decode_hca_test_incore_54.pto @@ -0,0 +1,53 @@ +module attributes {pto.target_arch = "a5"} { + func.func @decode_hca_test_incore_54(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %__pypto_spmd_block_idx: i32, %__pypto_spmd_block_num: i32) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0_i64 = arith.constant 0 : i64 + %c2048_i64 = arith.constant 2048 : i64 + %c4096_i64 = arith.constant 4096 : i64 + %c1024_index = arith.constant 1024 : index + %c4096_index = arith.constant 4096 : index + %c1_index = arith.constant 1 : index + %c8192_index = arith.constant 8192 : index + %c64_index = arith.constant 64 : index + %c8_index = arith.constant 8 : index + %c32_index = arith.constant 32 : index + %c0_index = arith.constant 0 : index + %c128_index = arith.constant 128 : index + %c512_index = arith.constant 512 : index + %c448_index = arith.constant 448 : index + 
%o_packed_inline8609_inline9735__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c1024_index, %c4096_index], strides = [%c4096_index, %c1_index] {layout = #pto.layout}: !pto.tensor_view + %rope_even_interleave_buf_inline8627_inline9736__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout}: !pto.tensor_view + %rope_odd_interleave_buf_inline8617_inline9986__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout}: !pto.tensor_view + %rope_pack_block_inline8470_inline9832__ssa_v0 = arith.index_cast %__pypto_spmd_block_idx : i32 to index + %1 = arith.divsi %rope_pack_block_inline8470_inline9832__ssa_v0, %c8_index : index + %2 = arith.muli %1, %c8_index : index + %3 = arith.subi %rope_pack_block_inline8470_inline9832__ssa_v0, %2 : index + %4 = arith.muli %1, %c32_index : index + scf.for %rope_combine_dt_inline8542_inline10208__idx_v0 = %c0_index to %c32_index step %c1_index { + %5 = arith.addi %4, %rope_combine_dt_inline8542_inline10208__idx_v0 : index + %6 = arith.muli %5, %c64_index : index + %7 = arith.muli %3, %c8_index : index + %8 = arith.addi %6, %7 : index + %rope_even_tile_inline8641_inline10183__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + %rope_even_interleave_buf_inline8627_inline9736__rv_v2_pview = pto.partition_view %rope_even_interleave_buf_inline8627_inline9736__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> + pto.tload ins(%rope_even_interleave_buf_inline8627_inline9736__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_even_tile_inline8641_inline10183__tile : !pto.tile_buf) + %rope_odd_tile_inline8468_inline9641__tile = pto.alloc_tile addr = %c2048_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + 
%rope_odd_interleave_buf_inline8617_inline9986__rv_v2_pview = pto.partition_view %rope_odd_interleave_buf_inline8617_inline9986__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> + pto.tload ins(%rope_odd_interleave_buf_inline8617_inline9986__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_odd_tile_inline8468_inline9641__tile : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + pto.tadd ins(%rope_even_tile_inline8641_inline10183__tile, %rope_odd_tile_inline8468_inline9641__tile : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) + %rope_full_inline8624_inline9902__tile = pto.alloc_tile addr = %c4096_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + pto.tcvt ins(%t__tile {rmode = #pto} : !pto.tile_buf) outs(%rope_full_inline8624_inline9902__tile : !pto.tile_buf) + %9 = arith.muli %3, %c128_index : index + %10 = arith.addi %9, %5 : index + scf.for %rope_pack_hh_inline8588_inline9639__idx_v0 = %c0_index to %c8_index step %c1_index { + %11 = arith.muli %rope_pack_hh_inline8588_inline9639__idx_v0, %c512_index : index + %12 = arith.addi %11, %c448_index : index + %0 = pto.alloc_tile addr = %c4096_i64 valid_row = %c1_index valid_col = %c64_index : !pto.tile_buf + %slice_view = pto.subview %rope_full_inline8624_inline9902__tile[%rope_pack_hh_inline8588_inline9639__idx_v0, %c0_index] sizes [1, 64] : !pto.tile_buf -> !pto.tile_buf + %o_packed_inline8609_inline9735__iter_v12_pview = pto.partition_view %o_packed_inline8609_inline9735__rv_v2_view, offsets = [%10, %12], sizes = [%c1_index, %c64_index] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xbf16> + pto.tstore ins(%slice_view : !pto.tile_buf) outs(%o_packed_inline8609_inline9735__iter_v12_pview : !pto.partition_tensor_view<1x64xbf16>) + } + } + return + } +} diff --git 
a/test/samples/DeepseekV4DecodeA5/decode_hca_test_incore_54_golden.py b/test/samples/DeepseekV4DecodeA5/decode_hca_test_incore_54_golden.py new file mode 100644 index 000000000..586712473 --- /dev/null +++ b/test/samples/DeepseekV4DecodeA5/decode_hca_test_incore_54_golden.py @@ -0,0 +1,14 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +from deepseek_v4_decode_golden_lib import run_case + + +if __name__ == "__main__": + run_case("decode_hca_test_incore_54") diff --git a/test/samples/DeepseekV4DecodeA5/decode_swa_test_incore_40.pto b/test/samples/DeepseekV4DecodeA5/decode_swa_test_incore_40.pto new file mode 100644 index 000000000..f4535d03a --- /dev/null +++ b/test/samples/DeepseekV4DecodeA5/decode_swa_test_incore_40.pto @@ -0,0 +1,53 @@ +module attributes {pto.target_arch = "a5"} { + func.func @decode_swa_test_incore_40(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %__pypto_spmd_block_idx: i32, %__pypto_spmd_block_num: i32) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0_i64 = arith.constant 0 : i64 + %c2048_i64 = arith.constant 2048 : i64 + %c4096_i64 = arith.constant 4096 : i64 + %c1024_index = arith.constant 1024 : index + %c4096_index = arith.constant 4096 : index + %c1_index = arith.constant 1 : index + %c8192_index = arith.constant 8192 : index + %c64_index = arith.constant 64 : index + %c8_index = arith.constant 8 : index + %c32_index = 
arith.constant 32 : index + %c0_index = arith.constant 0 : index + %c128_index = arith.constant 128 : index + %c512_index = arith.constant 512 : index + %c448_index = arith.constant 448 : index + %o_packed_inline11646_inline11838__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c1024_index, %c4096_index], strides = [%c4096_index, %c1_index] {layout = #pto.layout}: !pto.tensor_view + %rope_even_interleave_buf_inline11658_inline11945__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout}: !pto.tensor_view + %rope_odd_interleave_buf_inline11652_inline11841__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout}: !pto.tensor_view + %rope_pack_block_inline11509_inline11925__ssa_v0 = arith.index_cast %__pypto_spmd_block_idx : i32 to index + %1 = arith.divsi %rope_pack_block_inline11509_inline11925__ssa_v0, %c8_index : index + %2 = arith.muli %1, %c8_index : index + %3 = arith.subi %rope_pack_block_inline11509_inline11925__ssa_v0, %2 : index + %4 = arith.muli %1, %c32_index : index + scf.for %rope_combine_dt_inline11505_inline11741__idx_v0 = %c0_index to %c32_index step %c1_index { + %5 = arith.addi %4, %rope_combine_dt_inline11505_inline11741__idx_v0 : index + %6 = arith.muli %5, %c64_index : index + %7 = arith.muli %3, %c8_index : index + %8 = arith.addi %6, %7 : index + %rope_even_tile_inline11616_inline11782__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + %rope_even_interleave_buf_inline11658_inline11945__rv_v2_pview = pto.partition_view %rope_even_interleave_buf_inline11658_inline11945__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> + pto.tload ins(%rope_even_interleave_buf_inline11658_inline11945__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) 
outs(%rope_even_tile_inline11616_inline11782__tile : !pto.tile_buf) + %rope_odd_tile_inline11503_inline11738__tile = pto.alloc_tile addr = %c2048_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + %rope_odd_interleave_buf_inline11652_inline11841__rv_v2_pview = pto.partition_view %rope_odd_interleave_buf_inline11652_inline11841__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> + pto.tload ins(%rope_odd_interleave_buf_inline11652_inline11841__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_odd_tile_inline11503_inline11738__tile : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + pto.tadd ins(%rope_even_tile_inline11616_inline11782__tile, %rope_odd_tile_inline11503_inline11738__tile : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) + %rope_full_inline11673_inline12024__tile = pto.alloc_tile addr = %c4096_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + pto.tcvt ins(%t__tile {rmode = #pto} : !pto.tile_buf) outs(%rope_full_inline11673_inline12024__tile : !pto.tile_buf) + %9 = arith.muli %3, %c128_index : index + %10 = arith.addi %9, %5 : index + scf.for %rope_pack_hh_inline11502_inline11737__idx_v0 = %c0_index to %c8_index step %c1_index { + %11 = arith.muli %rope_pack_hh_inline11502_inline11737__idx_v0, %c512_index : index + %12 = arith.addi %11, %c448_index : index + %0 = pto.alloc_tile addr = %c4096_i64 valid_row = %c1_index valid_col = %c64_index : !pto.tile_buf + %slice_view = pto.subview %rope_full_inline11673_inline12024__tile[%rope_pack_hh_inline11502_inline11737__idx_v0, %c0_index] sizes [1, 64] : !pto.tile_buf -> !pto.tile_buf + %o_packed_inline11646_inline11838__iter_v12_pview = pto.partition_view %o_packed_inline11646_inline11838__rv_v2_view, offsets = [%10, %12], sizes = [%c1_index, %c64_index] : !pto.tensor_view -> 
!pto.partition_tensor_view<1x64xbf16> + pto.tstore ins(%slice_view : !pto.tile_buf) outs(%o_packed_inline11646_inline11838__iter_v12_pview : !pto.partition_tensor_view<1x64xbf16>) + } + } + return + } +} diff --git a/test/samples/DeepseekV4DecodeA5/decode_swa_test_incore_40_golden.py b/test/samples/DeepseekV4DecodeA5/decode_swa_test_incore_40_golden.py new file mode 100644 index 000000000..9314c315d --- /dev/null +++ b/test/samples/DeepseekV4DecodeA5/decode_swa_test_incore_40_golden.py @@ -0,0 +1,14 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +from deepseek_v4_decode_golden_lib import run_case + + +if __name__ == "__main__": + run_case("decode_swa_test_incore_40") diff --git a/test/samples/DeepseekV4DecodeA5/deepseek_v4_decode_golden_lib.py b/test/samples/DeepseekV4DecodeA5/deepseek_v4_decode_golden_lib.py new file mode 100644 index 000000000..008d217c9 --- /dev/null +++ b/test/samples/DeepseekV4DecodeA5/deepseek_v4_decode_golden_lib.py @@ -0,0 +1,113 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +import numpy as np + +from validation_runtime import ( + float32_to_bf16, + load_case_meta, + load_int32_assignments, + rng, + write_buffers, + write_golden, +) + +SUPPORTED_CASES = frozenset({ + "attention_csa_test_refresh_incore_81", + "attention_hca_test_incore_54", + "attention_swa_test_incore_40", + "decode_csa_test_incore_81", + "decode_hca_test_incore_54", + "decode_swa_test_incore_40", + "sparse_attn_test_incore_7", +}) + +OUTPUT_ROWS = 1024 +OUTPUT_COLS = 4096 +INPUT_ROWS = 8192 +INPUT_COLS = 64 +BLOCK_GROUP = 8 +DT_PER_GROUP = 32 +HH_PER_TILE = 8 +OUTPUT_ROW_GROUP = 128 +OUTPUT_COL_STRIDE = 512 +OUTPUT_COL_BASE = 448 + + +def _require_count(meta, name: str, expected: int) -> None: + actual = int(meta.elem_counts[name]) + if actual != expected: + raise ValueError(f"{name}: expected {expected} elements, got {actual}") + + +def _make_bf16_zeros(meta, name: str, expected: int) -> np.ndarray: + _require_count(meta, name, expected) + return np.zeros(expected, dtype=meta.np_types[name]) + + +def _make_fp32_input(meta, name: str, generator, expected: int) -> np.ndarray: + _require_count(meta, name, expected) + values = generator.uniform(-0.5, 0.5, size=expected).astype(np.float32) + return values.astype(meta.np_types[name], copy=False) + + +def build_case(meta, generator, ints): + if meta.outputs != ["v1"]: + raise ValueError(f"unexpected outputs: {meta.outputs}") + if meta.read_order != ["v1", "v2", "v3"]: + raise ValueError(f"unexpected read order: {meta.read_order}") + if len(ints) < 2: + raise ValueError(f"expected block_idx/block_num int32 params, got {ints}") + + block_idx, block_num = ints[:2] + if block_num <= 0: + raise ValueError(f"invalid 
block_num={block_num}") + if block_idx < 0 or block_idx >= block_num: + raise ValueError(f"invalid block_idx={block_idx} for block_num={block_num}") + + output_elems = OUTPUT_ROWS * OUTPUT_COLS + input_elems = INPUT_ROWS * INPUT_COLS + buffers = { + "v1": _make_bf16_zeros(meta, "v1", output_elems), + "v2": _make_fp32_input(meta, "v2", generator, input_elems), + "v3": _make_fp32_input(meta, "v3", generator, input_elems), + } + + out = np.array(buffers["v1"], copy=True).reshape(OUTPUT_ROWS, OUTPUT_COLS) + rope_even = np.asarray(buffers["v2"], dtype=np.float32).reshape(INPUT_ROWS, INPUT_COLS) + rope_odd = np.asarray(buffers["v3"], dtype=np.float32).reshape(INPUT_ROWS, INPUT_COLS) + + group_idx = block_idx // BLOCK_GROUP + lane_idx = block_idx % BLOCK_GROUP + dt_base = group_idx * DT_PER_GROUP + out_row_base = lane_idx * OUTPUT_ROW_GROUP + src_row_lane_offset = lane_idx * HH_PER_TILE + + for dt in range(DT_PER_GROUP): + dt_idx = dt_base + dt + src_row = dt_idx * INPUT_COLS + src_row_lane_offset + tile = rope_even[src_row:src_row + HH_PER_TILE, :] + rope_odd[src_row:src_row + HH_PER_TILE, :] + tile_bf16 = float32_to_bf16(tile) + dst_row = out_row_base + dt_idx + for hh in range(HH_PER_TILE): + col0 = OUTPUT_COL_BASE + hh * OUTPUT_COL_STRIDE + out[dst_row, col0:col0 + INPUT_COLS] = tile_bf16[hh] + + return buffers, {"v1": out.reshape(-1)} + + +def run_case(case_name: str): + if case_name not in SUPPORTED_CASES: + raise KeyError(f"unsupported case: {case_name}") + meta = load_case_meta() + generator = rng() + ints = load_int32_assignments() + buffers, golden = build_case(meta, generator, ints) + write_buffers(meta, buffers) + write_golden(meta, golden) diff --git a/test/samples/DeepseekV4DecodeA5/sparse_attn_test_incore_7.pto b/test/samples/DeepseekV4DecodeA5/sparse_attn_test_incore_7.pto new file mode 100644 index 000000000..90cf94b28 --- /dev/null +++ b/test/samples/DeepseekV4DecodeA5/sparse_attn_test_incore_7.pto @@ -0,0 +1,53 @@ +module attributes {pto.target_arch = 
"a5"} { + func.func @sparse_attn_test_incore_7(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %__pypto_spmd_block_idx: i32, %__pypto_spmd_block_num: i32) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0_i64 = arith.constant 0 : i64 + %c2048_i64 = arith.constant 2048 : i64 + %c4096_i64 = arith.constant 4096 : i64 + %c1024_index = arith.constant 1024 : index + %c4096_index = arith.constant 4096 : index + %c1_index = arith.constant 1 : index + %c8192_index = arith.constant 8192 : index + %c64_index = arith.constant 64 : index + %c8_index = arith.constant 8 : index + %c32_index = arith.constant 32 : index + %c0_index = arith.constant 0 : index + %c128_index = arith.constant 128 : index + %c512_index = arith.constant 512 : index + %c448_index = arith.constant 448 : index + %o_packed_inline10752__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c1024_index, %c4096_index], strides = [%c4096_index, %c1_index] {layout = #pto.layout}: !pto.tensor_view + %rope_even_interleave_buf_inline10767__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout}: !pto.tensor_view + %rope_odd_interleave_buf_inline10773__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout}: !pto.tensor_view + %rope_pack_block_inline10687__ssa_v0 = arith.index_cast %__pypto_spmd_block_idx : i32 to index + %1 = arith.divsi %rope_pack_block_inline10687__ssa_v0, %c8_index : index + %2 = arith.muli %1, %c8_index : index + %3 = arith.subi %rope_pack_block_inline10687__ssa_v0, %2 : index + %4 = arith.muli %1, %c32_index : index + scf.for %rope_combine_dt_inline10698__idx_v0 = %c0_index to %c32_index step %c1_index { + %5 = arith.addi %4, %rope_combine_dt_inline10698__idx_v0 : index + %6 = arith.muli %5, %c64_index : index + %7 = arith.muli %3, %c8_index : index + %8 = arith.addi %6, %7 : index + %rope_even_tile_inline10807__tile = pto.alloc_tile 
addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + %rope_even_interleave_buf_inline10767__rv_v2_pview = pto.partition_view %rope_even_interleave_buf_inline10767__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> + pto.tload ins(%rope_even_interleave_buf_inline10767__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_even_tile_inline10807__tile : !pto.tile_buf) + %rope_odd_tile_inline10623__tile = pto.alloc_tile addr = %c2048_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + %rope_odd_interleave_buf_inline10773__rv_v2_pview = pto.partition_view %rope_odd_interleave_buf_inline10773__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> + pto.tload ins(%rope_odd_interleave_buf_inline10773__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_odd_tile_inline10623__tile : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + pto.tadd ins(%rope_even_tile_inline10807__tile, %rope_odd_tile_inline10623__tile : !pto.tile_buf, !pto.tile_buf) outs(%t__tile : !pto.tile_buf) + %rope_full_inline10765__tile = pto.alloc_tile addr = %c4096_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf + pto.tcvt ins(%t__tile {rmode = #pto} : !pto.tile_buf) outs(%rope_full_inline10765__tile : !pto.tile_buf) + %9 = arith.muli %3, %c128_index : index + %10 = arith.addi %9, %5 : index + scf.for %rope_pack_hh_inline10731__idx_v0 = %c0_index to %c8_index step %c1_index { + %11 = arith.muli %rope_pack_hh_inline10731__idx_v0, %c512_index : index + %12 = arith.addi %11, %c448_index : index + %0 = pto.alloc_tile addr = %c4096_i64 valid_row = %c1_index valid_col = %c64_index : !pto.tile_buf + %slice_view = pto.subview %rope_full_inline10765__tile[%rope_pack_hh_inline10731__idx_v0, %c0_index] sizes [1, 64] : 
!pto.tile_buf -> !pto.tile_buf + %o_packed_inline10752__iter_v12_pview = pto.partition_view %o_packed_inline10752__rv_v2_view, offsets = [%10, %12], sizes = [%c1_index, %c64_index] : !pto.tensor_view -> !pto.partition_tensor_view<1x64xbf16> + pto.tstore ins(%slice_view : !pto.tile_buf) outs(%o_packed_inline10752__iter_v12_pview : !pto.partition_tensor_view<1x64xbf16>) + } + } + return + } +} diff --git a/test/samples/DeepseekV4DecodeA5/sparse_attn_test_incore_7_golden.py b/test/samples/DeepseekV4DecodeA5/sparse_attn_test_incore_7_golden.py new file mode 100644 index 000000000..94772cb17 --- /dev/null +++ b/test/samples/DeepseekV4DecodeA5/sparse_attn_test_incore_7_golden.py @@ -0,0 +1,14 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +from deepseek_v4_decode_golden_lib import run_case + + +if __name__ == "__main__": + run_case("sparse_attn_test_incore_7") diff --git a/test/samples/runop.sh b/test/samples/runop.sh index 13a449071..bbf1e056b 100755 --- a/test/samples/runop.sh +++ b/test/samples/runop.sh @@ -20,7 +20,7 @@ PTOAS_OUT_DIR="${PTOAS_OUT_DIR:-}" PTO_BUILD_DIR="${PTO_BUILD_DIR:-}" PTOAS_ENABLE_INSERT_SYNC="${PTOAS_ENABLE_INSERT_SYNC:-1}" PTOAS_FLAGS="${PTOAS_FLAGS:-}" -PTO_PTO_DIRS="${PTO_PTO_DIRS:-Sync Qwen3DecodeA3 Qwen3DecodeA5 CommSync}" +PTO_PTO_DIRS="${PTO_PTO_DIRS:-Sync Qwen3DecodeA3 Qwen3DecodeA5 DeepseekV4DecodeA3 DeepseekV4DecodeA5 CommSync}" ENABLE_BC=0 usage() { @@ -38,7 +38,7 @@ Env: PTO_BUILD_DIR # build directory root that contains tools/ptoas and tools/ptobc (optional) PTOAS_FLAGS # extra flags passed to ptoas (e.g. --enable-insert-sync) PTOAS_ENABLE_INSERT_SYNC # 1 to append --enable-insert-sync to PTOAS_FLAGS (default: 1) - PTO_PTO_DIRS # space-separated dirs to run .pto directly (default: Sync Qwen3DecodeA3 Qwen3DecodeA5) + PTO_PTO_DIRS # space-separated dirs to run .pto directly (default: Sync Qwen3DecodeA3 Qwen3DecodeA5 DeepseekV4DecodeA3 DeepseekV4DecodeA5) Flags: --enablebc # enable: python -> .pto -> ptobc -> .pto -> ptoas @@ -167,7 +167,7 @@ process_one_dir() { if [[ "${ENABLE_BC}" == "1" ]]; then use_ptobc_roundtrip=1 fi - if [[ "$A" == "Qwen3DecodeA3" || "$A" == "Qwen3DecodeA5" ]]; then + if [[ "$A" == "Qwen3DecodeA3" || "$A" == "Qwen3DecodeA5" || "$A" == "DeepseekV4DecodeA3" || "$A" == "DeepseekV4DecodeA5" ]]; then use_ptobc_roundtrip=0 fi local -a ptoas_flags=() @@ -206,7 +206,7 @@ process_one_dir() { fi done fi - if [[ "$A" == "Qwen3DecodeA5" ]]; then + if [[ "$A" == "Qwen3DecodeA5" || "$A" == "DeepseekV4DecodeA5" ]]; then if [[ $has_pto_arch_override -eq 0 ]]; then ptoas_flags+=(--pto-arch a5) target_arch="a5" @@ -214,7 +214,7 @@ process_one_dir() { if [[ $has_pto_level_override -eq 0 ]]; then ptoas_flags+=(--pto-level=level3) fi - elif [[ "$A" == 
"Qwen3DecodeA3" ]]; then + elif [[ "$A" == "Qwen3DecodeA3" || "$A" == "DeepseekV4DecodeA3" ]]; then if [[ $has_pto_level_override -eq 0 ]]; then ptoas_flags+=(--pto-level=level3) fi @@ -251,47 +251,47 @@ process_one_dir() { fi local soc_lc="${SOC_VERSION:-}" soc_lc="$(printf '%s' "${soc_lc}" | tr '[:upper:]' '[:lower:]')" - if [[ "$A" == "Qwen3DecodeA3" && "${target_arch_lc}" != "a3" ]]; then - local qwen_case - for qwen_case in "$dir"/*.pto; do - [[ -f "$qwen_case" ]] || continue - case "$qwen_case" in + if [[ ( "$A" == "Qwen3DecodeA3" || "$A" == "DeepseekV4DecodeA3" ) && "${target_arch_lc}" != "a3" ]]; then + local direct_case + for direct_case in "$dir"/*.pto; do + [[ -f "$direct_case" ]] || continue + case "$direct_case" in *-pto-ir.pto) continue ;; esac - echo -e "${A}($(basename "$qwen_case"))\tSKIP\trequires --pto-arch=a3" + echo -e "${A}($(basename "$direct_case"))\tSKIP\trequires --pto-arch=a3" done return 0 fi - if [[ "$A" == "Qwen3DecodeA3" && -n "${soc_lc}" && ( "${soc_lc}" == *"a5"* || "${soc_lc}" == *"950"* ) ]]; then - local qwen_case - for qwen_case in "$dir"/*.pto; do - [[ -f "$qwen_case" ]] || continue - case "$qwen_case" in + if [[ ( "$A" == "Qwen3DecodeA3" || "$A" == "DeepseekV4DecodeA3" ) && -n "${soc_lc}" && ( "${soc_lc}" == *"a5"* || "${soc_lc}" == *"950"* ) ]]; then + local direct_case + for direct_case in "$dir"/*.pto; do + [[ -f "$direct_case" ]] || continue + case "$direct_case" in *-pto-ir.pto) continue ;; esac - echo -e "${A}($(basename "$qwen_case"))\tSKIP\trequires A3 target SOC" + echo -e "${A}($(basename "$direct_case"))\tSKIP\trequires A3 target SOC" done return 0 fi - if [[ "$A" == "Qwen3DecodeA5" && "$(printf '%s' "$target_arch" | tr '[:upper:]' '[:lower:]')" != "a5" ]]; then - local qwen_case - for qwen_case in "$dir"/*.pto; do - [[ -f "$qwen_case" ]] || continue - case "$qwen_case" in + if [[ ( "$A" == "Qwen3DecodeA5" || "$A" == "DeepseekV4DecodeA5" ) && "$(printf '%s' "$target_arch" | tr '[:upper:]' '[:lower:]')" != "a5" ]]; 
then + local direct_case + for direct_case in "$dir"/*.pto; do + [[ -f "$direct_case" ]] || continue + case "$direct_case" in *-pto-ir.pto) continue ;; esac - echo -e "${A}($(basename "$qwen_case"))\tSKIP\trequires --pto-arch=a5" + echo -e "${A}($(basename "$direct_case"))\tSKIP\trequires --pto-arch=a5" done return 0 fi - if [[ "$A" == "Qwen3DecodeA5" && -n "${soc_lc}" && "${soc_lc}" != *"a5"* && "${soc_lc}" != *"950"* ]]; then - local qwen_case - for qwen_case in "$dir"/*.pto; do - [[ -f "$qwen_case" ]] || continue - case "$qwen_case" in + if [[ ( "$A" == "Qwen3DecodeA5" || "$A" == "DeepseekV4DecodeA5" ) && -n "${soc_lc}" && "${soc_lc}" != *"a5"* && "${soc_lc}" != *"950"* ]]; then + local direct_case + for direct_case in "$dir"/*.pto; do + [[ -f "$direct_case" ]] || continue + case "$direct_case" in *-pto-ir.pto) continue ;; esac - echo -e "${A}($(basename "$qwen_case"))\tSKIP\trequires A5 target SOC" + echo -e "${A}($(basename "$direct_case"))\tSKIP\trequires A5 target SOC" done return 0 fi @@ -1307,7 +1307,7 @@ PY ptobc_file="${out_subdir}/${base}.ptobc" decoded_pto="${out_subdir}/${base}-roundtrip.pto" cpp="${out_subdir}/${base}.cpp" - if [[ "$A" == "Qwen3DecodeA3" || "$A" == "Qwen3DecodeA5" ]]; then + if [[ "$A" == "Qwen3DecodeA3" || "$A" == "Qwen3DecodeA5" || "$A" == "DeepseekV4DecodeA3" || "$A" == "DeepseekV4DecodeA5" ]]; then cpp="${out_subdir}/${base}-pto.cpp" fi local sample_use_ptobc_roundtrip="$use_ptobc_roundtrip"