diff --git a/test/npu_validation/scripts/generate_testcase.py b/test/npu_validation/scripts/generate_testcase.py
index bc7f208a1..2a30e5db7 100644
--- a/test/npu_validation/scripts/generate_testcase.py
+++ b/test/npu_validation/scripts/generate_testcase.py
@@ -86,7 +86,25 @@
     "xor",
 })
 
-CASE_INT_SCALAR_DEFAULTS = {}
+DEEPSEEK_V4_DIRECT_CASES = frozenset({
+    "attention_csa_test_refresh_incore_81",
+    "attention_hca_test_incore_54",
+    "attention_swa_test_incore_40",
+    "decode_csa_test_incore_81",
+    "decode_hca_test_incore_54",
+    "decode_swa_test_incore_40",
+    "sparse_attn_test_incore_7",
+})
+
+CASE_INT_SCALAR_DEFAULTS = {
+    testcase: {
+        "v4": 0,
+        "v5": 32,
+    }
+    for testcase in DEEPSEEK_V4_DIRECT_CASES
+}
+
+CASE_BOOL_SCALAR_DEFAULTS = {}
 
 CASE_POINTER_COUNT_MINIMUMS = {
     "down_proj_residual": {
@@ -97,6 +115,14 @@
         "v1": 123648,
         "v2": 123648,
     },
+    **{
+        testcase: {
+            "v1": 1024 * 4096,
+            "v2": 8192 * 64,
+            "v3": 8192 * 64,
+        }
+        for testcase in DEEPSEEK_V4_DIRECT_CASES
+    },
 }
 
 
@@ -833,6 +859,13 @@ def _integer_scalar_default_value(testcase: str, name: str, host_type: str) -> O
     return None
 
 
+def _bool_scalar_default_value(testcase: str, name: str) -> Optional[bool]:
+    override = CASE_BOOL_SCALAR_DEFAULTS.get(testcase, {}).get(name)
+    if override is None:
+        return None
+    return bool(override)
+
+
 def _derive_testcase_name(input_cpp: Path) -> str:
     name = input_cpp.stem
     if name.endswith("-pto"):
@@ -1704,9 +1737,11 @@ def generate_testcase(
             param_decls_lines.append(f"    {t} {p['name']}{{128, 128, 128, 128}};")
             continue
         if t == "bool":
-            value = "true"
+            bool_override = _bool_scalar_default_value(testcase, p["name"])
+            value = "true" if bool_override is None else ("true" if bool_override else "false")
         elif re.match(r"^(u?int)(8|16|32|64)_t$", t) or t in {"int", "unsigned", "size_t"}:
-            value = str(_integer_scalar_default_value(testcase, p["name"], t) or 1)
+            int_override = _integer_scalar_default_value(testcase, p["name"], t)
+            value = "1" if int_override is None else str(int_override)
         elif t in {"float"}:
             value = "1.0f"
         elif t in {"double"}:
diff --git a/test/samples/DeepseekV4DecodeA3/README.md b/test/samples/DeepseekV4DecodeA3/README.md
new file mode 100644
index 000000000..9fe0a95fa
--- /dev/null
+++ b/test/samples/DeepseekV4DecodeA3/README.md
@@ -0,0 +1,18 @@
+DeepSeek V4 decode PTO kernels for A3, generated from `hw-native-sys/pypto-lib` `models/deepseek/v4` at commit `be3c7942420b48fbab4ab1150edbc4ca8a125b94`.
+
+Scope:
+- compile-regression inputs for `ptoas`
+- board-validation inputs for direct `.pto` kernels
+
+Notes:
+- This directory vendors the primary raw `.pto` fragments emitted from these source modules:
+  - `decode_attention_csa.py`
+  - `decode_attention_hca.py`
+  - `decode_attention_swa.py`
+  - `decode_csa.py`
+  - `decode_hca.py`
+  - `decode_sparse_attn.py`
+  - `decode_swa.py`
+- The `.pto` file contents are copied directly from PyPTO raw PTO backend output and are not hand-edited.
+- `runop.sh` defaults these cases to `--pto-level=level3`.
+- Board validation uses custom `*_golden.py` references for the standalone rope-pack kernel, plus the full-buffer sizing and default block arguments wired in `generate_testcase.py`.
diff --git a/test/samples/DeepseekV4DecodeA3/attention_csa_test_refresh_incore_81.pto b/test/samples/DeepseekV4DecodeA3/attention_csa_test_refresh_incore_81.pto
new file mode 100644
index 000000000..316ff61b1
--- /dev/null
+++ b/test/samples/DeepseekV4DecodeA3/attention_csa_test_refresh_incore_81.pto
@@ -0,0 +1,53 @@
+module attributes {pto.target_arch = "a2a3"} {
+  func.func @attention_csa_test_refresh_incore_81(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %__pypto_spmd_block_idx: i32, %__pypto_spmd_block_num: i32) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+  %c0_i64 = arith.constant 0 : i64
+  %c2048_i64 = arith.constant 2048 : i64
+  %c4096_i64 = arith.constant 4096 : i64
+  %c1024_index = arith.constant 1024 : index
+  %c4096_index = arith.constant 4096 : index
+  %c1_index = arith.constant 1 : index
+  %c8192_index = arith.constant 8192 : index
+  %c64_index = arith.constant 64 : index
+  %c8_index = arith.constant 8 : index
+  %c32_index = arith.constant 32 : index
+  %c0_index = arith.constant 0 : index
+  %c128_index = arith.constant 128 : index
+  %c512_index = arith.constant 512 : index
+  %c448_index = arith.constant 448 : index
+  %o_packed_inline1734__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c1024_index, %c4096_index], strides = [%c4096_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %rope_even_interleave_buf_inline1700__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %rope_odd_interleave_buf_inline1743__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %rope_pack_block_inline1564__ssa_v0 = arith.index_cast %__pypto_spmd_block_idx : i32 to index
+  %1 = arith.divsi %rope_pack_block_inline1564__ssa_v0, %c8_index : index
+  %2 = arith.muli %1, %c8_index : index
+  %3 = arith.subi %rope_pack_block_inline1564__ssa_v0, %2 : index
+  %4 = arith.muli %1, %c32_index : index
+  scf.for %rope_combine_dt_inline1586__idx_v0 = %c0_index to %c32_index step %c1_index {
+    %5 = arith.addi %4, %rope_combine_dt_inline1586__idx_v0 : index
+    %6 = arith.muli %5, %c64_index : index
+    %7 = arith.muli %3, %c8_index : index
+    %8 = arith.addi %6, %7 : index
+    %rope_even_tile_inline1551__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %rope_even_interleave_buf_inline1700__rv_v2_pview = pto.partition_view %rope_even_interleave_buf_inline1700__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
+    pto.tload ins(%rope_even_interleave_buf_inline1700__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_even_tile_inline1551__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %rope_odd_tile_inline1580__tile = pto.alloc_tile addr = %c2048_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %rope_odd_interleave_buf_inline1743__rv_v2_pview = pto.partition_view %rope_odd_interleave_buf_inline1743__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
+    pto.tload ins(%rope_odd_interleave_buf_inline1743__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_odd_tile_inline1580__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %t__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tadd ins(%rope_even_tile_inline1551__tile, %rope_odd_tile_inline1580__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %rope_full_inline1610__tile = pto.alloc_tile addr = %c4096_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcvt ins(%t__tile {rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%rope_full_inline1610__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %9 = arith.muli %3, %c128_index : index
+    %10 = arith.addi %9, %5 : index
+    scf.for %rope_pack_hh_inline1664__idx_v0 = %c0_index to %c8_index step %c1_index {
+      %11 = arith.muli %rope_pack_hh_inline1664__idx_v0, %c512_index : index
+      %12 = arith.addi %11, %c448_index : index
+      %0 = pto.alloc_tile addr = %c4096_i64 valid_row = %c1_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      %slice_view = pto.subview %rope_full_inline1610__tile[%rope_pack_hh_inline1664__idx_v0, %c0_index] sizes [1, 64] : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0> -> !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      %o_packed_inline1734__iter_v12_pview = pto.partition_view %o_packed_inline1734__rv_v2_view, offsets = [%10, %12], sizes = [%c1_index, %c64_index] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x64xbf16>
+      pto.tstore ins(%slice_view : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%o_packed_inline1734__iter_v12_pview : !pto.partition_tensor_view<1x64xbf16>)
+    }
+  }
+  return
+  }
+}
diff --git a/test/samples/DeepseekV4DecodeA3/attention_csa_test_refresh_incore_81_golden.py b/test/samples/DeepseekV4DecodeA3/attention_csa_test_refresh_incore_81_golden.py
new file mode 100644
index 000000000..be04f4648
--- /dev/null
+++ b/test/samples/DeepseekV4DecodeA3/attention_csa_test_refresh_incore_81_golden.py
@@ -0,0 +1,14 @@
+#!/usr/bin/python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+from deepseek_v4_decode_golden_lib import run_case
+
+
+if __name__ == "__main__":
+    run_case("attention_csa_test_refresh_incore_81")
diff --git a/test/samples/DeepseekV4DecodeA3/attention_hca_test_incore_54.pto b/test/samples/DeepseekV4DecodeA3/attention_hca_test_incore_54.pto
new file mode 100644
index 000000000..532544884
--- /dev/null
+++ b/test/samples/DeepseekV4DecodeA3/attention_hca_test_incore_54.pto
@@ -0,0 +1,53 @@
+module attributes {pto.target_arch = "a2a3"} {
+  func.func @attention_hca_test_incore_54(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %__pypto_spmd_block_idx: i32, %__pypto_spmd_block_num: i32) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+  %c0_i64 = arith.constant 0 : i64
+  %c2048_i64 = arith.constant 2048 : i64
+  %c4096_i64 = arith.constant 4096 : i64
+  %c1024_index = arith.constant 1024 : index
+  %c4096_index = arith.constant 4096 : index
+  %c1_index = arith.constant 1 : index
+  %c8192_index = arith.constant 8192 : index
+  %c64_index = arith.constant 64 : index
+  %c8_index = arith.constant 8 : index
+  %c32_index = arith.constant 32 : index
+  %c0_index = arith.constant 0 : index
+  %c128_index = arith.constant 128 : index
+  %c512_index = arith.constant 512 : index
+  %c448_index = arith.constant 448 : index
+  %o_packed_inline2863__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c1024_index, %c4096_index], strides = [%c4096_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %rope_even_interleave_buf_inline2905__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %rope_odd_interleave_buf_inline2903__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %rope_pack_block_inline2884__ssa_v0 = arith.index_cast %__pypto_spmd_block_idx : i32 to index
+  %1 = arith.divsi %rope_pack_block_inline2884__ssa_v0, %c8_index : index
+  %2 = arith.muli %1, %c8_index : index
+  %3 = arith.subi %rope_pack_block_inline2884__ssa_v0, %2 : index
+  %4 = arith.muli %1, %c32_index : index
+  scf.for %rope_combine_dt_inline2763__idx_v0 = %c0_index to %c32_index step %c1_index {
+    %5 = arith.addi %4, %rope_combine_dt_inline2763__idx_v0 : index
+    %6 = arith.muli %5, %c64_index : index
+    %7 = arith.muli %3, %c8_index : index
+    %8 = arith.addi %6, %7 : index
+    %rope_even_tile_inline2886__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %rope_even_interleave_buf_inline2905__rv_v2_pview = pto.partition_view %rope_even_interleave_buf_inline2905__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
+    pto.tload ins(%rope_even_interleave_buf_inline2905__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_even_tile_inline2886__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %rope_odd_tile_inline2893__tile = pto.alloc_tile addr = %c2048_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %rope_odd_interleave_buf_inline2903__rv_v2_pview = pto.partition_view %rope_odd_interleave_buf_inline2903__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
+    pto.tload ins(%rope_odd_interleave_buf_inline2903__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_odd_tile_inline2893__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %t__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tadd ins(%rope_even_tile_inline2886__tile, %rope_odd_tile_inline2893__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %rope_full_inline2779__tile = pto.alloc_tile addr = %c4096_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcvt ins(%t__tile {rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%rope_full_inline2779__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %9 = arith.muli %3, %c128_index : index
+    %10 = arith.addi %9, %5 : index
+    scf.for %rope_pack_hh_inline2823__idx_v0 = %c0_index to %c8_index step %c1_index {
+      %11 = arith.muli %rope_pack_hh_inline2823__idx_v0, %c512_index : index
+      %12 = arith.addi %11, %c448_index : index
+      %0 = pto.alloc_tile addr = %c4096_i64 valid_row = %c1_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      %slice_view = pto.subview %rope_full_inline2779__tile[%rope_pack_hh_inline2823__idx_v0, %c0_index] sizes [1, 64] : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0> -> !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      %o_packed_inline2863__iter_v12_pview = pto.partition_view %o_packed_inline2863__rv_v2_view, offsets = [%10, %12], sizes = [%c1_index, %c64_index] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x64xbf16>
+      pto.tstore ins(%slice_view : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%o_packed_inline2863__iter_v12_pview : !pto.partition_tensor_view<1x64xbf16>)
+    }
+  }
+  return
+  }
+}
diff --git a/test/samples/DeepseekV4DecodeA3/attention_hca_test_incore_54_golden.py b/test/samples/DeepseekV4DecodeA3/attention_hca_test_incore_54_golden.py
new file mode 100644
index 000000000..c06cd7cad
--- /dev/null
+++ b/test/samples/DeepseekV4DecodeA3/attention_hca_test_incore_54_golden.py
@@ -0,0 +1,14 @@
+#!/usr/bin/python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+from deepseek_v4_decode_golden_lib import run_case
+
+
+if __name__ == "__main__":
+    run_case("attention_hca_test_incore_54")
diff --git a/test/samples/DeepseekV4DecodeA3/attention_swa_test_incore_40.pto b/test/samples/DeepseekV4DecodeA3/attention_swa_test_incore_40.pto
new file mode 100644
index 000000000..83ab158bc
--- /dev/null
+++ b/test/samples/DeepseekV4DecodeA3/attention_swa_test_incore_40.pto
@@ -0,0 +1,53 @@
+module attributes {pto.target_arch = "a2a3"} {
+  func.func @attention_swa_test_incore_40(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %__pypto_spmd_block_idx: i32, %__pypto_spmd_block_num: i32) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+  %c0_i64 = arith.constant 0 : i64
+  %c2048_i64 = arith.constant 2048 : i64
+  %c4096_i64 = arith.constant 4096 : i64
+  %c1024_index = arith.constant 1024 : index
+  %c4096_index = arith.constant 4096 : index
+  %c1_index = arith.constant 1 : index
+  %c8192_index = arith.constant 8192 : index
+  %c64_index = arith.constant 64 : index
+  %c8_index = arith.constant 8 : index
+  %c32_index = arith.constant 32 : index
+  %c0_index = arith.constant 0 : index
+  %c128_index = arith.constant 128 : index
+  %c512_index = arith.constant 512 : index
+  %c448_index = arith.constant 448 : index
+  %o_packed_inline3949__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c1024_index, %c4096_index], strides = [%c4096_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %rope_even_interleave_buf_inline3805__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %rope_odd_interleave_buf_inline3909__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %rope_pack_block_inline3775__ssa_v0 = arith.index_cast %__pypto_spmd_block_idx : i32 to index
+  %1 = arith.divsi %rope_pack_block_inline3775__ssa_v0, %c8_index : index
+  %2 = arith.muli %1, %c8_index : index
+  %3 = arith.subi %rope_pack_block_inline3775__ssa_v0, %2 : index
+  %4 = arith.muli %1, %c32_index : index
+  scf.for %rope_combine_dt_inline3815__idx_v0 = %c0_index to %c32_index step %c1_index {
+    %5 = arith.addi %4, %rope_combine_dt_inline3815__idx_v0 : index
+    %6 = arith.muli %5, %c64_index : index
+    %7 = arith.muli %3, %c8_index : index
+    %8 = arith.addi %6, %7 : index
+    %rope_even_tile_inline3768__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %rope_even_interleave_buf_inline3805__rv_v2_pview = pto.partition_view %rope_even_interleave_buf_inline3805__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
+    pto.tload ins(%rope_even_interleave_buf_inline3805__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_even_tile_inline3768__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %rope_odd_tile_inline3938__tile = pto.alloc_tile addr = %c2048_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %rope_odd_interleave_buf_inline3909__rv_v2_pview = pto.partition_view %rope_odd_interleave_buf_inline3909__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
+    pto.tload ins(%rope_odd_interleave_buf_inline3909__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_odd_tile_inline3938__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %t__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tadd ins(%rope_even_tile_inline3768__tile, %rope_odd_tile_inline3938__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %rope_full_inline3923__tile = pto.alloc_tile addr = %c4096_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcvt ins(%t__tile {rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%rope_full_inline3923__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %9 = arith.muli %3, %c128_index : index
+    %10 = arith.addi %9, %5 : index
+    scf.for %rope_pack_hh_inline3881__idx_v0 = %c0_index to %c8_index step %c1_index {
+      %11 = arith.muli %rope_pack_hh_inline3881__idx_v0, %c512_index : index
+      %12 = arith.addi %11, %c448_index : index
+      %0 = pto.alloc_tile addr = %c4096_i64 valid_row = %c1_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      %slice_view = pto.subview %rope_full_inline3923__tile[%rope_pack_hh_inline3881__idx_v0, %c0_index] sizes [1, 64] : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0> -> !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      %o_packed_inline3949__iter_v12_pview = pto.partition_view %o_packed_inline3949__rv_v2_view, offsets = [%10, %12], sizes = [%c1_index, %c64_index] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x64xbf16>
+      pto.tstore ins(%slice_view : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%o_packed_inline3949__iter_v12_pview : !pto.partition_tensor_view<1x64xbf16>)
+    }
+  }
+  return
+  }
+}
diff --git a/test/samples/DeepseekV4DecodeA3/attention_swa_test_incore_40_golden.py b/test/samples/DeepseekV4DecodeA3/attention_swa_test_incore_40_golden.py
new file mode 100644
index 000000000..d952e67d4
--- /dev/null
+++ b/test/samples/DeepseekV4DecodeA3/attention_swa_test_incore_40_golden.py
@@ -0,0 +1,14 @@
+#!/usr/bin/python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+from deepseek_v4_decode_golden_lib import run_case
+
+
+if __name__ == "__main__":
+    run_case("attention_swa_test_incore_40")
diff --git a/test/samples/DeepseekV4DecodeA3/decode_csa_test_incore_81.pto b/test/samples/DeepseekV4DecodeA3/decode_csa_test_incore_81.pto
new file mode 100644
index 000000000..4a4814276
--- /dev/null
+++ b/test/samples/DeepseekV4DecodeA3/decode_csa_test_incore_81.pto
@@ -0,0 +1,53 @@
+module attributes {pto.target_arch = "a2a3"} {
+  func.func @decode_csa_test_incore_81(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %__pypto_spmd_block_idx: i32, %__pypto_spmd_block_num: i32) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+  %c0_i64 = arith.constant 0 : i64
+  %c2048_i64 = arith.constant 2048 : i64
+  %c4096_i64 = arith.constant 4096 : i64
+  %c1024_index = arith.constant 1024 : index
+  %c4096_index = arith.constant 4096 : index
+  %c1_index = arith.constant 1 : index
+  %c8192_index = arith.constant 8192 : index
+  %c64_index = arith.constant 64 : index
+  %c8_index = arith.constant 8 : index
+  %c32_index = arith.constant 32 : index
+  %c0_index = arith.constant 0 : index
+  %c128_index = arith.constant 128 : index
+  %c512_index = arith.constant 512 : index
+  %c448_index = arith.constant 448 : index
+  %o_packed_inline5034_inline5466__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c1024_index, %c4096_index], strides = [%c4096_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %rope_even_interleave_buf_inline5005_inline5933__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %rope_odd_interleave_buf_inline5144_inline5315__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %rope_pack_block_inline4996_inline5240__ssa_v0 = arith.index_cast %__pypto_spmd_block_idx : i32 to index
+  %1 = arith.divsi %rope_pack_block_inline4996_inline5240__ssa_v0, %c8_index : index
+  %2 = arith.muli %1, %c8_index : index
+  %3 = arith.subi %rope_pack_block_inline4996_inline5240__ssa_v0, %2 : index
+  %4 = arith.muli %1, %c32_index : index
+  scf.for %rope_combine_dt_inline5139_inline5238__idx_v0 = %c0_index to %c32_index step %c1_index {
+    %5 = arith.addi %4, %rope_combine_dt_inline5139_inline5238__idx_v0 : index
+    %6 = arith.muli %5, %c64_index : index
+    %7 = arith.muli %3, %c8_index : index
+    %8 = arith.addi %6, %7 : index
+    %rope_even_tile_inline5063_inline5236__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %rope_even_interleave_buf_inline5005_inline5933__rv_v2_pview = pto.partition_view %rope_even_interleave_buf_inline5005_inline5933__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
+    pto.tload ins(%rope_even_interleave_buf_inline5005_inline5933__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_even_tile_inline5063_inline5236__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %rope_odd_tile_inline4993_inline5563__tile = pto.alloc_tile addr = %c2048_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %rope_odd_interleave_buf_inline5144_inline5315__rv_v2_pview = pto.partition_view %rope_odd_interleave_buf_inline5144_inline5315__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
+    pto.tload ins(%rope_odd_interleave_buf_inline5144_inline5315__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_odd_tile_inline4993_inline5563__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %t__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tadd ins(%rope_even_tile_inline5063_inline5236__tile, %rope_odd_tile_inline4993_inline5563__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %rope_full_inline5180_inline5657__tile = pto.alloc_tile addr = %c4096_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcvt ins(%t__tile {rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%rope_full_inline5180_inline5657__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %9 = arith.muli %3, %c128_index : index
+    %10 = arith.addi %9, %5 : index
+    scf.for %rope_pack_hh_inline5002_inline5496__idx_v0 = %c0_index to %c8_index step %c1_index {
+      %11 = arith.muli %rope_pack_hh_inline5002_inline5496__idx_v0, %c512_index : index
+      %12 = arith.addi %11, %c448_index : index
+      %0 = pto.alloc_tile addr = %c4096_i64 valid_row = %c1_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      %slice_view = pto.subview %rope_full_inline5180_inline5657__tile[%rope_pack_hh_inline5002_inline5496__idx_v0, %c0_index] sizes [1, 64] : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0> -> !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      %o_packed_inline5034_inline5466__iter_v12_pview = pto.partition_view %o_packed_inline5034_inline5466__rv_v2_view, offsets = [%10, %12], sizes = [%c1_index, %c64_index] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x64xbf16>
+      pto.tstore ins(%slice_view : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%o_packed_inline5034_inline5466__iter_v12_pview : !pto.partition_tensor_view<1x64xbf16>)
+    }
+  }
+  return
+  }
+}
diff --git a/test/samples/DeepseekV4DecodeA3/decode_csa_test_incore_81_golden.py b/test/samples/DeepseekV4DecodeA3/decode_csa_test_incore_81_golden.py
new file mode 100644
index 000000000..dd19fb8e8
--- /dev/null
+++ b/test/samples/DeepseekV4DecodeA3/decode_csa_test_incore_81_golden.py
@@ -0,0 +1,14 @@
+#!/usr/bin/python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+from deepseek_v4_decode_golden_lib import run_case
+
+
+if __name__ == "__main__":
+    run_case("decode_csa_test_incore_81")
diff --git a/test/samples/DeepseekV4DecodeA3/decode_hca_test_incore_54.pto b/test/samples/DeepseekV4DecodeA3/decode_hca_test_incore_54.pto
new file mode 100644
index 000000000..97d2cc6e0
--- /dev/null
+++ b/test/samples/DeepseekV4DecodeA3/decode_hca_test_incore_54.pto
@@ -0,0 +1,53 @@
+module attributes {pto.target_arch = "a2a3"} {
+  func.func @decode_hca_test_incore_54(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %__pypto_spmd_block_idx: i32, %__pypto_spmd_block_num: i32) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+  %c0_i64 = arith.constant 0 : i64
+  %c2048_i64 = arith.constant 2048 : i64
+  %c4096_i64 = arith.constant 4096 : i64
+  %c1024_index = arith.constant 1024 : index
+  %c4096_index = arith.constant 4096 : index
+  %c1_index = arith.constant 1 : index
+  %c8192_index = arith.constant 8192 : index
+  %c64_index = arith.constant 64 : index
+  %c8_index = arith.constant 8 : index
+  %c32_index = arith.constant 32 : index
+  %c0_index = arith.constant 0 : index
+  %c128_index = arith.constant 128 : index
+  %c512_index = arith.constant 512 : index
+  %c448_index = arith.constant 448 : index
+  %o_packed_inline8608_inline9732__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c1024_index, %c4096_index], strides = [%c4096_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %rope_even_interleave_buf_inline8487_inline9733__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %rope_odd_interleave_buf_inline8509_inline9849__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %rope_pack_block_inline8480_inline9901__ssa_v0 = arith.index_cast %__pypto_spmd_block_idx : i32 to index
+  %1 = arith.divsi %rope_pack_block_inline8480_inline9901__ssa_v0, %c8_index : index
+  %2 = arith.muli %1, %c8_index : index
+  %3 = arith.subi %rope_pack_block_inline8480_inline9901__ssa_v0, %2 : index
+  %4 = arith.muli %1, %c32_index : index
+  scf.for %rope_combine_dt_inline8470_inline9643__idx_v0 = %c0_index to %c32_index step %c1_index {
+    %5 = arith.addi %4, %rope_combine_dt_inline8470_inline9643__idx_v0 : index
+    %6 = arith.muli %5, %c64_index : index
+    %7 = arith.muli %3, %c8_index : index
+    %8 = arith.addi %6, %7 : index
+    %rope_even_tile_inline8559_inline9640__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %rope_even_interleave_buf_inline8487_inline9733__rv_v2_pview = pto.partition_view %rope_even_interleave_buf_inline8487_inline9733__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
+    pto.tload ins(%rope_even_interleave_buf_inline8487_inline9733__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_even_tile_inline8559_inline9640__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %rope_odd_tile_inline8611_inline9639__tile = pto.alloc_tile addr = %c2048_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %rope_odd_interleave_buf_inline8509_inline9849__rv_v2_pview = pto.partition_view %rope_odd_interleave_buf_inline8509_inline9849__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
+    pto.tload ins(%rope_odd_interleave_buf_inline8509_inline9849__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_odd_tile_inline8611_inline9639__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %t__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tadd ins(%rope_even_tile_inline8559_inline9640__tile, %rope_odd_tile_inline8611_inline9639__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %rope_full_inline8468_inline9638__tile = pto.alloc_tile addr = %c4096_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcvt ins(%t__tile {rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%rope_full_inline8468_inline9638__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %9 = arith.muli %3, %c128_index : index
+    %10 = arith.addi %9, %5 : index
+    scf.for %rope_pack_hh_inline8516_inline9636__idx_v0 = %c0_index to %c8_index step %c1_index {
+      %11 = arith.muli %rope_pack_hh_inline8516_inline9636__idx_v0, %c512_index : index
+      %12 = arith.addi %11, %c448_index : index
+      %0 = pto.alloc_tile addr = %c4096_i64 valid_row = %c1_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      %slice_view = pto.subview %rope_full_inline8468_inline9638__tile[%rope_pack_hh_inline8516_inline9636__idx_v0, %c0_index] sizes [1, 64] : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0> -> !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      %o_packed_inline8608_inline9732__iter_v12_pview = pto.partition_view %o_packed_inline8608_inline9732__rv_v2_view, offsets = [%10, %12], sizes = [%c1_index, %c64_index] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x64xbf16>
+      pto.tstore ins(%slice_view : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%o_packed_inline8608_inline9732__iter_v12_pview : !pto.partition_tensor_view<1x64xbf16>)
+    }
+  }
+  return
+  }
+}
diff --git a/test/samples/DeepseekV4DecodeA3/decode_hca_test_incore_54_golden.py b/test/samples/DeepseekV4DecodeA3/decode_hca_test_incore_54_golden.py
new file mode 100644
index 000000000..586712473
--- /dev/null
+++ b/test/samples/DeepseekV4DecodeA3/decode_hca_test_incore_54_golden.py
@@ -0,0 +1,14 @@
+#!/usr/bin/python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+from deepseek_v4_decode_golden_lib import run_case
+
+
+if __name__ == "__main__":
+    run_case("decode_hca_test_incore_54")
diff --git a/test/samples/DeepseekV4DecodeA3/decode_swa_test_incore_40.pto b/test/samples/DeepseekV4DecodeA3/decode_swa_test_incore_40.pto
new file mode 100644
index 000000000..cd35a5270
--- /dev/null
+++ b/test/samples/DeepseekV4DecodeA3/decode_swa_test_incore_40.pto
@@ -0,0 +1,53 @@
+module attributes {pto.target_arch = "a2a3"} {
+  func.func @decode_swa_test_incore_40(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %__pypto_spmd_block_idx: i32, %__pypto_spmd_block_num: i32) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+  %c0_i64 = arith.constant 0 : i64
+  %c2048_i64 = arith.constant 2048 : i64
+  %c4096_i64 = arith.constant 4096 : i64
+  %c1024_index = arith.constant 1024 : index
+  %c4096_index = arith.constant 4096 : index
+  %c1_index = arith.constant 1 : index
+  %c8192_index = arith.constant 8192 : index
+  %c64_index = arith.constant 64 : index
+  %c8_index = arith.constant 8 : index
+  %c32_index = arith.constant 32 : index
+  %c0_index = arith.constant 0 : index
+  %c128_index = arith.constant 128 : index
+  %c512_index = arith.constant 512 : index
+  %c448_index = arith.constant 448 : index
+  %o_packed_inline11580_inline11963__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c1024_index, %c4096_index], strides = [%c4096_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %rope_even_interleave_buf_inline11549_inline12171__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %rope_odd_interleave_buf_inline11657_inline11858__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %rope_pack_block_inline11502_inline11743__ssa_v0 = arith.index_cast %__pypto_spmd_block_idx : i32 to index
+  %1 = arith.divsi %rope_pack_block_inline11502_inline11743__ssa_v0, %c8_index : index
+  %2 = arith.muli %1, %c8_index : index
+  %3 = arith.subi %rope_pack_block_inline11502_inline11743__ssa_v0, %2 : index
+  %4 = arith.muli %1, %c32_index : index
+  scf.for %rope_combine_dt_inline11596_inline12136__idx_v0 = %c0_index to %c32_index step %c1_index {
+    %5 = arith.addi %4, %rope_combine_dt_inline11596_inline12136__idx_v0 : index
+    %6 = arith.muli %5, %c64_index : index
+    %7 = arith.muli %3, %c8_index : index
+    %8 = arith.addi %6, %7 : index
+    %rope_even_tile_inline11616_inline11737__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %rope_even_interleave_buf_inline11549_inline12171__rv_v2_pview = pto.partition_view %rope_even_interleave_buf_inline11549_inline12171__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
+    pto.tload ins(%rope_even_interleave_buf_inline11549_inline12171__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_even_tile_inline11616_inline11737__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %rope_odd_tile_inline11499_inline11736__tile = pto.alloc_tile addr = %c2048_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %rope_odd_interleave_buf_inline11657_inline11858__rv_v2_pview = pto.partition_view %rope_odd_interleave_buf_inline11657_inline11858__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
+    pto.tload ins(%rope_odd_interleave_buf_inline11657_inline11858__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_odd_tile_inline11499_inline11736__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %t__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tadd ins(%rope_even_tile_inline11616_inline11737__tile, %rope_odd_tile_inline11499_inline11736__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %rope_full_inline11567_inline11735__tile = pto.alloc_tile addr = %c4096_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcvt ins(%t__tile {rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%rope_full_inline11567_inline11735__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %9 = arith.muli %3, %c128_index : index
+    %10 = arith.addi %9, %5 : index
+    scf.for %rope_pack_hh_inline11511_inline11733__idx_v0 = %c0_index to %c8_index step %c1_index {
+      %11 = arith.muli %rope_pack_hh_inline11511_inline11733__idx_v0, %c512_index : index
+      %12 = arith.addi %11, %c448_index : index
+      %0 = pto.alloc_tile addr = %c4096_i64 valid_row = %c1_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      %slice_view = pto.subview %rope_full_inline11567_inline11735__tile[%rope_pack_hh_inline11511_inline11733__idx_v0, %c0_index] sizes [1, 64] : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0> -> !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      %o_packed_inline11580_inline11963__iter_v12_pview = pto.partition_view %o_packed_inline11580_inline11963__rv_v2_view, offsets = [%10, %12], sizes = [%c1_index, %c64_index] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x64xbf16>
+      pto.tstore ins(%slice_view : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%o_packed_inline11580_inline11963__iter_v12_pview : !pto.partition_tensor_view<1x64xbf16>)
+    }
+  }
+  return
+  }
+}
diff --git a/test/samples/DeepseekV4DecodeA3/decode_swa_test_incore_40_golden.py b/test/samples/DeepseekV4DecodeA3/decode_swa_test_incore_40_golden.py
new file mode 100644
index 000000000..9314c315d
--- /dev/null
+++ b/test/samples/DeepseekV4DecodeA3/decode_swa_test_incore_40_golden.py
@@ -0,0 +1,14 @@
+#!/usr/bin/python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+from deepseek_v4_decode_golden_lib import run_case
+
+
+if __name__ == "__main__":
+    run_case("decode_swa_test_incore_40")
diff --git a/test/samples/DeepseekV4DecodeA3/deepseek_v4_decode_golden_lib.py b/test/samples/DeepseekV4DecodeA3/deepseek_v4_decode_golden_lib.py
new file mode 100644
index 000000000..008d217c9
--- /dev/null
+++ b/test/samples/DeepseekV4DecodeA3/deepseek_v4_decode_golden_lib.py
@@ -0,0 +1,113 @@
+#!/usr/bin/python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+import numpy as np
+
+from validation_runtime import (
+    float32_to_bf16,
+    load_case_meta,
+    load_int32_assignments,
+    rng,
+    write_buffers,
+    write_golden,
+)
+
+SUPPORTED_CASES = frozenset({
+    "attention_csa_test_refresh_incore_81",
+    "attention_hca_test_incore_54",
+    "attention_swa_test_incore_40",
+    "decode_csa_test_incore_81",
+    "decode_hca_test_incore_54",
+    "decode_swa_test_incore_40",
+    "sparse_attn_test_incore_7",
+})
+
+OUTPUT_ROWS = 1024
+OUTPUT_COLS = 4096
+INPUT_ROWS = 8192
+INPUT_COLS = 64
+BLOCK_GROUP = 8
+DT_PER_GROUP = 32
+HH_PER_TILE = 8
+OUTPUT_ROW_GROUP = 128
+OUTPUT_COL_STRIDE = 512
+OUTPUT_COL_BASE = 448
+
+
+def _require_count(meta, name: str, expected: int) -> None:
+    actual = int(meta.elem_counts[name])
+    if actual != expected:
+        raise ValueError(f"{name}: expected {expected} elements, got {actual}")
+
+
+def _make_bf16_zeros(meta, name: str, expected: int) -> np.ndarray:
+    _require_count(meta, name, expected)
+    return np.zeros(expected, dtype=meta.np_types[name])
+
+
+def _make_fp32_input(meta, name: str, generator, expected: int) -> np.ndarray:
+    _require_count(meta, name, expected)
+    values = generator.uniform(-0.5, 0.5, size=expected).astype(np.float32)
+    return values.astype(meta.np_types[name], copy=False)
+
+
+def build_case(meta, generator, ints):
+    if meta.outputs != ["v1"]:
+        raise ValueError(f"unexpected outputs: {meta.outputs}")
+    if meta.read_order != ["v1", "v2", "v3"]:
+        raise ValueError(f"unexpected read order: {meta.read_order}")
+    if len(ints) < 2:
+        raise ValueError(f"expected block_idx/block_num int32 params, got {ints}")
+
+    block_idx, block_num = ints[:2]
+    if block_num <= 0:
+        raise ValueError(f"invalid block_num={block_num}")
+    if block_idx < 0 or block_idx >= block_num:
+        raise ValueError(f"invalid block_idx={block_idx} for block_num={block_num}")
+
+    output_elems = OUTPUT_ROWS * OUTPUT_COLS
+    input_elems = INPUT_ROWS * INPUT_COLS
+    buffers = {
+        "v1": _make_bf16_zeros(meta, "v1", output_elems),
+        "v2": _make_fp32_input(meta, "v2", generator, input_elems),
+        "v3": _make_fp32_input(meta, "v3", generator, input_elems),
+    }
+
+    out = np.array(buffers["v1"], copy=True).reshape(OUTPUT_ROWS, OUTPUT_COLS)
+    rope_even = np.asarray(buffers["v2"], dtype=np.float32).reshape(INPUT_ROWS, INPUT_COLS)
+    rope_odd = np.asarray(buffers["v3"], dtype=np.float32).reshape(INPUT_ROWS, INPUT_COLS)
+
+    group_idx = block_idx // BLOCK_GROUP
+    lane_idx = block_idx % BLOCK_GROUP
+    dt_base = group_idx * DT_PER_GROUP
+    out_row_base = lane_idx * OUTPUT_ROW_GROUP
+    src_row_lane_offset = lane_idx * HH_PER_TILE
+
+    for dt in range(DT_PER_GROUP):
+        dt_idx = dt_base + dt
+        src_row = dt_idx * INPUT_COLS + src_row_lane_offset
+        tile = rope_even[src_row:src_row + HH_PER_TILE, :] + rope_odd[src_row:src_row + HH_PER_TILE, :]
+        tile_bf16 = float32_to_bf16(tile)
+        dst_row = out_row_base + dt_idx
+        for hh in range(HH_PER_TILE):
+            col0 = OUTPUT_COL_BASE + hh * OUTPUT_COL_STRIDE
+            out[dst_row, col0:col0 + INPUT_COLS] = tile_bf16[hh]
+
+    return buffers, {"v1": out.reshape(-1)}
+
+
+def run_case(case_name: str):
+    if case_name not in SUPPORTED_CASES:
+        raise KeyError(f"unsupported case: {case_name}")
+    meta = load_case_meta()
+    generator = rng()
+    ints = load_int32_assignments()
+    buffers, golden = build_case(meta, generator, ints)
+    write_buffers(meta, buffers)
+    write_golden(meta, golden)
diff --git a/test/samples/DeepseekV4DecodeA3/sparse_attn_test_incore_7.pto b/test/samples/DeepseekV4DecodeA3/sparse_attn_test_incore_7.pto
new file mode 100644
index 000000000..37788d06a
--- /dev/null
+++ b/test/samples/DeepseekV4DecodeA3/sparse_attn_test_incore_7.pto
@@ -0,0 +1,53 @@
+module attributes {pto.target_arch = "a2a3"} {
+  func.func @sparse_attn_test_incore_7(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %__pypto_spmd_block_idx: i32, %__pypto_spmd_block_num: i32) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+  %c0_i64 = arith.constant 0 : i64
+  %c2048_i64 = arith.constant 2048 : i64
+  %c4096_i64 = arith.constant 4096 : i64
+  %c1024_index = arith.constant 1024 : index
+  %c4096_index = arith.constant 4096 : index
+  %c1_index = arith.constant 1 : index
+  %c8192_index = arith.constant 8192 : index
+  %c64_index = arith.constant 64 : index
+  %c8_index = arith.constant 8 : index
+  %c32_index = arith.constant 32 : index
+  %c0_index = arith.constant 0 : index
+  %c128_index = arith.constant 128 : index
+  %c512_index = arith.constant 512 : index
+  %c448_index = arith.constant 448 : index
+  %o_packed_inline10659__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c1024_index, %c4096_index], strides = [%c4096_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
+  %rope_even_interleave_buf_inline10617__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %rope_odd_interleave_buf_inline10794__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %rope_pack_block_inline10638__ssa_v0 = arith.index_cast %__pypto_spmd_block_idx : i32 to index
+  %1 = arith.divsi %rope_pack_block_inline10638__ssa_v0, %c8_index : index
+  %2 = arith.muli %1, %c8_index : index
+  %3 = arith.subi %rope_pack_block_inline10638__ssa_v0, %2 : index
+  %4 = arith.muli %1, %c32_index : index
+  scf.for %rope_combine_dt_inline10624__idx_v0 = %c0_index to %c32_index step %c1_index {
+    %5 = arith.addi %4, %rope_combine_dt_inline10624__idx_v0 : index
+    %6 = arith.muli %5, %c64_index : index
+    %7 = arith.muli %3, %c8_index : index
+    %8 = arith.addi %6, %7 : index
+    %rope_even_tile_inline10786__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %rope_even_interleave_buf_inline10617__rv_v2_pview = pto.partition_view %rope_even_interleave_buf_inline10617__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
+    pto.tload ins(%rope_even_interleave_buf_inline10617__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_even_tile_inline10786__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %rope_odd_tile_inline10683__tile = pto.alloc_tile addr = %c2048_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %rope_odd_interleave_buf_inline10794__rv_v2_pview = pto.partition_view %rope_odd_interleave_buf_inline10794__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
+    pto.tload ins(%rope_odd_interleave_buf_inline10794__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_odd_tile_inline10683__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %t__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tadd ins(%rope_even_tile_inline10786__tile, %rope_odd_tile_inline10683__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %rope_full_inline10618__tile = pto.alloc_tile addr = %c4096_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcvt ins(%t__tile {rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%rope_full_inline10618__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %9 = arith.muli %3, %c128_index : index
+    %10 = arith.addi %9, %5 : index
+    scf.for %rope_pack_hh_inline10802__idx_v0 = %c0_index to %c8_index step %c1_index {
+      %11 = arith.muli %rope_pack_hh_inline10802__idx_v0, %c512_index : index
+      %12 = arith.addi %11, %c448_index : index
+      %0 = pto.alloc_tile addr = %c4096_i64 valid_row = %c1_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      %slice_view = pto.subview %rope_full_inline10618__tile[%rope_pack_hh_inline10802__idx_v0, %c0_index] sizes [1, 64] : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0> -> !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      %o_packed_inline10659__iter_v12_pview = pto.partition_view %o_packed_inline10659__rv_v2_view, offsets = [%10, %12], sizes = [%c1_index, %c64_index] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x64xbf16>
+      pto.tstore ins(%slice_view : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%o_packed_inline10659__iter_v12_pview : !pto.partition_tensor_view<1x64xbf16>)
+    }
+  }
+  return
+  }
+}
diff --git a/test/samples/DeepseekV4DecodeA3/sparse_attn_test_incore_7_golden.py b/test/samples/DeepseekV4DecodeA3/sparse_attn_test_incore_7_golden.py
new file mode 100644
index 000000000..94772cb17
--- /dev/null
+++ b/test/samples/DeepseekV4DecodeA3/sparse_attn_test_incore_7_golden.py
@@ -0,0 +1,14 @@
+#!/usr/bin/python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+from deepseek_v4_decode_golden_lib import run_case
+
+
+if __name__ == "__main__":
+    run_case("sparse_attn_test_incore_7")
diff --git a/test/samples/DeepseekV4DecodeA5/README.md b/test/samples/DeepseekV4DecodeA5/README.md
new file mode 100644
index 000000000..64aeaaf75
--- /dev/null
+++ b/test/samples/DeepseekV4DecodeA5/README.md
@@ -0,0 +1,18 @@
+DeepSeek V4 decode PTO kernels for A5, generated from `hw-native-sys/pypto-lib` `models/deepseek/v4` at commit `be3c7942420b48fbab4ab1150edbc4ca8a125b94`.
+
+Scope:
+- compile-regression inputs for `ptoas`
+- board-validation inputs for direct `.pto` kernels
+
+Notes:
+- This directory vendors the primary raw `.pto` fragments emitted from these source modules:
+  - `decode_attention_csa.py`
+  - `decode_attention_hca.py`
+  - `decode_attention_swa.py`
+  - `decode_csa.py`
+  - `decode_hca.py`
+  - `decode_sparse_attn.py`
+  - `decode_swa.py`
+- The `.pto` file contents are copied directly from the PyPTO raw PTO backend output and are not hand-edited.
+- `runop.sh` defaults these cases to `--pto-arch=a5 --pto-level=level3`.
+- Board validation uses custom `*_golden.py` references for the standalone rope-pack kernel, together with the full-buffer sizes and default block arguments configured in `generate_testcase.py`.
diff --git a/test/samples/DeepseekV4DecodeA5/attention_csa_test_refresh_incore_81.pto b/test/samples/DeepseekV4DecodeA5/attention_csa_test_refresh_incore_81.pto
new file mode 100644
index 000000000..f9e79f0d0
--- /dev/null
+++ b/test/samples/DeepseekV4DecodeA5/attention_csa_test_refresh_incore_81.pto
@@ -0,0 +1,53 @@
+module attributes {pto.target_arch = "a5"} {  // review annotations are trailing comments only; every op is unchanged
+  func.func @attention_csa_test_refresh_incore_81(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %__pypto_spmd_block_idx: i32, %__pypto_spmd_block_num: i32) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {  // per block: add the two f32 inputs (%arg1, %arg2) tile by tile, convert to bf16, store rows into %arg0
+  %c0_i64 = arith.constant 0 : i64  // the i64 constants are the addr operands of pto.alloc_tile below
+  %c2048_i64 = arith.constant 2048 : i64
+  %c4096_i64 = arith.constant 4096 : i64
+  %c1024_index = arith.constant 1024 : index
+  %c4096_index = arith.constant 4096 : index
+  %c1_index = arith.constant 1 : index
+  %c8192_index = arith.constant 8192 : index
+  %c64_index = arith.constant 64 : index
+  %c8_index = arith.constant 8 : index
+  %c32_index = arith.constant 32 : index
+  %c0_index = arith.constant 0 : index
+  %c128_index = arith.constant 128 : index
+  %c512_index = arith.constant 512 : index
+  %c448_index = arith.constant 448 : index
+  %o_packed_inline1689__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c1024_index, %c4096_index], strides = [%c4096_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // %arg0 viewed as 1024x4096 bf16, row stride 4096
+  %rope_even_interleave_buf_inline1729__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // %arg1 viewed as 8192x64 f32, row stride 64
+  %rope_odd_interleave_buf_inline1709__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // %arg2 viewed as 8192x64 f32, row stride 64
+  %rope_pack_block_inline1568__ssa_v0 = arith.index_cast %__pypto_spmd_block_idx : i32 to index  // block index converted to index type
+  %1 = arith.divsi %rope_pack_block_inline1568__ssa_v0, %c8_index : index  // %1 = block_idx / 8 (signed division)
+  %2 = arith.muli %1, %c8_index : index
+  %3 = arith.subi %rope_pack_block_inline1568__ssa_v0, %2 : index  // %3 = block_idx - 8 * %1, the remainder of that division
+  %4 = arith.muli %1, %c32_index : index  // %4 = 32 * %1, base of the %5 values visited by the loop below
+  scf.for %rope_combine_dt_inline1562__idx_v0 = %c0_index to %c32_index step %c1_index {  // 32 iterations, idx = 0..31
+    %5 = arith.addi %4, %rope_combine_dt_inline1562__idx_v0 : index  // %5 = %4 + idx
+    %6 = arith.muli %5, %c64_index : index
+    %7 = arith.muli %3, %c8_index : index
+    %8 = arith.addi %6, %7 : index  // %8 = 64 * %5 + 8 * %3: first input row of this iteration's 8x64 tiles
+    %rope_even_tile_inline1627__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // 8x64 f32 tile at address 0
+    %rope_even_interleave_buf_inline1729__rv_v2_pview = pto.partition_view %rope_even_interleave_buf_inline1729__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
+    pto.tload ins(%rope_even_interleave_buf_inline1729__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_even_tile_inline1627__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // load 8 rows starting at row %8 of the %arg1 view
+    %rope_odd_tile_inline1733__tile = pto.alloc_tile addr = %c2048_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // 8x64 f32 tile at address 2048
+    %rope_odd_interleave_buf_inline1709__rv_v2_pview = pto.partition_view %rope_odd_interleave_buf_inline1709__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
+    pto.tload ins(%rope_odd_interleave_buf_inline1709__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_odd_tile_inline1733__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // load the same rows of the %arg2 view
+    %t__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // NOTE(review): same address (0) as the first input tile, so the sum presumably overwrites it in place
+    pto.tadd ins(%rope_even_tile_inline1627__tile, %rope_odd_tile_inline1733__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // %t__tile = first tile + second tile, element type f32
+    %rope_full_inline1651__tile = pto.alloc_tile addr = %c4096_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // 8x64 bf16 tile at address 4096
+    pto.tcvt ins(%t__tile {rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%rope_full_inline1651__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // f32 -> bf16 conversion with round mode ROUND
+    %9 = arith.muli %3, %c128_index : index
+    %10 = arith.addi %9, %5 : index  // %10 = 128 * %3 + %5: output row in the %arg0 view
+    scf.for %rope_pack_hh_inline1748__idx_v0 = %c0_index to %c8_index step %c1_index {  // 8 iterations, one per row of the 8x64 bf16 tile
+      %11 = arith.muli %rope_pack_hh_inline1748__idx_v0, %c512_index : index
+      %12 = arith.addi %11, %c448_index : index  // %12 = 512 * idx + 448: output column offset
+      %0 = pto.alloc_tile addr = %c4096_i64 valid_row = %c1_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // NOTE(review): %0 is not referenced by any other op in this function
+      %slice_view = pto.subview %rope_full_inline1651__tile[%rope_pack_hh_inline1748__idx_v0, %c0_index] sizes [1, 64] : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0> -> !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // 1x64 slice: row idx of the bf16 tile
+      %o_packed_inline1689__iter_v12_pview = pto.partition_view %o_packed_inline1689__rv_v2_view, offsets = [%10, %12], sizes = [%c1_index, %c64_index] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x64xbf16>
+      pto.tstore ins(%slice_view : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%o_packed_inline1689__iter_v12_pview : !pto.partition_tensor_view<1x64xbf16>)  // store that row at row %10, columns %12 .. %12 + 63 of the %arg0 view
+    }
+  }
+  return
+  }
+}
diff --git a/test/samples/DeepseekV4DecodeA5/attention_csa_test_refresh_incore_81_golden.py b/test/samples/DeepseekV4DecodeA5/attention_csa_test_refresh_incore_81_golden.py
new file mode 100644
index 000000000..be04f4648
--- /dev/null
+++ b/test/samples/DeepseekV4DecodeA5/attention_csa_test_refresh_incore_81_golden.py
@@ -0,0 +1,14 @@
+#!/usr/bin/python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+from deepseek_v4_decode_golden_lib import run_case
+
+
+if __name__ == "__main__":  # run the case only when this file is executed as a script, not on import
+    run_case("attention_csa_test_refresh_incore_81")  # run_case comes from deepseek_v4_decode_golden_lib; the argument is this case's name
diff --git a/test/samples/DeepseekV4DecodeA5/attention_hca_test_incore_54.pto b/test/samples/DeepseekV4DecodeA5/attention_hca_test_incore_54.pto
new file mode 100644
index 000000000..92efddc85
--- /dev/null
+++ b/test/samples/DeepseekV4DecodeA5/attention_hca_test_incore_54.pto
@@ -0,0 +1,53 @@
+module attributes {pto.target_arch = "a5"} {  // review annotations are trailing comments only; every op is unchanged
+  func.func @attention_hca_test_incore_54(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %__pypto_spmd_block_idx: i32, %__pypto_spmd_block_num: i32) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {  // per block: add the two f32 inputs (%arg1, %arg2) tile by tile, convert to bf16, store rows into %arg0
+  %c0_i64 = arith.constant 0 : i64  // the i64 constants are the addr operands of pto.alloc_tile below
+  %c2048_i64 = arith.constant 2048 : i64
+  %c4096_i64 = arith.constant 4096 : i64
+  %c1024_index = arith.constant 1024 : index
+  %c4096_index = arith.constant 4096 : index
+  %c1_index = arith.constant 1 : index
+  %c8192_index = arith.constant 8192 : index
+  %c64_index = arith.constant 64 : index
+  %c8_index = arith.constant 8 : index
+  %c32_index = arith.constant 32 : index
+  %c0_index = arith.constant 0 : index
+  %c128_index = arith.constant 128 : index
+  %c512_index = arith.constant 512 : index
+  %c448_index = arith.constant 448 : index
+  %o_packed_inline2857__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c1024_index, %c4096_index], strides = [%c4096_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // %arg0 viewed as 1024x4096 bf16, row stride 4096
+  %rope_even_interleave_buf_inline2862__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // %arg1 viewed as 8192x64 f32, row stride 64
+  %rope_odd_interleave_buf_inline2930__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // %arg2 viewed as 8192x64 f32, row stride 64
+  %rope_pack_block_inline2876__ssa_v0 = arith.index_cast %__pypto_spmd_block_idx : i32 to index  // block index converted to index type
+  %1 = arith.divsi %rope_pack_block_inline2876__ssa_v0, %c8_index : index  // %1 = block_idx / 8 (signed division)
+  %2 = arith.muli %1, %c8_index : index
+  %3 = arith.subi %rope_pack_block_inline2876__ssa_v0, %2 : index  // %3 = block_idx - 8 * %1, the remainder of that division
+  %4 = arith.muli %1, %c32_index : index  // %4 = 32 * %1, base of the %5 values visited by the loop below
+  scf.for %rope_combine_dt_inline2873__idx_v0 = %c0_index to %c32_index step %c1_index {  // 32 iterations, idx = 0..31
+    %5 = arith.addi %4, %rope_combine_dt_inline2873__idx_v0 : index  // %5 = %4 + idx
+    %6 = arith.muli %5, %c64_index : index
+    %7 = arith.muli %3, %c8_index : index
+    %8 = arith.addi %6, %7 : index  // %8 = 64 * %5 + 8 * %3: first input row of this iteration's 8x64 tiles
+    %rope_even_tile_inline2893__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // 8x64 f32 tile at address 0
+    %rope_even_interleave_buf_inline2862__rv_v2_pview = pto.partition_view %rope_even_interleave_buf_inline2862__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
+    pto.tload ins(%rope_even_interleave_buf_inline2862__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_even_tile_inline2893__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // load 8 rows starting at row %8 of the %arg1 view
+    %rope_odd_tile_inline2867__tile = pto.alloc_tile addr = %c2048_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // 8x64 f32 tile at address 2048
+    %rope_odd_interleave_buf_inline2930__rv_v2_pview = pto.partition_view %rope_odd_interleave_buf_inline2930__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
+    pto.tload ins(%rope_odd_interleave_buf_inline2930__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_odd_tile_inline2867__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // load the same rows of the %arg2 view
+    %t__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // NOTE(review): same address (0) as the first input tile, so the sum presumably overwrites it in place
+    pto.tadd ins(%rope_even_tile_inline2893__tile, %rope_odd_tile_inline2867__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // %t__tile = first tile + second tile, element type f32
+    %rope_full_inline2784__tile = pto.alloc_tile addr = %c4096_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // 8x64 bf16 tile at address 4096
+    pto.tcvt ins(%t__tile {rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%rope_full_inline2784__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // f32 -> bf16 conversion with round mode ROUND
+    %9 = arith.muli %3, %c128_index : index
+    %10 = arith.addi %9, %5 : index  // %10 = 128 * %3 + %5: output row in the %arg0 view
+    scf.for %rope_pack_hh_inline2871__idx_v0 = %c0_index to %c8_index step %c1_index {  // 8 iterations, one per row of the 8x64 bf16 tile
+      %11 = arith.muli %rope_pack_hh_inline2871__idx_v0, %c512_index : index
+      %12 = arith.addi %11, %c448_index : index  // %12 = 512 * idx + 448: output column offset
+      %0 = pto.alloc_tile addr = %c4096_i64 valid_row = %c1_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // NOTE(review): %0 is not referenced by any other op in this function
+      %slice_view = pto.subview %rope_full_inline2784__tile[%rope_pack_hh_inline2871__idx_v0, %c0_index] sizes [1, 64] : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0> -> !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // 1x64 slice: row idx of the bf16 tile
+      %o_packed_inline2857__iter_v12_pview = pto.partition_view %o_packed_inline2857__rv_v2_view, offsets = [%10, %12], sizes = [%c1_index, %c64_index] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x64xbf16>
+      pto.tstore ins(%slice_view : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%o_packed_inline2857__iter_v12_pview : !pto.partition_tensor_view<1x64xbf16>)  // store that row at row %10, columns %12 .. %12 + 63 of the %arg0 view
+    }
+  }
+  return
+  }
+}
diff --git a/test/samples/DeepseekV4DecodeA5/attention_hca_test_incore_54_golden.py b/test/samples/DeepseekV4DecodeA5/attention_hca_test_incore_54_golden.py
new file mode 100644
index 000000000..c06cd7cad
--- /dev/null
+++ b/test/samples/DeepseekV4DecodeA5/attention_hca_test_incore_54_golden.py
@@ -0,0 +1,14 @@
+#!/usr/bin/python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+from deepseek_v4_decode_golden_lib import run_case
+
+
+if __name__ == "__main__":  # run the case only when this file is executed as a script, not on import
+    run_case("attention_hca_test_incore_54")  # run_case comes from deepseek_v4_decode_golden_lib; the argument is this case's name
diff --git a/test/samples/DeepseekV4DecodeA5/attention_swa_test_incore_40.pto b/test/samples/DeepseekV4DecodeA5/attention_swa_test_incore_40.pto
new file mode 100644
index 000000000..e8ba1da42
--- /dev/null
+++ b/test/samples/DeepseekV4DecodeA5/attention_swa_test_incore_40.pto
@@ -0,0 +1,53 @@
+module attributes {pto.target_arch = "a5"} {  // review annotations are trailing comments only; every op is unchanged
+  func.func @attention_swa_test_incore_40(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %__pypto_spmd_block_idx: i32, %__pypto_spmd_block_num: i32) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {  // per block: add the two f32 inputs (%arg1, %arg2) tile by tile, convert to bf16, store rows into %arg0
+  %c0_i64 = arith.constant 0 : i64  // the i64 constants are the addr operands of pto.alloc_tile below
+  %c2048_i64 = arith.constant 2048 : i64
+  %c4096_i64 = arith.constant 4096 : i64
+  %c1024_index = arith.constant 1024 : index
+  %c4096_index = arith.constant 4096 : index
+  %c1_index = arith.constant 1 : index
+  %c8192_index = arith.constant 8192 : index
+  %c64_index = arith.constant 64 : index
+  %c8_index = arith.constant 8 : index
+  %c32_index = arith.constant 32 : index
+  %c0_index = arith.constant 0 : index
+  %c128_index = arith.constant 128 : index
+  %c512_index = arith.constant 512 : index
+  %c448_index = arith.constant 448 : index
+  %o_packed_inline3851__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c1024_index, %c4096_index], strides = [%c4096_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // %arg0 viewed as 1024x4096 bf16, row stride 4096
+  %rope_even_interleave_buf_inline3880__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // %arg1 viewed as 8192x64 f32, row stride 64
+  %rope_odd_interleave_buf_inline3847__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // %arg2 viewed as 8192x64 f32, row stride 64
+  %rope_pack_block_inline3768__ssa_v0 = arith.index_cast %__pypto_spmd_block_idx : i32 to index  // block index converted to index type
+  %1 = arith.divsi %rope_pack_block_inline3768__ssa_v0, %c8_index : index  // %1 = block_idx / 8 (signed division)
+  %2 = arith.muli %1, %c8_index : index
+  %3 = arith.subi %rope_pack_block_inline3768__ssa_v0, %2 : index  // %3 = block_idx - 8 * %1, the remainder of that division
+  %4 = arith.muli %1, %c32_index : index  // %4 = 32 * %1, base of the %5 values visited by the loop below
+  scf.for %rope_combine_dt_inline3765__idx_v0 = %c0_index to %c32_index step %c1_index {  // 32 iterations, idx = 0..31
+    %5 = arith.addi %4, %rope_combine_dt_inline3765__idx_v0 : index  // %5 = %4 + idx
+    %6 = arith.muli %5, %c64_index : index
+    %7 = arith.muli %3, %c8_index : index
+    %8 = arith.addi %6, %7 : index  // %8 = 64 * %5 + 8 * %3: first input row of this iteration's 8x64 tiles
+    %rope_even_tile_inline3791__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // 8x64 f32 tile at address 0
+    %rope_even_interleave_buf_inline3880__rv_v2_pview = pto.partition_view %rope_even_interleave_buf_inline3880__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
+    pto.tload ins(%rope_even_interleave_buf_inline3880__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_even_tile_inline3791__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // load 8 rows starting at row %8 of the %arg1 view
+    %rope_odd_tile_inline3796__tile = pto.alloc_tile addr = %c2048_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // 8x64 f32 tile at address 2048
+    %rope_odd_interleave_buf_inline3847__rv_v2_pview = pto.partition_view %rope_odd_interleave_buf_inline3847__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
+    pto.tload ins(%rope_odd_interleave_buf_inline3847__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_odd_tile_inline3796__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // load the same rows of the %arg2 view
+    %t__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // NOTE(review): same address (0) as the first input tile, so the sum presumably overwrites it in place
+    pto.tadd ins(%rope_even_tile_inline3791__tile, %rope_odd_tile_inline3796__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // %t__tile = first tile + second tile, element type f32
+    %rope_full_inline3924__tile = pto.alloc_tile addr = %c4096_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // 8x64 bf16 tile at address 4096
+    pto.tcvt ins(%t__tile {rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%rope_full_inline3924__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // f32 -> bf16 conversion with round mode ROUND
+    %9 = arith.muli %3, %c128_index : index
+    %10 = arith.addi %9, %5 : index  // %10 = 128 * %3 + %5: output row in the %arg0 view
+    scf.for %rope_pack_hh_inline3762__idx_v0 = %c0_index to %c8_index step %c1_index {  // 8 iterations, one per row of the 8x64 bf16 tile
+      %11 = arith.muli %rope_pack_hh_inline3762__idx_v0, %c512_index : index
+      %12 = arith.addi %11, %c448_index : index  // %12 = 512 * idx + 448: output column offset
+      %0 = pto.alloc_tile addr = %c4096_i64 valid_row = %c1_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // NOTE(review): %0 is not referenced by any other op in this function
+      %slice_view = pto.subview %rope_full_inline3924__tile[%rope_pack_hh_inline3762__idx_v0, %c0_index] sizes [1, 64] : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0> -> !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // 1x64 slice: row idx of the bf16 tile
+      %o_packed_inline3851__iter_v12_pview = pto.partition_view %o_packed_inline3851__rv_v2_view, offsets = [%10, %12], sizes = [%c1_index, %c64_index] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x64xbf16>
+      pto.tstore ins(%slice_view : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%o_packed_inline3851__iter_v12_pview : !pto.partition_tensor_view<1x64xbf16>)  // store that row at row %10, columns %12 .. %12 + 63 of the %arg0 view
+    }
+  }
+  return
+  }
+}
diff --git a/test/samples/DeepseekV4DecodeA5/attention_swa_test_incore_40_golden.py b/test/samples/DeepseekV4DecodeA5/attention_swa_test_incore_40_golden.py
new file mode 100644
index 000000000..d952e67d4
--- /dev/null
+++ b/test/samples/DeepseekV4DecodeA5/attention_swa_test_incore_40_golden.py
@@ -0,0 +1,14 @@
+#!/usr/bin/python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+from deepseek_v4_decode_golden_lib import run_case
+
+
+if __name__ == "__main__":  # run the case only when this file is executed as a script, not on import
+    run_case("attention_swa_test_incore_40")  # run_case comes from deepseek_v4_decode_golden_lib; the argument is this case's name
diff --git a/test/samples/DeepseekV4DecodeA5/decode_csa_test_incore_81.pto b/test/samples/DeepseekV4DecodeA5/decode_csa_test_incore_81.pto
new file mode 100644
index 000000000..6d90a8cc0
--- /dev/null
+++ b/test/samples/DeepseekV4DecodeA5/decode_csa_test_incore_81.pto
@@ -0,0 +1,53 @@
+module attributes {pto.target_arch = "a5"} {  // review annotations are trailing comments only; every op is unchanged
+  func.func @decode_csa_test_incore_81(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %__pypto_spmd_block_idx: i32, %__pypto_spmd_block_num: i32) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {  // per block: add the two f32 inputs (%arg1, %arg2) tile by tile, convert to bf16, store rows into %arg0
+  %c0_i64 = arith.constant 0 : i64  // the i64 constants are the addr operands of pto.alloc_tile below
+  %c2048_i64 = arith.constant 2048 : i64
+  %c4096_i64 = arith.constant 4096 : i64
+  %c1024_index = arith.constant 1024 : index
+  %c4096_index = arith.constant 4096 : index
+  %c1_index = arith.constant 1 : index
+  %c8192_index = arith.constant 8192 : index
+  %c64_index = arith.constant 64 : index
+  %c8_index = arith.constant 8 : index
+  %c32_index = arith.constant 32 : index
+  %c0_index = arith.constant 0 : index
+  %c128_index = arith.constant 128 : index
+  %c512_index = arith.constant 512 : index
+  %c448_index = arith.constant 448 : index
+  %o_packed_inline5166_inline5302__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c1024_index, %c4096_index], strides = [%c4096_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // %arg0 viewed as 1024x4096 bf16, row stride 4096
+  %rope_even_interleave_buf_inline5130_inline5946__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // %arg1 viewed as 8192x64 f32, row stride 64
+  %rope_odd_interleave_buf_inline5186_inline5504__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // %arg2 viewed as 8192x64 f32, row stride 64
+  %rope_pack_block_inline5024_inline5227__ssa_v0 = arith.index_cast %__pypto_spmd_block_idx : i32 to index  // block index converted to index type
+  %1 = arith.divsi %rope_pack_block_inline5024_inline5227__ssa_v0, %c8_index : index  // %1 = block_idx / 8 (signed division)
+  %2 = arith.muli %1, %c8_index : index
+  %3 = arith.subi %rope_pack_block_inline5024_inline5227__ssa_v0, %2 : index  // %3 = block_idx - 8 * %1, the remainder of that division
+  %4 = arith.muli %1, %c32_index : index  // %4 = 32 * %1, base of the %5 values visited by the loop below
+  scf.for %rope_combine_dt_inline5054_inline5659__idx_v0 = %c0_index to %c32_index step %c1_index {  // 32 iterations, idx = 0..31
+    %5 = arith.addi %4, %rope_combine_dt_inline5054_inline5659__idx_v0 : index  // %5 = %4 + idx
+    %6 = arith.muli %5, %c64_index : index
+    %7 = arith.muli %3, %c8_index : index
+    %8 = arith.addi %6, %7 : index  // %8 = 64 * %5 + 8 * %3: first input row of this iteration's 8x64 tiles
+    %rope_even_tile_inline4995_inline5303__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // 8x64 f32 tile at address 0
+    %rope_even_interleave_buf_inline5130_inline5946__rv_v2_pview = pto.partition_view %rope_even_interleave_buf_inline5130_inline5946__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
+    pto.tload ins(%rope_even_interleave_buf_inline5130_inline5946__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_even_tile_inline4995_inline5303__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // load 8 rows starting at row %8 of the %arg1 view
+    %rope_odd_tile_inline5019_inline5875__tile = pto.alloc_tile addr = %c2048_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // 8x64 f32 tile at address 2048
+    %rope_odd_interleave_buf_inline5186_inline5504__rv_v2_pview = pto.partition_view %rope_odd_interleave_buf_inline5186_inline5504__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
+    pto.tload ins(%rope_odd_interleave_buf_inline5186_inline5504__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_odd_tile_inline5019_inline5875__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // load the same rows of the %arg2 view
+    %t__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // NOTE(review): same address (0) as the first input tile, so the sum presumably overwrites it in place
+    pto.tadd ins(%rope_even_tile_inline4995_inline5303__tile, %rope_odd_tile_inline5019_inline5875__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // %t__tile = first tile + second tile, element type f32
+    %rope_full_inline4992_inline5443__tile = pto.alloc_tile addr = %c4096_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // 8x64 bf16 tile at address 4096
+    pto.tcvt ins(%t__tile {rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%rope_full_inline4992_inline5443__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // f32 -> bf16 conversion with round mode ROUND
+    %9 = arith.muli %3, %c128_index : index
+    %10 = arith.addi %9, %5 : index  // %10 = 128 * %3 + %5: output row in the %arg0 view
+    scf.for %rope_pack_hh_inline4999_inline5764__idx_v0 = %c0_index to %c8_index step %c1_index {  // 8 iterations, one per row of the 8x64 bf16 tile
+      %11 = arith.muli %rope_pack_hh_inline4999_inline5764__idx_v0, %c512_index : index
+      %12 = arith.addi %11, %c448_index : index  // %12 = 512 * idx + 448: output column offset
+      %0 = pto.alloc_tile addr = %c4096_i64 valid_row = %c1_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // NOTE(review): %0 is not referenced by any other op in this function
+      %slice_view = pto.subview %rope_full_inline4992_inline5443__tile[%rope_pack_hh_inline4999_inline5764__idx_v0, %c0_index] sizes [1, 64] : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0> -> !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // 1x64 slice: row idx of the bf16 tile
+      %o_packed_inline5166_inline5302__iter_v12_pview = pto.partition_view %o_packed_inline5166_inline5302__rv_v2_view, offsets = [%10, %12], sizes = [%c1_index, %c64_index] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x64xbf16>
+      pto.tstore ins(%slice_view : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%o_packed_inline5166_inline5302__iter_v12_pview : !pto.partition_tensor_view<1x64xbf16>)  // store that row at row %10, columns %12 .. %12 + 63 of the %arg0 view
+    }
+  }
+  return
+  }
+}
diff --git a/test/samples/DeepseekV4DecodeA5/decode_csa_test_incore_81_golden.py b/test/samples/DeepseekV4DecodeA5/decode_csa_test_incore_81_golden.py
new file mode 100644
index 000000000..dd19fb8e8
--- /dev/null
+++ b/test/samples/DeepseekV4DecodeA5/decode_csa_test_incore_81_golden.py
@@ -0,0 +1,14 @@
+#!/usr/bin/python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+from deepseek_v4_decode_golden_lib import run_case
+
+
+if __name__ == "__main__":  # run the case only when this file is executed as a script, not on import
+    run_case("decode_csa_test_incore_81")  # run_case comes from deepseek_v4_decode_golden_lib; the argument is this case's name
diff --git a/test/samples/DeepseekV4DecodeA5/decode_hca_test_incore_54.pto b/test/samples/DeepseekV4DecodeA5/decode_hca_test_incore_54.pto
new file mode 100644
index 000000000..86635e446
--- /dev/null
+++ b/test/samples/DeepseekV4DecodeA5/decode_hca_test_incore_54.pto
@@ -0,0 +1,53 @@
+module attributes {pto.target_arch = "a5"} {
+  func.func @decode_hca_test_incore_54(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %__pypto_spmd_block_idx: i32, %__pypto_spmd_block_num: i32) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} { // %arg0 is stored to, %arg1/%arg2 are loaded from; %__pypto_spmd_block_num is not referenced in the body
+  %c0_i64 = arith.constant 0 : i64
+  %c2048_i64 = arith.constant 2048 : i64
+  %c4096_i64 = arith.constant 4096 : i64
+  %c1024_index = arith.constant 1024 : index
+  %c4096_index = arith.constant 4096 : index
+  %c1_index = arith.constant 1 : index
+  %c8192_index = arith.constant 8192 : index
+  %c64_index = arith.constant 64 : index
+  %c8_index = arith.constant 8 : index
+  %c32_index = arith.constant 32 : index
+  %c0_index = arith.constant 0 : index
+  %c128_index = arith.constant 128 : index
+  %c512_index = arith.constant 512 : index
+  %c448_index = arith.constant 448 : index
+  %o_packed_inline8609_inline9735__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c1024_index, %c4096_index], strides = [%c4096_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16> // %arg0 viewed as 1024 x 4096 bf16, row stride 4096
+  %rope_even_interleave_buf_inline8627_inline9736__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32> // %arg1 viewed as 8192 x 64 f32
+  %rope_odd_interleave_buf_inline8617_inline9986__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32> // %arg2 viewed as 8192 x 64 f32
+  %rope_pack_block_inline8470_inline9832__ssa_v0 = arith.index_cast %__pypto_spmd_block_idx : i32 to index
+  %1 = arith.divsi %rope_pack_block_inline8470_inline9832__ssa_v0, %c8_index : index // %1 = block_idx / 8
+  %2 = arith.muli %1, %c8_index : index
+  %3 = arith.subi %rope_pack_block_inline8470_inline9832__ssa_v0, %2 : index // %3 = block_idx - %1 * 8
+  %4 = arith.muli %1, %c32_index : index // %4 = %1 * 32
+  scf.for %rope_combine_dt_inline8542_inline10208__idx_v0 = %c0_index to %c32_index step %c1_index { // 32 iterations
+    %5 = arith.addi %4, %rope_combine_dt_inline8542_inline10208__idx_v0 : index
+    %6 = arith.muli %5, %c64_index : index
+    %7 = arith.muli %3, %c8_index : index
+    %8 = arith.addi %6, %7 : index // %8 = %5 * 64 + %3 * 8, first source row of both loads below
+    %rope_even_tile_inline8641_inline10183__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %rope_even_interleave_buf_inline8627_inline9736__rv_v2_pview = pto.partition_view %rope_even_interleave_buf_inline8627_inline9736__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
+    pto.tload ins(%rope_even_interleave_buf_inline8627_inline9736__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_even_tile_inline8641_inline10183__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %rope_odd_tile_inline8468_inline9641__tile = pto.alloc_tile addr = %c2048_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %rope_odd_interleave_buf_inline8617_inline9986__rv_v2_pview = pto.partition_view %rope_odd_interleave_buf_inline8617_inline9986__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
+    pto.tload ins(%rope_odd_interleave_buf_inline8617_inline9986__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_odd_tile_inline8468_inline9641__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %t__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0> // same addr operand (%c0_i64) as the even tile above
+    pto.tadd ins(%rope_even_tile_inline8641_inline10183__tile, %rope_odd_tile_inline8468_inline9641__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %rope_full_inline8624_inline9902__tile = pto.alloc_tile addr = %c4096_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcvt ins(%t__tile {rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%rope_full_inline8624_inline9902__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // f32 sum converted to bf16 with round_mode ROUND
+    %9 = arith.muli %3, %c128_index : index
+    %10 = arith.addi %9, %5 : index // %10 = %3 * 128 + %5, destination row
+    scf.for %rope_pack_hh_inline8588_inline9639__idx_v0 = %c0_index to %c8_index step %c1_index { // 8 iterations, one per row of the bf16 tile
+      %11 = arith.muli %rope_pack_hh_inline8588_inline9639__idx_v0, %c512_index : index
+      %12 = arith.addi %11, %c448_index : index // %12 = hh * 512 + 448, destination column
+      %0 = pto.alloc_tile addr = %c4096_i64 valid_row = %c1_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0> // %0 has no uses in this function
+      %slice_view = pto.subview %rope_full_inline8624_inline9902__tile[%rope_pack_hh_inline8588_inline9639__idx_v0, %c0_index] sizes [1, 64] : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0> -> !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      %o_packed_inline8609_inline9735__iter_v12_pview = pto.partition_view %o_packed_inline8609_inline9735__rv_v2_view, offsets = [%10, %12], sizes = [%c1_index, %c64_index] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x64xbf16>
+      pto.tstore ins(%slice_view : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%o_packed_inline8609_inline9735__iter_v12_pview : !pto.partition_tensor_view<1x64xbf16>) // row hh of the bf16 tile goes to the 1 x 64 window at [%10, %12]
+    }
+  }
+  return
+  }
+}
diff --git a/test/samples/DeepseekV4DecodeA5/decode_hca_test_incore_54_golden.py b/test/samples/DeepseekV4DecodeA5/decode_hca_test_incore_54_golden.py
new file mode 100644
index 000000000..586712473
--- /dev/null
+++ b/test/samples/DeepseekV4DecodeA5/decode_hca_test_incore_54_golden.py
@@ -0,0 +1,14 @@
+#!/usr/bin/python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+from deepseek_v4_decode_golden_lib import run_case
+
+
+if __name__ == "__main__":  # only runs when this file is executed as a script
+    run_case("decode_hca_test_incore_54")  # run_case raises KeyError unless this name is in the library's SUPPORTED_CASES
diff --git a/test/samples/DeepseekV4DecodeA5/decode_swa_test_incore_40.pto b/test/samples/DeepseekV4DecodeA5/decode_swa_test_incore_40.pto
new file mode 100644
index 000000000..f4535d03a
--- /dev/null
+++ b/test/samples/DeepseekV4DecodeA5/decode_swa_test_incore_40.pto
@@ -0,0 +1,53 @@
+module attributes {pto.target_arch = "a5"} {
+  func.func @decode_swa_test_incore_40(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %__pypto_spmd_block_idx: i32, %__pypto_spmd_block_num: i32) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} { // %arg0 is stored to, %arg1/%arg2 are loaded from; %__pypto_spmd_block_num is not referenced in the body
+  %c0_i64 = arith.constant 0 : i64
+  %c2048_i64 = arith.constant 2048 : i64
+  %c4096_i64 = arith.constant 4096 : i64
+  %c1024_index = arith.constant 1024 : index
+  %c4096_index = arith.constant 4096 : index
+  %c1_index = arith.constant 1 : index
+  %c8192_index = arith.constant 8192 : index
+  %c64_index = arith.constant 64 : index
+  %c8_index = arith.constant 8 : index
+  %c32_index = arith.constant 32 : index
+  %c0_index = arith.constant 0 : index
+  %c128_index = arith.constant 128 : index
+  %c512_index = arith.constant 512 : index
+  %c448_index = arith.constant 448 : index
+  %o_packed_inline11646_inline11838__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c1024_index, %c4096_index], strides = [%c4096_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16> // %arg0 viewed as 1024 x 4096 bf16, row stride 4096
+  %rope_even_interleave_buf_inline11658_inline11945__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32> // %arg1 viewed as 8192 x 64 f32
+  %rope_odd_interleave_buf_inline11652_inline11841__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32> // %arg2 viewed as 8192 x 64 f32
+  %rope_pack_block_inline11509_inline11925__ssa_v0 = arith.index_cast %__pypto_spmd_block_idx : i32 to index
+  %1 = arith.divsi %rope_pack_block_inline11509_inline11925__ssa_v0, %c8_index : index // %1 = block_idx / 8
+  %2 = arith.muli %1, %c8_index : index
+  %3 = arith.subi %rope_pack_block_inline11509_inline11925__ssa_v0, %2 : index // %3 = block_idx - %1 * 8
+  %4 = arith.muli %1, %c32_index : index // %4 = %1 * 32
+  scf.for %rope_combine_dt_inline11505_inline11741__idx_v0 = %c0_index to %c32_index step %c1_index { // 32 iterations
+    %5 = arith.addi %4, %rope_combine_dt_inline11505_inline11741__idx_v0 : index
+    %6 = arith.muli %5, %c64_index : index
+    %7 = arith.muli %3, %c8_index : index
+    %8 = arith.addi %6, %7 : index // %8 = %5 * 64 + %3 * 8, first source row of both loads below
+    %rope_even_tile_inline11616_inline11782__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %rope_even_interleave_buf_inline11658_inline11945__rv_v2_pview = pto.partition_view %rope_even_interleave_buf_inline11658_inline11945__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
+    pto.tload ins(%rope_even_interleave_buf_inline11658_inline11945__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_even_tile_inline11616_inline11782__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %rope_odd_tile_inline11503_inline11738__tile = pto.alloc_tile addr = %c2048_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %rope_odd_interleave_buf_inline11652_inline11841__rv_v2_pview = pto.partition_view %rope_odd_interleave_buf_inline11652_inline11841__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
+    pto.tload ins(%rope_odd_interleave_buf_inline11652_inline11841__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_odd_tile_inline11503_inline11738__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %t__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0> // same addr operand (%c0_i64) as the even tile above
+    pto.tadd ins(%rope_even_tile_inline11616_inline11782__tile, %rope_odd_tile_inline11503_inline11738__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %rope_full_inline11673_inline12024__tile = pto.alloc_tile addr = %c4096_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcvt ins(%t__tile {rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%rope_full_inline11673_inline12024__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // f32 sum converted to bf16 with round_mode ROUND
+    %9 = arith.muli %3, %c128_index : index
+    %10 = arith.addi %9, %5 : index // %10 = %3 * 128 + %5, destination row
+    scf.for %rope_pack_hh_inline11502_inline11737__idx_v0 = %c0_index to %c8_index step %c1_index { // 8 iterations, one per row of the bf16 tile
+      %11 = arith.muli %rope_pack_hh_inline11502_inline11737__idx_v0, %c512_index : index
+      %12 = arith.addi %11, %c448_index : index // %12 = hh * 512 + 448, destination column
+      %0 = pto.alloc_tile addr = %c4096_i64 valid_row = %c1_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0> // %0 has no uses in this function
+      %slice_view = pto.subview %rope_full_inline11673_inline12024__tile[%rope_pack_hh_inline11502_inline11737__idx_v0, %c0_index] sizes [1, 64] : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0> -> !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      %o_packed_inline11646_inline11838__iter_v12_pview = pto.partition_view %o_packed_inline11646_inline11838__rv_v2_view, offsets = [%10, %12], sizes = [%c1_index, %c64_index] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x64xbf16>
+      pto.tstore ins(%slice_view : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%o_packed_inline11646_inline11838__iter_v12_pview : !pto.partition_tensor_view<1x64xbf16>) // row hh of the bf16 tile goes to the 1 x 64 window at [%10, %12]
+    }
+  }
+  return
+  }
+}
diff --git a/test/samples/DeepseekV4DecodeA5/decode_swa_test_incore_40_golden.py b/test/samples/DeepseekV4DecodeA5/decode_swa_test_incore_40_golden.py
new file mode 100644
index 000000000..9314c315d
--- /dev/null
+++ b/test/samples/DeepseekV4DecodeA5/decode_swa_test_incore_40_golden.py
@@ -0,0 +1,14 @@
+#!/usr/bin/python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+from deepseek_v4_decode_golden_lib import run_case
+
+
+if __name__ == "__main__":  # only runs when this file is executed as a script
+    run_case("decode_swa_test_incore_40")  # run_case raises KeyError unless this name is in the library's SUPPORTED_CASES
diff --git a/test/samples/DeepseekV4DecodeA5/deepseek_v4_decode_golden_lib.py b/test/samples/DeepseekV4DecodeA5/deepseek_v4_decode_golden_lib.py
new file mode 100644
index 000000000..008d217c9
--- /dev/null
+++ b/test/samples/DeepseekV4DecodeA5/deepseek_v4_decode_golden_lib.py
@@ -0,0 +1,113 @@
+#!/usr/bin/python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+import numpy as np
+
+from validation_runtime import (
+    float32_to_bf16,
+    load_case_meta,
+    load_int32_assignments,
+    rng,
+    write_buffers,
+    write_golden,
+)
+
+SUPPORTED_CASES = frozenset({  # case names accepted by run_case(); any other name raises KeyError
+    "attention_csa_test_refresh_incore_81",
+    "attention_hca_test_incore_54",
+    "attention_swa_test_incore_40",
+    "decode_csa_test_incore_81",
+    "decode_hca_test_incore_54",
+    "decode_swa_test_incore_40",
+    "sparse_attn_test_incore_7",
+})
+
+OUTPUT_ROWS = 1024  # rows of buffer "v1" once build_case reshapes it to OUTPUT_ROWS x OUTPUT_COLS
+OUTPUT_COLS = 4096  # columns of buffer "v1"
+INPUT_ROWS = 8192  # rows of each of the buffers "v2" and "v3"
+INPUT_COLS = 64  # columns of "v2"/"v3"; build_case also multiplies dt_idx by it to find the source row
+BLOCK_GROUP = 8  # block_idx // BLOCK_GROUP is the group index, block_idx % BLOCK_GROUP the lane index
+DT_PER_GROUP = 32  # number of dt iterations build_case runs for one block
+HH_PER_TILE = 8  # rows summed per source tile; each row lands in its own output column window
+OUTPUT_ROW_GROUP = 128  # output-row stride between consecutive lane indices
+OUTPUT_COL_STRIDE = 512  # output-column stride between consecutive hh rows
+OUTPUT_COL_BASE = 448  # output column where the hh == 0 row starts
+
+
+def _require_count(meta, name: str, expected: int) -> None:  # raise ValueError unless meta.elem_counts[name] equals expected
+    actual = int(meta.elem_counts[name])  # element count the case metadata records for buffer `name`
+    if actual != expected:
+        raise ValueError(f"{name}: expected {expected} elements, got {actual}")
+
+
+def _make_bf16_zeros(meta, name: str, expected: int) -> np.ndarray:  # flat zero-filled buffer of `expected` elements
+    _require_count(meta, name, expected)  # raises ValueError when the metadata count differs from `expected`
+    return np.zeros(expected, dtype=meta.np_types[name])  # dtype is whatever the metadata maps `name` to
+
+
+def _make_fp32_input(meta, name: str, generator, expected: int) -> np.ndarray:  # flat random buffer drawn from uniform(-0.5, 0.5)
+    _require_count(meta, name, expected)  # raises ValueError when the metadata count differs from `expected`
+    values = generator.uniform(-0.5, 0.5, size=expected).astype(np.float32)  # samples are cast to float32 first
+    return values.astype(meta.np_types[name], copy=False)  # then to the metadata dtype; copy=False skips the copy if it already matches
+
+
+def build_case(meta, generator, ints):  # returns (flat buffers by name, expected "v1" contents by name) for one block
+    if meta.outputs != ["v1"]:
+        raise ValueError(f"unexpected outputs: {meta.outputs}")
+    if meta.read_order != ["v1", "v2", "v3"]:
+        raise ValueError(f"unexpected read order: {meta.read_order}")
+    if len(ints) < 2:
+        raise ValueError(f"expected block_idx/block_num int32 params, got {ints}")
+
+    block_idx, block_num = ints[:2]  # only the first two ints are read; any further entries are ignored
+    if block_num <= 0:
+        raise ValueError(f"invalid block_num={block_num}")
+    if block_idx < 0 or block_idx >= block_num:
+        raise ValueError(f"invalid block_idx={block_idx} for block_num={block_num}")
+
+    output_elems = OUTPUT_ROWS * OUTPUT_COLS
+    input_elems = INPUT_ROWS * INPUT_COLS
+    buffers = {
+        "v1": _make_bf16_zeros(meta, "v1", output_elems),  # output buffer starts as all zeros
+        "v2": _make_fp32_input(meta, "v2", generator, input_elems),
+        "v3": _make_fp32_input(meta, "v3", generator, input_elems),  # drawn after "v2" from the same generator
+    }
+
+    out = np.array(buffers["v1"], copy=True).reshape(OUTPUT_ROWS, OUTPUT_COLS)  # copy, so buffers["v1"] itself stays zero
+    rope_even = np.asarray(buffers["v2"], dtype=np.float32).reshape(INPUT_ROWS, INPUT_COLS)
+    rope_odd = np.asarray(buffers["v3"], dtype=np.float32).reshape(INPUT_ROWS, INPUT_COLS)
+
+    group_idx = block_idx // BLOCK_GROUP
+    lane_idx = block_idx % BLOCK_GROUP
+    dt_base = group_idx * DT_PER_GROUP  # first dt index handled for this block
+    out_row_base = lane_idx * OUTPUT_ROW_GROUP
+    src_row_lane_offset = lane_idx * HH_PER_TILE
+
+    for dt in range(DT_PER_GROUP):
+        dt_idx = dt_base + dt
+        src_row = dt_idx * INPUT_COLS + src_row_lane_offset  # NOTE(review): INPUT_COLS (64) doubles as the per-dt row stride here; confirm that is intended
+        tile = rope_even[src_row:src_row + HH_PER_TILE, :] + rope_odd[src_row:src_row + HH_PER_TILE, :]  # element-wise sum of HH_PER_TILE consecutive rows
+        tile_bf16 = float32_to_bf16(tile)  # conversion helper imported from validation_runtime
+        dst_row = out_row_base + dt_idx
+        for hh in range(HH_PER_TILE):
+            col0 = OUTPUT_COL_BASE + hh * OUTPUT_COL_STRIDE
+            out[dst_row, col0:col0 + INPUT_COLS] = tile_bf16[hh]  # row hh fills an INPUT_COLS-wide window of the destination row
+
+    return buffers, {"v1": out.reshape(-1)}  # golden output is flattened back to 1-D
+
+
+def run_case(case_name: str):  # validate the case name, then generate and write input buffers and golden output
+    if case_name not in SUPPORTED_CASES:
+        raise KeyError(f"unsupported case: {case_name}")
+    meta = load_case_meta()  # helper from validation_runtime, called without arguments
+    generator = rng()
+    ints = load_int32_assignments()  # build_case reads block_idx and block_num from the first two entries
+    buffers, golden = build_case(meta, generator, ints)  # case_name is not passed on: every supported case shares this builder
+    write_buffers(meta, buffers)
+    write_golden(meta, golden)
diff --git a/test/samples/DeepseekV4DecodeA5/sparse_attn_test_incore_7.pto b/test/samples/DeepseekV4DecodeA5/sparse_attn_test_incore_7.pto
new file mode 100644
index 000000000..90cf94b28
--- /dev/null
+++ b/test/samples/DeepseekV4DecodeA5/sparse_attn_test_incore_7.pto
@@ -0,0 +1,53 @@
+module attributes {pto.target_arch = "a5"} {
+  func.func @sparse_attn_test_incore_7(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %__pypto_spmd_block_idx: i32, %__pypto_spmd_block_num: i32) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} { // %arg0 is stored to, %arg1/%arg2 are loaded from; %__pypto_spmd_block_num is not referenced in the body
+  %c0_i64 = arith.constant 0 : i64
+  %c2048_i64 = arith.constant 2048 : i64
+  %c4096_i64 = arith.constant 4096 : i64
+  %c1024_index = arith.constant 1024 : index
+  %c4096_index = arith.constant 4096 : index
+  %c1_index = arith.constant 1 : index
+  %c8192_index = arith.constant 8192 : index
+  %c64_index = arith.constant 64 : index
+  %c8_index = arith.constant 8 : index
+  %c32_index = arith.constant 32 : index
+  %c0_index = arith.constant 0 : index
+  %c128_index = arith.constant 128 : index
+  %c512_index = arith.constant 512 : index
+  %c448_index = arith.constant 448 : index
+  %o_packed_inline10752__rv_v2_view = pto.make_tensor_view %arg0, shape = [%c1024_index, %c4096_index], strides = [%c4096_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16> // %arg0 viewed as 1024 x 4096 bf16, row stride 4096
+  %rope_even_interleave_buf_inline10767__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32> // %arg1 viewed as 8192 x 64 f32
+  %rope_odd_interleave_buf_inline10773__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8192_index, %c64_index], strides = [%c64_index, %c1_index] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32> // %arg2 viewed as 8192 x 64 f32
+  %rope_pack_block_inline10687__ssa_v0 = arith.index_cast %__pypto_spmd_block_idx : i32 to index
+  %1 = arith.divsi %rope_pack_block_inline10687__ssa_v0, %c8_index : index // %1 = block_idx / 8
+  %2 = arith.muli %1, %c8_index : index
+  %3 = arith.subi %rope_pack_block_inline10687__ssa_v0, %2 : index // %3 = block_idx - %1 * 8
+  %4 = arith.muli %1, %c32_index : index // %4 = %1 * 32
+  scf.for %rope_combine_dt_inline10698__idx_v0 = %c0_index to %c32_index step %c1_index { // 32 iterations
+    %5 = arith.addi %4, %rope_combine_dt_inline10698__idx_v0 : index
+    %6 = arith.muli %5, %c64_index : index
+    %7 = arith.muli %3, %c8_index : index
+    %8 = arith.addi %6, %7 : index // %8 = %5 * 64 + %3 * 8, first source row of both loads below
+    %rope_even_tile_inline10807__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %rope_even_interleave_buf_inline10767__rv_v2_pview = pto.partition_view %rope_even_interleave_buf_inline10767__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
+    pto.tload ins(%rope_even_interleave_buf_inline10767__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_even_tile_inline10807__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %rope_odd_tile_inline10623__tile = pto.alloc_tile addr = %c2048_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %rope_odd_interleave_buf_inline10773__rv_v2_pview = pto.partition_view %rope_odd_interleave_buf_inline10773__rv_v2_view, offsets = [%8, %c0_index], sizes = [%c8_index, %c64_index] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
+    pto.tload ins(%rope_odd_interleave_buf_inline10773__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%rope_odd_tile_inline10623__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %t__tile = pto.alloc_tile addr = %c0_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0> // same addr operand (%c0_i64) as the even tile above
+    pto.tadd ins(%rope_even_tile_inline10807__tile, %rope_odd_tile_inline10623__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %rope_full_inline10765__tile = pto.alloc_tile addr = %c4096_i64 valid_row = %c8_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcvt ins(%t__tile {rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%rope_full_inline10765__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) // f32 sum converted to bf16 with round_mode ROUND
+    %9 = arith.muli %3, %c128_index : index
+    %10 = arith.addi %9, %5 : index // %10 = %3 * 128 + %5, destination row
+    scf.for %rope_pack_hh_inline10731__idx_v0 = %c0_index to %c8_index step %c1_index { // 8 iterations, one per row of the bf16 tile
+      %11 = arith.muli %rope_pack_hh_inline10731__idx_v0, %c512_index : index
+      %12 = arith.addi %11, %c448_index : index // %12 = hh * 512 + 448, destination column
+      %0 = pto.alloc_tile addr = %c4096_i64 valid_row = %c1_index valid_col = %c64_index : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0> // %0 has no uses in this function
+      %slice_view = pto.subview %rope_full_inline10765__tile[%rope_pack_hh_inline10731__idx_v0, %c0_index] sizes [1, 64] : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0> -> !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      %o_packed_inline10752__iter_v12_pview = pto.partition_view %o_packed_inline10752__rv_v2_view, offsets = [%10, %12], sizes = [%c1_index, %c64_index] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x64xbf16>
+      pto.tstore ins(%slice_view : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%o_packed_inline10752__iter_v12_pview : !pto.partition_tensor_view<1x64xbf16>) // row hh of the bf16 tile goes to the 1 x 64 window at [%10, %12]
+    }
+  }
+  return
+  }
+}
diff --git a/test/samples/DeepseekV4DecodeA5/sparse_attn_test_incore_7_golden.py b/test/samples/DeepseekV4DecodeA5/sparse_attn_test_incore_7_golden.py
new file mode 100644
index 000000000..94772cb17
--- /dev/null
+++ b/test/samples/DeepseekV4DecodeA5/sparse_attn_test_incore_7_golden.py
@@ -0,0 +1,14 @@
+#!/usr/bin/python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+from deepseek_v4_decode_golden_lib import run_case
+
+
+if __name__ == "__main__":  # only runs when this file is executed as a script
+    run_case("sparse_attn_test_incore_7")  # run_case raises KeyError unless this name is in the library's SUPPORTED_CASES
diff --git a/test/samples/runop.sh b/test/samples/runop.sh
index 13a449071..bbf1e056b 100755
--- a/test/samples/runop.sh
+++ b/test/samples/runop.sh
@@ -20,7 +20,7 @@ PTOAS_OUT_DIR="${PTOAS_OUT_DIR:-}"
 PTO_BUILD_DIR="${PTO_BUILD_DIR:-}"
 PTOAS_ENABLE_INSERT_SYNC="${PTOAS_ENABLE_INSERT_SYNC:-1}"
 PTOAS_FLAGS="${PTOAS_FLAGS:-}"
-PTO_PTO_DIRS="${PTO_PTO_DIRS:-Sync Qwen3DecodeA3 Qwen3DecodeA5 CommSync}"
+PTO_PTO_DIRS="${PTO_PTO_DIRS:-Sync Qwen3DecodeA3 Qwen3DecodeA5 DeepseekV4DecodeA3 DeepseekV4DecodeA5 CommSync}"
 ENABLE_BC=0
 
 usage() {
@@ -38,7 +38,7 @@ Env:
   PTO_BUILD_DIR  # build directory root that contains tools/ptoas and tools/ptobc (optional)
   PTOAS_FLAGS  # extra flags passed to ptoas (e.g. --enable-insert-sync)
   PTOAS_ENABLE_INSERT_SYNC  # 1 to append --enable-insert-sync to PTOAS_FLAGS (default: 1)
-  PTO_PTO_DIRS  # space-separated dirs to run .pto directly (default: Sync Qwen3DecodeA3 Qwen3DecodeA5)
+  PTO_PTO_DIRS  # space-separated dirs to run .pto directly (default: Sync Qwen3DecodeA3 Qwen3DecodeA5 DeepseekV4DecodeA3 DeepseekV4DecodeA5 CommSync)
 
 Flags:
   --enablebc  # enable: python -> .pto -> ptobc -> .pto -> ptoas
@@ -167,7 +167,7 @@ process_one_dir() {
   if [[ "${ENABLE_BC}" == "1" ]]; then
     use_ptobc_roundtrip=1
   fi
-  if [[ "$A" == "Qwen3DecodeA3" || "$A" == "Qwen3DecodeA5" ]]; then
+  if [[ "$A" == "Qwen3DecodeA3" || "$A" == "Qwen3DecodeA5" || "$A" == "DeepseekV4DecodeA3" || "$A" == "DeepseekV4DecodeA5" ]]; then
     use_ptobc_roundtrip=0
   fi
   local -a ptoas_flags=()
@@ -206,7 +206,7 @@ process_one_dir() {
       fi
     done
   fi
-  if [[ "$A" == "Qwen3DecodeA5" ]]; then
+  if [[ "$A" == "Qwen3DecodeA5" || "$A" == "DeepseekV4DecodeA5" ]]; then
     if [[ $has_pto_arch_override -eq 0 ]]; then
       ptoas_flags+=(--pto-arch a5)
       target_arch="a5"
@@ -214,7 +214,7 @@ process_one_dir() {
     if [[ $has_pto_level_override -eq 0 ]]; then
       ptoas_flags+=(--pto-level=level3)
     fi
-  elif [[ "$A" == "Qwen3DecodeA3" ]]; then
+  elif [[ "$A" == "Qwen3DecodeA3" || "$A" == "DeepseekV4DecodeA3" ]]; then
     if [[ $has_pto_level_override -eq 0 ]]; then
       ptoas_flags+=(--pto-level=level3)
     fi
@@ -251,47 +251,47 @@ process_one_dir() {
   fi
   local soc_lc="${SOC_VERSION:-}"
   soc_lc="$(printf '%s' "${soc_lc}" | tr '[:upper:]' '[:lower:]')"
-  if [[ "$A" == "Qwen3DecodeA3" && "${target_arch_lc}" != "a3" ]]; then
-    local qwen_case
-    for qwen_case in "$dir"/*.pto; do
-      [[ -f "$qwen_case" ]] || continue
-      case "$qwen_case" in
+  if [[ ( "$A" == "Qwen3DecodeA3" || "$A" == "DeepseekV4DecodeA3" ) && "${target_arch_lc}" != "a3" ]]; then
+    local direct_case
+    for direct_case in "$dir"/*.pto; do
+      [[ -f "$direct_case" ]] || continue
+      case "$direct_case" in
         *-pto-ir.pto) continue ;;
       esac
-      echo -e "${A}($(basename "$qwen_case"))\tSKIP\trequires --pto-arch=a3"
+      echo -e "${A}($(basename "$direct_case"))\tSKIP\trequires --pto-arch=a3"
     done
     return 0
   fi
-  if [[ "$A" == "Qwen3DecodeA3" && -n "${soc_lc}" && ( "${soc_lc}" == *"a5"* || "${soc_lc}" == *"950"* ) ]]; then
-    local qwen_case
-    for qwen_case in "$dir"/*.pto; do
-      [[ -f "$qwen_case" ]] || continue
-      case "$qwen_case" in
+  if [[ ( "$A" == "Qwen3DecodeA3" || "$A" == "DeepseekV4DecodeA3" ) && -n "${soc_lc}" && ( "${soc_lc}" == *"a5"* || "${soc_lc}" == *"950"* ) ]]; then
+    local direct_case
+    for direct_case in "$dir"/*.pto; do
+      [[ -f "$direct_case" ]] || continue
+      case "$direct_case" in
         *-pto-ir.pto) continue ;;
       esac
-      echo -e "${A}($(basename "$qwen_case"))\tSKIP\trequires A3 target SOC"
+      echo -e "${A}($(basename "$direct_case"))\tSKIP\trequires A3 target SOC"
     done
     return 0
   fi
-  if [[ "$A" == "Qwen3DecodeA5" && "$(printf '%s' "$target_arch" | tr '[:upper:]' '[:lower:]')" != "a5" ]]; then
-    local qwen_case
-    for qwen_case in "$dir"/*.pto; do
-      [[ -f "$qwen_case" ]] || continue
-      case "$qwen_case" in
+  if [[ ( "$A" == "Qwen3DecodeA5" || "$A" == "DeepseekV4DecodeA5" ) && "$(printf '%s' "$target_arch" | tr '[:upper:]' '[:lower:]')" != "a5" ]]; then
+    local direct_case
+    for direct_case in "$dir"/*.pto; do
+      [[ -f "$direct_case" ]] || continue
+      case "$direct_case" in
         *-pto-ir.pto) continue ;;
       esac
-      echo -e "${A}($(basename "$qwen_case"))\tSKIP\trequires --pto-arch=a5"
+      echo -e "${A}($(basename "$direct_case"))\tSKIP\trequires --pto-arch=a5"
     done
     return 0
   fi
-  if [[ "$A" == "Qwen3DecodeA5" && -n "${soc_lc}" && "${soc_lc}" != *"a5"* && "${soc_lc}" != *"950"* ]]; then
-    local qwen_case
-    for qwen_case in "$dir"/*.pto; do
-      [[ -f "$qwen_case" ]] || continue
-      case "$qwen_case" in
+  if [[ ( "$A" == "Qwen3DecodeA5" || "$A" == "DeepseekV4DecodeA5" ) && -n "${soc_lc}" && "${soc_lc}" != *"a5"* && "${soc_lc}" != *"950"* ]]; then
+    local direct_case
+    for direct_case in "$dir"/*.pto; do
+      [[ -f "$direct_case" ]] || continue
+      case "$direct_case" in
         *-pto-ir.pto) continue ;;
       esac
-      echo -e "${A}($(basename "$qwen_case"))\tSKIP\trequires A5 target SOC"
+      echo -e "${A}($(basename "$direct_case"))\tSKIP\trequires A5 target SOC"
     done
     return 0
   fi
@@ -1307,7 +1307,7 @@ PY
       ptobc_file="${out_subdir}/${base}.ptobc"
       decoded_pto="${out_subdir}/${base}-roundtrip.pto"
       cpp="${out_subdir}/${base}.cpp"
-      if [[ "$A" == "Qwen3DecodeA3" || "$A" == "Qwen3DecodeA5" ]]; then
+      if [[ "$A" == "Qwen3DecodeA3" || "$A" == "Qwen3DecodeA5" || "$A" == "DeepseekV4DecodeA3" || "$A" == "DeepseekV4DecodeA5" ]]; then
         cpp="${out_subdir}/${base}-pto.cpp"
       fi
       local sample_use_ptobc_roundtrip="$use_ptobc_roundtrip"