From 43f0a90e1b9b3869404898c47acfad762ef62592 Mon Sep 17 00:00:00 2001 From: Julien Schueller Date: Thu, 2 Jul 2026 09:29:32 +0200 Subject: [PATCH] Fix ZA tile slice indices in ssyrk SME direct kernel The kernel_2x2 function uses 4 ZA tiles (0-3) each with svl slices. Tiles 0/1 handle rows 0..svl-1 with slice indices 0..svl-1. Tiles 2/3 handle rows svl..2*svl-1, so their slice indices must start at 0, i.e. (i - svl) instead of i. Fix all three tile 2/3 access sites: - C load into ZA (svwrite_hor_za32_f32_m) - C writeback for UPPER (svst1_hor_za32) - C writeback for LOWER (svst1_hor_za32) Fixes #5873 --- kernel/arm64/ssyrk_direct_alpha_beta_arm64_sme1.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/arm64/ssyrk_direct_alpha_beta_arm64_sme1.c b/kernel/arm64/ssyrk_direct_alpha_beta_arm64_sme1.c index d1e6bcc92c..9322923ecb 100644 --- a/kernel/arm64/ssyrk_direct_alpha_beta_arm64_sme1.c +++ b/kernel/arm64/ssyrk_direct_alpha_beta_arm64_sme1.c @@ -79,11 +79,11 @@ kernel_2x2(const float *A, float *B, float *C, size_t shared_dim, for (size_t i = svl; i < block_rows; i++) { svfloat32_t row_c_0 = svld1(pg_c_0, &C[i * ldc]); row_c_0 = svmul_x(pg, beta_vec, row_c_0); - svwrite_hor_za32_f32_m(/*tile*/2, /*slice*/i, pg_c_0, row_c_0); + svwrite_hor_za32_f32_m(/*tile*/2, /*slice*/i - svl, pg_c_0, row_c_0); svfloat32_t row_c_1 = svld1(pg_c_1, &C[i * ldc + svl]); row_c_1 = svmul_x(pg, beta_vec, row_c_1); - svwrite_hor_za32_f32_m(/*tile*/3, /*slice*/i, pg_c_1, row_c_1); + svwrite_hor_za32_f32_m(/*tile*/3, /*slice*/i - svl, pg_c_1, row_c_1); } svfloat32_t alpha_vec = svdup_f32(alpha); @@ -143,8 +143,8 @@ kernel_2x2(const float *A, float *B, float *C, size_t shared_dim, pg_c_0 = svnot_b_z(pg_c_0_full, svwhilelt_b32_u64(0, last_invalid_index)); pg_c_1 = svnot_b_z(pg_c_1_full, svwhilelt_b32_u64(svl, last_invalid_index)); } - svst1_hor_za32(/*tile*/2, /*slice*/i, pg_c_0, &C[i * ldc]); - svst1_hor_za32(/*tile*/3, /*slice*/i, pg_c_1, &C[i * ldc + svl]); + svst1_hor_za32(/*tile*/2, /*slice*/i - svl, pg_c_0, &C[i * ldc]); + svst1_hor_za32(/*tile*/3, /*slice*/i - svl, pg_c_1, &C[i * ldc + svl]); } #else // Store to C from ZA @@ -158,8 +158,8 @@ kernel_2x2(const float *A, float *B, float *C, size_t shared_dim, for (size_t i = svl; i < block_rows; i++, valid_index++) { pg_c_0 = svwhilelt_b32_u64(0, MIN(valid_index, block_cols)); pg_c_1 = svwhilelt_b32_u64(svl, MIN(valid_index, block_cols)); - svst1_hor_za32(/*tile*/2, /*slice*/i, pg_c_0, &C[i * ldc]); - svst1_hor_za32(/*tile*/3, /*slice*/i, pg_c_1, &C[i * ldc + svl]); + svst1_hor_za32(/*tile*/2, /*slice*/i - svl, pg_c_0, &C[i * ldc]); + svst1_hor_za32(/*tile*/3, /*slice*/i - svl, pg_c_1, &C[i * ldc + svl]); } #endif }