From 667e09bd0e9ef7ea1546ec0356e58b4217a9c697 Mon Sep 17 00:00:00 2001
From: pl752
Date: Sun, 3 May 2026 20:13:49 +0500
Subject: [PATCH 1/5] RVV Q1_0 1x1 dot vla

---
 ggml/src/ggml-cpu/arch-fallback.h     |   1 -
 ggml/src/ggml-cpu/arch/riscv/quants.c | 105 ++++++++++++++++++++++++++
 2 files changed, 105 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cpu/arch-fallback.h b/ggml/src/ggml-cpu/arch-fallback.h
index 1758d83c261..7aeacfdd5b2 100644
--- a/ggml/src/ggml-cpu/arch-fallback.h
+++ b/ggml/src/ggml-cpu/arch-fallback.h
@@ -207,7 +207,6 @@
 #elif defined(__riscv)
 // quants.c
 #define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
-#define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0
 #define ggml_vec_dot_q2_0_q8_0_generic ggml_vec_dot_q2_0_q8_0
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x1_generic ggml_quantize_mat_q8_0_4x1
diff --git a/ggml/src/ggml-cpu/arch/riscv/quants.c b/ggml/src/ggml-cpu/arch/riscv/quants.c
index d3278d6489f..d665c0a34b2 100644
--- a/ggml/src/ggml-cpu/arch/riscv/quants.c
+++ b/ggml/src/ggml-cpu/arch/riscv/quants.c
@@ -14,6 +14,7 @@
 #include <float.h>
 #include <stdlib.h> // for qsort
 #include <stdio.h>  // for GGML_ASSERT
+#include <riscv_vector.h>
 
 #ifdef _MSC_VER
 #define NOINLINE __declspec(noinline)
@@ -480,6 +481,110 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
 #endif
 }
 
+alignas(32) static const uint8_t q1_byte_sel_32[32] = {
+    0, 0, 0, 0, 0, 0, 0, 0,
+    1, 1, 1, 1, 1, 1, 1, 1,
+    2, 2, 2, 2, 2, 2, 2, 2,
+    3, 3, 3, 3, 3, 3, 3, 3,
+};
+
+alignas(32) static const uint8_t q1_bit_mask_32[32] = {
+    1, 2, 4, 8, 16, 32, 64, 128,
+    1, 2, 4, 8, 16, 32, 64, 128,
+    1, 2, 4, 8, 16, 32, 64, 128,
+    1, 2, 4, 8, 16, 32, 64, 128,
+};
+
+void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    const int qk = QK1_0;
+    const int nb = n / qk;
+
+    assert(n % qk == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_q1_0 * GGML_RESTRICT x = vx;
+    const block_q8_0 * GGML_RESTRICT y = vy;
+
+#if defined(__riscv_v)
+    float sumf = 0;
+
+    for (int ib = 0; ib < nb; ++ib) {
+        const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
+
+        for (int k = 0; k < 4; ++k) {
+            const block_q8_0 * GGML_RESTRICT yb = &y[ib * 4 + k];
+            const float d1 = GGML_CPU_FP16_TO_FP32(yb->d);
+
+            const uint8_t * bits = &x[ib].qs[k * 4];
+
+            // Load 4 Q1 bytes once (bits array is only 4 bytes)
+            const size_t vl4 = __riscv_vsetvl_e8m1(4);
+            const vuint8m1_t qx4 = __riscv_vle8_v_u8m1(bits, vl4);
+
+            // Expand 4 Q1 bytes -> 32 sign bytes via vrgather + AND
+            size_t offset = 0;
+            int sumi = 0;
+
+            while (offset < 32) {
+                const size_t vl = __riscv_vsetvl_e8m1(32 - offset);
+
+                const vuint8m1_t sel =
+                    __riscv_vle8_v_u8m1(q1_byte_sel_32 + offset, vl);
+
+                const vuint8m1_t mask =
+                    __riscv_vle8_v_u8m1(q1_bit_mask_32 + offset, vl);
+
+                const vuint8m1_t qbyte =
+                    __riscv_vrgather_vv_u8m1(qx4, sel, vl);
+
+                const vuint8m1_t bit =
+                    __riscv_vand_vv_u8m1(qbyte, mask, vl);
+
+                // bit == 0 means negative, bit != 0 means positive
+                const vbool8_t is_zero =
+                    __riscv_vmseq_vx_u8m1_b8(bit, 0, vl);
+
+                const vint8m1_t qy =
+                    __riscv_vle8_v_i8m1(yb->qs + offset, vl);
+
+                // Equivalent to AVX2:
+                // sm = bit == 0 ? 0xFF : 0x00
+                // sy = (qy ^ sm) - sm
+                const vint8m1_t neg_qy =
+                    __riscv_vneg_v_i8m1(qy, vl);
+
+                const vint8m1_t sy =
+                    __riscv_vmerge_vvm_i8m1(qy, neg_qy, is_zero, vl);
+
+                const vint16m1_t zero =
+                    __riscv_vmv_v_x_i16m1(0, 1);
+
+                const vint16m1_t red =
+                    __riscv_vwredsum_vs_i8m1_i16m1(sy, zero, vl);
+
+                sumi += (int)__riscv_vmv_x_s_i16m1_i16(red);
+
+                offset += vl;
+            }
+
+            sumf += d0 * d1 * sumi;
+        }
+    }
+
+    *s = sumf;
+#else
+
+    UNUSED(nb);
+    UNUSED(x);
+    UNUSED(y);
+    ggml_vec_dot_q1_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
+}
+
 void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(nrc == 1);
     UNUSED(nrc);
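For reference while reading this series: the scalar computation the RVV kernels implement, sketched below. The layout details are inferred from the diffs rather than quoted from ggml. QK1_0 is assumed to be 128, i.e. one block_q1_0 (16 packed sign bytes plus an fp16 scale) covering four 32-element block_q8_0 sub-blocks, and the helper name is illustrative. A set bit keeps the int8 value, a clear bit negates it, and each 32-element group is weighted by both block scales:

static float q1_0_q8_0_dot_ref(int n, const block_q1_0 * x, const block_q8_0 * y) {
    float sumf = 0.0f;
    for (int ib = 0; ib < n / QK1_0; ++ib) {                  // QK1_0 == 128 assumed
        const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
        for (int k = 0; k < 4; ++k) {                         // 4 q8_0 sub-blocks per q1_0 block
            const block_q8_0 * yb = &y[ib * 4 + k];
            const uint8_t * bits = &x[ib].qs[k * 4];          // 32 sign bits = 4 bytes
            int sumi = 0;
            for (int j = 0; j < 32; ++j) {
                const int bit = (bits[j / 8] >> (j % 8)) & 1; // LSB-first within each byte
                sumi += bit ? yb->qs[j] : -yb->qs[j];         // bit == 0 means negative
            }
            sumf += d0 * GGML_CPU_FP16_TO_FP32(yb->d) * (float)sumi;
        }
    }
    return sumf;
}

The q1_byte_sel_32 and q1_bit_mask_32 tables above encode exactly this j / 8, 1 << (j % 8) indexing, one byte lane per sign bit.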
From 4c0300f03cc2e0d48c9a73ca561274e11c102461 Mon Sep 17 00:00:00 2001
From: pl752
Date: Sun, 3 May 2026 22:54:43 +0500
Subject: [PATCH 2/5] RVV Q1_0 1x1 dot fixed vl instead of vla

---
 ggml/src/ggml-cpu/arch/riscv/quants.c | 124 +++++++++++++++-----------
 1 file changed, 72 insertions(+), 52 deletions(-)

diff --git a/ggml/src/ggml-cpu/arch/riscv/quants.c b/ggml/src/ggml-cpu/arch/riscv/quants.c
index d665c0a34b2..8f59322158f 100644
--- a/ggml/src/ggml-cpu/arch/riscv/quants.c
+++ b/ggml/src/ggml-cpu/arch/riscv/quants.c
@@ -481,6 +481,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
 #endif
 }
 
+#if defined(__riscv_v)
 alignas(32) static const uint8_t q1_byte_sel_32[32] = {
     0, 0, 0, 0, 0, 0, 0, 0,
     1, 1, 1, 1, 1, 1, 1, 1,
@@ -495,92 +496,111 @@ alignas(32) static const uint8_t q1_bit_mask_32[32] = {
     1, 2, 4, 8, 16, 32, 64, 128,
     1, 2, 4, 8, 16, 32, 64, 128,
 };
 
-void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+static NOINLINE void ggml_vec_dot_q1_0_q8_0_vl256(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) {
     const int qk = QK1_0;
     const int nb = n / qk;
-
     assert(n % qk == 0);
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
 
     const block_q1_0 * GGML_RESTRICT x = vx;
     const block_q8_0 * GGML_RESTRICT y = vy;
 
-#if defined(__riscv_v)
+    //LMUL = 1, VLMAX = 256
+    const size_t vl32 = __riscv_vsetvl_e8m1(32);
+    assert(vl32 == 32);
+
+    const vuint8m1_t sel = __riscv_vle8_v_u8m1(q1_byte_sel_32, vl32);
+    const vuint8m1_t mask = __riscv_vle8_v_u8m1(q1_bit_mask_32, vl32);
+
+    const vint16m1_t zero = __riscv_vmv_v_x_i16m1(0, 1);
+
     float sumf = 0;
 
     for (int ib = 0; ib < nb; ++ib) {
         const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
 
+        float acc = 0;
+
         for (int k = 0; k < 4; ++k) {
             const block_q8_0 * GGML_RESTRICT yb = &y[ib * 4 + k];
-            const float d1 = GGML_CPU_FP16_TO_FP32(yb->d);
-
-            const uint8_t * bits = &x[ib].qs[k * 4];
-
-            // Load 4 Q1 bytes once (bits array is only 4 bytes)
-            const size_t vl4 = __riscv_vsetvl_e8m1(4);
-            const vuint8m1_t qx4 = __riscv_vle8_v_u8m1(bits, vl4);
+            const vuint8m1_t qx4 = __riscv_vle8_v_u8m1(x[ib].qs + 4 * k, 4);
+            const vuint8m1_t qbyte = __riscv_vrgather_vv_u8m1(qx4, sel, vl32);
+            const vuint8m1_t bit = __riscv_vand_vv_u8m1(qbyte, mask, vl32);
 
-            // Expand 4 Q1 bytes -> 32 sign bytes via vrgather + AND
-            size_t offset = 0;
-            int sumi = 0;
+            const vbool8_t is_zero = __riscv_vmseq_vx_u8m1_b8(bit, 0, vl32);
+            const vint8m1_t qy = __riscv_vle8_v_i8m1(yb->qs, vl32);
+            const vint8m1_t neg_qy = __riscv_vneg_v_i8m1(qy, vl32);
+            const vint8m1_t sy = __riscv_vmerge_vvm_i8m1(qy, neg_qy, is_zero, vl32);
 
-            while (offset < 32) {
-                const size_t vl = __riscv_vsetvl_e8m1(32 - offset);
-
-                const vuint8m1_t sel =
-                    __riscv_vle8_v_u8m1(q1_byte_sel_32 + offset, vl);
-
-                const vuint8m1_t mask =
-                    __riscv_vle8_v_u8m1(q1_bit_mask_32 + offset, vl);
-
-                const vuint8m1_t qbyte =
-                    __riscv_vrgather_vv_u8m1(qx4, sel, vl);
-
-                const vuint8m1_t bit =
-                    __riscv_vand_vv_u8m1(qbyte, mask, vl);
-
-                // bit == 0 means negative, bit != 0 means positive
-                const vbool8_t is_zero =
-                    __riscv_vmseq_vx_u8m1_b8(bit, 0, vl);
-
-                const vint8m1_t qy =
-                    __riscv_vle8_v_i8m1(yb->qs + offset, vl);
-
-                // Equivalent to AVX2:
-                // sm = bit == 0 ? 0xFF : 0x00
-                // sy = (qy ^ sm) - sm
-                const vint8m1_t neg_qy =
-                    __riscv_vneg_v_i8m1(qy, vl);
-
-                const vint8m1_t sy =
-                    __riscv_vmerge_vvm_i8m1(qy, neg_qy, is_zero, vl);
-
-                const vint16m1_t zero =
-                    __riscv_vmv_v_x_i16m1(0, 1);
-
-                const vint16m1_t red =
-                    __riscv_vwredsum_vs_i8m1_i16m1(sy, zero, vl);
-
-                sumi += (int)__riscv_vmv_x_s_i16m1_i16(red);
-
-                offset += vl;
-            }
+            const vint16m1_t red = __riscv_vwredsum_vs_i8m1_i16m1(sy, zero, vl32);
+            acc += GGML_CPU_FP16_TO_FP32(yb->d) * (float)__riscv_vmv_x_s_i16m1_i16(red);
+        }
 
-            sumf += d0 * d1 * sumi;
-        }
+        sumf += d0 * acc;
     }
 
     *s = sumf;
-#else
+}
 
-    UNUSED(nb);
-    UNUSED(x);
-    UNUSED(y);
+static NOINLINE void ggml_vec_dot_q1_0_q8_0_vl128(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) {
+    const int qk = QK1_0;
+    const int nb = n / qk;
+    assert(n % qk == 0);
+
+    const block_q1_0 * GGML_RESTRICT x = vx;
+    const block_q8_0 * GGML_RESTRICT y = vy;
+
+    //LMUL = 2, VLMAX = 256
+    const size_t vl32 = __riscv_vsetvl_e8m2(32);
+    assert(vl32 == 32);
+
+    const vuint8m2_t sel = __riscv_vle8_v_u8m2(q1_byte_sel_32, vl32);
+    const vuint8m2_t mask = __riscv_vle8_v_u8m2(q1_bit_mask_32, vl32);
+
+    const vint16m1_t zero = __riscv_vmv_v_x_i16m1(0, 1);
+
+    float sumf = 0;
+
+    for (int ib = 0; ib < nb; ++ib) {
+        const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
+
+        float acc = 0;
+
+        for (int k = 0; k < 4; ++k) {
+            const block_q8_0 * GGML_RESTRICT yb = &y[ib * 4 + k];
+            const vuint8m2_t qx4 = __riscv_vle8_v_u8m2(x[ib].qs + 4 * k, 4);
+            const vuint8m2_t qbyte = __riscv_vrgather_vv_u8m2(qx4, sel, vl32);
+            const vuint8m2_t bit = __riscv_vand_vv_u8m2(qbyte, mask, vl32);
+
+            const vbool4_t is_zero =__riscv_vmseq_vx_u8m2_b4(bit, 0, vl32);
+            const vint8m2_t qy = __riscv_vle8_v_i8m2(yb->qs, vl32);
+            const vint8m2_t neg_qy =__riscv_vneg_v_i8m2(qy, vl32);
+            const vint8m2_t sy = __riscv_vmerge_vvm_i8m2(qy, neg_qy, is_zero, vl32);
+
+            const vint16m1_t red = __riscv_vwredsum_vs_i8m2_i16m1(sy, zero, vl32);
+            acc += GGML_CPU_FP16_TO_FP32(yb->d) * (float)__riscv_vmv_x_s_i16m1_i16(red);
+        }
+
+        sumf += d0 * acc;
+    }
+
+    *s = sumf;
+}
+#endif
+
+void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+#if defined(__riscv_v)
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    if (__riscv_vlenb() * 8 >= 256) {
+        ggml_vec_dot_q1_0_q8_0_vl256(n, s, vx, vy);
+    } else {
+        ggml_vec_dot_q1_0_q8_0_vl128(n, s, vx, vy);
+    }
+#else
     ggml_vec_dot_q1_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
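A note on the expansion both fixed-vl kernels above share: lane j gathers source byte j / 8 through q1_byte_sel_32 and isolates bit j % 8 through q1_bit_mask_32, turning each packed sign bit into one byte lane. A minimal standalone sketch of the LMUL = 1 case, reusing the tables and intrinsics from the diffs (the helper name, and the vmsne polarity that yields the set-bit mask rather than is_zero, are illustrative):

static inline vbool8_t q1_expand_signs_m1(const uint8_t * bits) {
    const size_t vl = __riscv_vsetvl_e8m1(32);                       // wants VLEN >= 256 at LMUL = 1
    const vuint8m1_t sel   = __riscv_vle8_v_u8m1(q1_byte_sel_32, vl);
    const vuint8m1_t mask  = __riscv_vle8_v_u8m1(q1_bit_mask_32, vl);
    const vuint8m1_t qx4   = __riscv_vle8_v_u8m1(bits, 4);           // only 4 source bytes
    const vuint8m1_t qbyte = __riscv_vrgather_vv_u8m1(qx4, sel, vl); // lane j <- bits[j / 8]
    const vuint8m1_t bit   = __riscv_vand_vv_u8m1(qbyte, mask, vl);  // keep bit j % 8
    return __riscv_vmsne_vx_u8m1_b8(bit, 0, vl);                     // 1 where the sign bit is set
}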
From 9ec88056e29ba1a98bb890ec6b1b2627dbefce28 Mon Sep 17 00:00:00 2001
From: pl752
Date: Tue, 5 May 2026 11:31:00 +0500
Subject: [PATCH 3/5] Accounted for VLEN=64 even though min VLEN for V ext is 128

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 ggml/src/ggml-cpu/arch/riscv/quants.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/ggml/src/ggml-cpu/arch/riscv/quants.c b/ggml/src/ggml-cpu/arch/riscv/quants.c
index 8f59322158f..4d2d0aa4285 100644
--- a/ggml/src/ggml-cpu/arch/riscv/quants.c
+++ b/ggml/src/ggml-cpu/arch/riscv/quants.c
@@ -590,15 +590,15 @@ static NOINLINE void ggml_vec_dot_q1_0_q8_0_vl128(const int n, float * GGML_REST
 void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
 #if defined(__riscv_v)
     assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
 
-    if (__riscv_vlenb() * 8 >= 256) {
+    const size_t vlen_bits = __riscv_vlenb() * 8;
+
+    if (vlen_bits >= 256) {
         ggml_vec_dot_q1_0_q8_0_vl256(n, s, vx, vy);
-    } else {
+    } else if (vlen_bits >= 128) {
         ggml_vec_dot_q1_0_q8_0_vl128(n, s, vx, vy);
+    } else {
+        ggml_vec_dot_q1_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
     }
 #else
     ggml_vec_dot_q1_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
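One identity worth spelling out before the next patch, which drops this masking scheme entirely: the AVX2-style comment in the kernels, sy = (qy ^ sm) - sm, is conditional two's-complement negation. With sm = 0xFF the xor is a bitwise NOT and subtracting 0xFF (that is, -1) adds 1, giving exactly -qy; with sm = 0x00 both steps are identities. A one-line sketch (hypothetical helper, with the same qy = -128 overflow behavior as vneg):

// sm must be 0 (keep) or -1 (negate); e.g. qy = 5: (5 ^ -1) - (-1) = ~5 + 1 = -5
static inline int8_t q1_cond_negate(int8_t qy, int8_t sm) {
    return (int8_t)((qy ^ sm) - sm);
}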
From 76f6e63868ee243f8e213f527355600cee2728c3 Mon Sep 17 00:00:00 2001
From: pl752
Date: Wed, 6 May 2026 19:31:41 +0500
Subject: [PATCH 4/5] Replaced AVX2 like masking with vlm op

---
 ggml/src/ggml-cpu/arch/riscv/quants.c | 34 ++++-----------------------
 1 file changed, 4 insertions(+), 30 deletions(-)

diff --git a/ggml/src/ggml-cpu/arch/riscv/quants.c b/ggml/src/ggml-cpu/arch/riscv/quants.c
index 4d2d0aa4285..bb18ef1481d 100644
--- a/ggml/src/ggml-cpu/arch/riscv/quants.c
+++ b/ggml/src/ggml-cpu/arch/riscv/quants.c
@@ -482,20 +482,6 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
 }
 
 #if defined(__riscv_v)
-alignas(32) static const uint8_t q1_byte_sel_32[32] = {
-    0, 0, 0, 0, 0, 0, 0, 0,
-    1, 1, 1, 1, 1, 1, 1, 1,
-    2, 2, 2, 2, 2, 2, 2, 2,
-    3, 3, 3, 3, 3, 3, 3, 3,
-};
-
-alignas(32) static const uint8_t q1_bit_mask_32[32] = {
-    1, 2, 4, 8, 16, 32, 64, 128,
-    1, 2, 4, 8, 16, 32, 64, 128,
-    1, 2, 4, 8, 16, 32, 64, 128,
-    1, 2, 4, 8, 16, 32, 64, 128,
-};
-
 static NOINLINE void ggml_vec_dot_q1_0_q8_0_vl256(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) {
     const int qk = QK1_0;
     const int nb = n / qk;
@@ -508,9 +494,6 @@ static NOINLINE void ggml_vec_dot_q1_0_q8_0_vl256(const int n, float * GGML_REST
     const size_t vl32 = __riscv_vsetvl_e8m1(32);
     assert(vl32 == 32);
 
-    const vuint8m1_t sel = __riscv_vle8_v_u8m1(q1_byte_sel_32, vl32);
-    const vuint8m1_t mask = __riscv_vle8_v_u8m1(q1_bit_mask_32, vl32);
-
     const vint16m1_t zero = __riscv_vmv_v_x_i16m1(0, 1);
 
     float sumf = 0;
@@ -522,14 +505,11 @@ static NOINLINE void ggml_vec_dot_q1_0_q8_0_vl256(const int n, float * GGML_REST
 
         for (int k = 0; k < 4; ++k) {
             const block_q8_0 * GGML_RESTRICT yb = &y[ib * 4 + k];
-            const vuint8m1_t qx4 = __riscv_vle8_v_u8m1(x[ib].qs + 4 * k, 4);
-            const vuint8m1_t qbyte = __riscv_vrgather_vv_u8m1(qx4, sel, vl32);
-            const vuint8m1_t bit = __riscv_vand_vv_u8m1(qbyte, mask, vl32);
+            const vbool8_t is_not_zero = __riscv_vlm_v_b8(x[ib].qs + 4 * k, vl32);
 
-            const vbool8_t is_zero = __riscv_vmseq_vx_u8m1_b8(bit, 0, vl32);
             const vint8m1_t qy = __riscv_vle8_v_i8m1(yb->qs, vl32);
             const vint8m1_t neg_qy = __riscv_vneg_v_i8m1(qy, vl32);
-            const vint8m1_t sy = __riscv_vmerge_vvm_i8m1(qy, neg_qy, is_zero, vl32);
+            const vint8m1_t sy = __riscv_vmerge_vvm_i8m1(neg_qy, qy, is_not_zero, vl32);
 
             const vint16m1_t red = __riscv_vwredsum_vs_i8m1_i16m1(sy, zero, vl32);
             acc += GGML_CPU_FP16_TO_FP32(yb->d) * (float)__riscv_vmv_x_s_i16m1_i16(red);
@@ -553,9 +533,6 @@ static NOINLINE void ggml_vec_dot_q1_0_q8_0_vl128(const int n, float * GGML_REST
     const size_t vl32 = __riscv_vsetvl_e8m2(32);
     assert(vl32 == 32);
 
-    const vuint8m2_t sel = __riscv_vle8_v_u8m2(q1_byte_sel_32, vl32);
-    const vuint8m2_t mask = __riscv_vle8_v_u8m2(q1_bit_mask_32, vl32);
-
     const vint16m1_t zero = __riscv_vmv_v_x_i16m1(0, 1);
 
     float sumf = 0;
@@ -567,14 +544,11 @@ static NOINLINE void ggml_vec_dot_q1_0_q8_0_vl128(const int n, float * GGML_REST
 
         for (int k = 0; k < 4; ++k) {
             const block_q8_0 * GGML_RESTRICT yb = &y[ib * 4 + k];
-            const vuint8m2_t qx4 = __riscv_vle8_v_u8m2(x[ib].qs + 4 * k, 4);
-            const vuint8m2_t qbyte = __riscv_vrgather_vv_u8m2(qx4, sel, vl32);
-            const vuint8m2_t bit = __riscv_vand_vv_u8m2(qbyte, mask, vl32);
+            const vbool4_t is_not_zero = __riscv_vlm_v_b4(x[ib].qs + 4 * k, vl32);
 
-            const vbool4_t is_zero =__riscv_vmseq_vx_u8m2_b4(bit, 0, vl32);
             const vint8m2_t qy = __riscv_vle8_v_i8m2(yb->qs, vl32);
             const vint8m2_t neg_qy =__riscv_vneg_v_i8m2(qy, vl32);
-            const vint8m2_t sy = __riscv_vmerge_vvm_i8m2(qy, neg_qy, is_zero, vl32);
+            const vint8m2_t sy = __riscv_vmerge_vvm_i8m2(neg_qy, qy, is_not_zero, vl32);
 
             const vint16m1_t red = __riscv_vwredsum_vs_i8m2_i16m1(sy, zero, vl32);
             acc += GGML_CPU_FP16_TO_FP32(yb->d) * (float)__riscv_vmv_x_s_i16m1_i16(red);

From 113ed38df126cd21553dcc9a5f5e592d6276e7f1 Mon Sep 17 00:00:00 2001
From: pl752
Date: Wed, 6 May 2026 19:49:37 +0500
Subject: [PATCH 5/5] Corrected comments about vlmax

---
 ggml/src/ggml-cpu/arch/riscv/quants.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-cpu/arch/riscv/quants.c b/ggml/src/ggml-cpu/arch/riscv/quants.c
index bb18ef1481d..0d04cac3bd1 100644
--- a/ggml/src/ggml-cpu/arch/riscv/quants.c
+++ b/ggml/src/ggml-cpu/arch/riscv/quants.c
@@ -490,7 +490,7 @@ static NOINLINE void ggml_vec_dot_q1_0_q8_0_vl256(const int n, float * GGML_REST
     const block_q1_0 * GGML_RESTRICT x = vx;
     const block_q8_0 * GGML_RESTRICT y = vy;
 
-    //LMUL = 1, VLMAX = 256
+    //LMUL = 1, VLMAX = 32
     const size_t vl32 = __riscv_vsetvl_e8m1(32);
     assert(vl32 == 32);
 
@@ -529,7 +529,7 @@ static NOINLINE void ggml_vec_dot_q1_0_q8_0_vl128(const int n, float * GGML_REST
     const block_q1_0 * GGML_RESTRICT x = vx;
     const block_q8_0 * GGML_RESTRICT y = vy;
 
-    //LMUL = 2, VLMAX = 256
+    //LMUL = 2, VLMAX = 32
     const size_t vl32 = __riscv_vsetvl_e8m2(32);
     assert(vl32 == 32);
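The end state of the series in one self-contained piece: vlm.v reads packed mask bits LSB-first within each byte, the same order the q1_bit_mask_32 table encoded, so the 4 sign bytes of a sub-block can be loaded straight into a mask register and the whole gather/and/compare expansion disappears. A minimal sketch of the sign application on the LMUL = 1 path, assuming VLEN >= 256 so that 32 e8 lanes fit at m1 (the helper name is illustrative, not part of the patch):

static inline vint8m1_t q1_apply_signs_vl256(const uint8_t * bits, const int8_t * qy) {
    const size_t vl = __riscv_vsetvl_e8m1(32);
    const vbool8_t is_not_zero = __riscv_vlm_v_b8(bits, vl); // mask bit j = (bits[j / 8] >> (j % 8)) & 1
    const vint8m1_t y  = __riscv_vle8_v_i8m1(qy, vl);
    const vint8m1_t ny = __riscv_vneg_v_i8m1(y, vl);
    return __riscv_vmerge_vvm_i8m1(ny, y, is_not_zero, vl);  // bit == 1 keeps, bit == 0 negates
}

The widening reduction (vwredsum) and the per-block d0 * d1 scaling then proceed exactly as in the kernels above.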