From 667e09bd0e9ef7ea1546ec0356e58b4217a9c697 Mon Sep 17 00:00:00 2001
From: pl752
Date: Sun, 3 May 2026 20:13:49 +0500
Subject: [PATCH 1/5] RVV Q1_0 1x1 dot vla

---
 ggml/src/ggml-cpu/arch-fallback.h     |   1 -
 ggml/src/ggml-cpu/arch/riscv/quants.c | 105 ++++++++++++++++++++++++++
 2 files changed, 105 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cpu/arch-fallback.h b/ggml/src/ggml-cpu/arch-fallback.h
index 1758d83c261..7aeacfdd5b2 100644
--- a/ggml/src/ggml-cpu/arch-fallback.h
+++ b/ggml/src/ggml-cpu/arch-fallback.h
@@ -207,7 +207,6 @@
 #elif defined(__riscv)
 // quants.c
 #define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
-#define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0
 #define ggml_vec_dot_q2_0_q8_0_generic ggml_vec_dot_q2_0_q8_0
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x1_generic ggml_quantize_mat_q8_0_4x1
diff --git a/ggml/src/ggml-cpu/arch/riscv/quants.c b/ggml/src/ggml-cpu/arch/riscv/quants.c
index d3278d6489f..d665c0a34b2 100644
--- a/ggml/src/ggml-cpu/arch/riscv/quants.c
+++ b/ggml/src/ggml-cpu/arch/riscv/quants.c
@@ -14,6 +14,7 @@
 #include <float.h>
 #include <stdlib.h> // for qsort
 #include <stdio.h>  // for GGML_ASSERT
+#include <riscv_vector.h>
 
 #ifdef _MSC_VER
 #define NOINLINE __declspec(noinline)
@@ -480,6 +481,110 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
 #endif
 }
 
+alignas(32) static const uint8_t q1_byte_sel_32[32] = {
+    0, 0, 0, 0, 0, 0, 0, 0,
+    1, 1, 1, 1, 1, 1, 1, 1,
+    2, 2, 2, 2, 2, 2, 2, 2,
+    3, 3, 3, 3, 3, 3, 3, 3,
+};
+
+alignas(32) static const uint8_t q1_bit_mask_32[32] = {
+    1, 2, 4, 8, 16, 32, 64, 128,
+    1, 2, 4, 8, 16, 32, 64, 128,
+    1, 2, 4, 8, 16, 32, 64, 128,
+    1, 2, 4, 8, 16, 32, 64, 128,
+};
+
+void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    const int qk = QK1_0;
+    const int nb = n / qk;
+
+    assert(n % qk == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_q1_0 * GGML_RESTRICT x = vx;
+    const block_q8_0 * GGML_RESTRICT y = vy;
+
+#if defined(__riscv_v)
+    float sumf = 0;
+
+    for (int ib = 0; ib < nb; ++ib) {
+        const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
+
+        for (int k = 0; k < 4; ++k) {
+            const block_q8_0 * GGML_RESTRICT yb = &y[ib * 4 + k];
+            const float d1 = GGML_CPU_FP16_TO_FP32(yb->d);
+
+            const uint8_t * bits = &x[ib].qs[k * 4];
+
+            // Load 4 Q1 bytes once (bits array is only 4 bytes)
+            const size_t vl4 = __riscv_vsetvl_e8m1(4);
+            const vuint8m1_t qx4 = __riscv_vle8_v_u8m1(bits, vl4);
+
+            // Expand 4 Q1 bytes -> 32 sign bytes via vrgather + AND
+            size_t offset = 0;
+            int sumi = 0;
+
+            while (offset < 32) {
+                const size_t vl = __riscv_vsetvl_e8m1(32 - offset);
+
+                const vuint8m1_t sel =
+                    __riscv_vle8_v_u8m1(q1_byte_sel_32 + offset, vl);
+
+                const vuint8m1_t mask =
+                    __riscv_vle8_v_u8m1(q1_bit_mask_32 + offset, vl);
+
+                const vuint8m1_t qbyte =
+                    __riscv_vrgather_vv_u8m1(qx4, sel, vl);
+
+                const vuint8m1_t bit =
+                    __riscv_vand_vv_u8m1(qbyte, mask, vl);
+
+                // bit == 0 means negative, bit != 0 means positive
+                const vbool8_t is_zero =
+                    __riscv_vmseq_vx_u8m1_b8(bit, 0, vl);
+
+                const vint8m1_t qy =
+                    __riscv_vle8_v_i8m1(yb->qs + offset, vl);
+
+                // Equivalent to AVX2:
+                // sm = bit == 0 ? 0xFF : 0x00
+                // sy = (qy ^ sm) - sm
+                const vint8m1_t neg_qy =
+                    __riscv_vneg_v_i8m1(qy, vl);
+
+                const vint8m1_t sy =
+                    __riscv_vmerge_vvm_i8m1(qy, neg_qy, is_zero, vl);
+
+                const vint16m1_t zero =
+                    __riscv_vmv_v_x_i16m1(0, 1);
+
+                const vint16m1_t red =
+                    __riscv_vwredsum_vs_i8m1_i16m1(sy, zero, vl);
+
+                sumi += (int)__riscv_vmv_x_s_i16m1_i16(red);
+
+                offset += vl;
+            }
+
+            sumf += d0 * d1 * sumi;
+        }
+    }
+
+    *s = sumf;
+#else
+
+    UNUSED(nb);
+    UNUSED(x);
+    UNUSED(y);
+    ggml_vec_dot_q1_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
+}
+
 void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(nrc == 1);
     UNUSED(nrc);
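For reference while reading this series: the scalar computation the RVV kernels implement, sketched below. The layout details are inferred from the diffs rather than quoted from ggml. QK1_0 is assumed to be 128, i.e. one block_q1_0 (16 packed sign bytes plus an fp16 scale) covering four 32-element block_q8_0 sub-blocks, and the helper name is illustrative. A set bit keeps the int8 value, a clear bit negates it, and each 32-element group is weighted by both block scales:

static float q1_0_q8_0_dot_ref(int n, const block_q1_0 * x, const block_q8_0 * y) {
    float sumf = 0.0f;
    for (int ib = 0; ib < n / QK1_0; ++ib) {                  // QK1_0 == 128 assumed
        const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
        for (int k = 0; k < 4; ++k) {                         // 4 q8_0 sub-blocks per q1_0 block
            const block_q8_0 * yb = &y[ib * 4 + k];
            const uint8_t * bits = &x[ib].qs[k * 4];          // 32 sign bits = 4 bytes
            int sumi = 0;
            for (int j = 0; j < 32; ++j) {
                const int bit = (bits[j / 8] >> (j % 8)) & 1; // LSB-first within each byte
                sumi += bit ? yb->qs[j] : -yb->qs[j];         // bit == 0 means negative
            }
            sumf += d0 * GGML_CPU_FP16_TO_FP32(yb->d) * (float)sumi;
        }
    }
    return sumf;
}

The q1_byte_sel_32 and q1_bit_mask_32 tables above encode exactly this j / 8, 1 << (j % 8) indexing, one byte lane per sign bit.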
From 4c0300f03cc2e0d48c9a73ca561274e11c102461 Mon Sep 17 00:00:00 2001
From: pl752
Date: Sun, 3 May 2026 22:54:43 +0500
Subject: [PATCH 2/5] RVV Q1_0 1x1 dot fixed vl instead of vla

---
 ggml/src/ggml-cpu/arch/riscv/quants.c | 124 +++++++++++++++-----------
 1 file changed, 72 insertions(+), 52 deletions(-)

diff --git a/ggml/src/ggml-cpu/arch/riscv/quants.c b/ggml/src/ggml-cpu/arch/riscv/quants.c
index d665c0a34b2..8f59322158f 100644
--- a/ggml/src/ggml-cpu/arch/riscv/quants.c
+++ b/ggml/src/ggml-cpu/arch/riscv/quants.c
@@ -481,6 +481,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
 #endif
 }
 
+#if defined(__riscv_v)
 alignas(32) static const uint8_t q1_byte_sel_32[32] = {
     0, 0, 0, 0, 0, 0, 0, 0,
     1, 1, 1, 1, 1, 1, 1, 1,
@@ -495,92 +496,111 @@ alignas(32) static const uint8_t q1_bit_mask_32[32] = {
     1, 2, 4, 8, 16, 32, 64, 128,
     1, 2, 4, 8, 16, 32, 64, 128,
 };
 
-void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+static NOINLINE void ggml_vec_dot_q1_0_q8_0_vl256(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) {
     const int qk = QK1_0;
     const int nb = n / qk;
-
     assert(n % qk == 0);
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
 
     const block_q1_0 * GGML_RESTRICT x = vx;
     const block_q8_0 * GGML_RESTRICT y = vy;
 
-#if defined(__riscv_v)
+    //LMUL = 1, VLMAX = 256
+    const size_t vl32 = __riscv_vsetvl_e8m1(32);
+    assert(vl32 == 32);
+
+    const vuint8m1_t sel = __riscv_vle8_v_u8m1(q1_byte_sel_32, vl32);
+    const vuint8m1_t mask = __riscv_vle8_v_u8m1(q1_bit_mask_32, vl32);
+
+    const vint16m1_t zero = __riscv_vmv_v_x_i16m1(0, 1);
+
     float sumf = 0;
 
     for (int ib = 0; ib < nb; ++ib) {
         const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
 
+        float acc = 0;
+
         for (int k = 0; k < 4; ++k) {
             const block_q8_0 * GGML_RESTRICT yb = &y[ib * 4 + k];
-            const float d1 = GGML_CPU_FP16_TO_FP32(yb->d);
-
-            const uint8_t * bits = &x[ib].qs[k * 4];
-
-            // Load 4 Q1 bytes once (bits array is only 4 bytes)
-            const size_t vl4 = __riscv_vsetvl_e8m1(4);
-            const vuint8m1_t qx4 = __riscv_vle8_v_u8m1(bits, vl4);
+            const vuint8m1_t qx4 = __riscv_vle8_v_u8m1(x[ib].qs + 4 * k, 4);
+            const vuint8m1_t qbyte = __riscv_vrgather_vv_u8m1(qx4, sel, vl32);
+            const vuint8m1_t bit = __riscv_vand_vv_u8m1(qbyte, mask, vl32);
 
-            // Expand 4 Q1 bytes -> 32 sign bytes via vrgather + AND
-            size_t offset = 0;
-            int sumi = 0;
+            const vbool8_t is_zero = __riscv_vmseq_vx_u8m1_b8(bit, 0, vl32);
+            const vint8m1_t qy = __riscv_vle8_v_i8m1(yb->qs, vl32);
+            const vint8m1_t neg_qy = __riscv_vneg_v_i8m1(qy, vl32);
+            const vint8m1_t sy = __riscv_vmerge_vvm_i8m1(qy, neg_qy, is_zero, vl32);
 
-            while (offset < 32) {
-                const size_t vl = __riscv_vsetvl_e8m1(32 - offset);
-
-                const vuint8m1_t sel =
-                    __riscv_vle8_v_u8m1(q1_byte_sel_32 + offset, vl);
-
-                const vuint8m1_t mask =
-                    __riscv_vle8_v_u8m1(q1_bit_mask_32 + offset, vl);
-
-                const vuint8m1_t qbyte =
-                    __riscv_vrgather_vv_u8m1(qx4, sel, vl);
-
-                const vuint8m1_t bit =
-                    __riscv_vand_vv_u8m1(qbyte, mask, vl);
-
-                // bit == 0 means negative, bit != 0 means positive
-                const vbool8_t is_zero =
-                    __riscv_vmseq_vx_u8m1_b8(bit, 0, vl);
-
-                const vint8m1_t qy =
-                    __riscv_vle8_v_i8m1(yb->qs + offset, vl);
-
-                // Equivalent to AVX2:
-                // sm = bit == 0 ? 0xFF : 0x00
-                // sy = (qy ^ sm) - sm
-                const vint8m1_t neg_qy =
-                    __riscv_vneg_v_i8m1(qy, vl);
-
-                const vint8m1_t sy =
-                    __riscv_vmerge_vvm_i8m1(qy, neg_qy, is_zero, vl);
-
-                const vint16m1_t zero =
-                    __riscv_vmv_v_x_i16m1(0, 1);
-
-                const vint16m1_t red =
-                    __riscv_vwredsum_vs_i8m1_i16m1(sy, zero, vl);
-
-                sumi += (int)__riscv_vmv_x_s_i16m1_i16(red);
-
-                offset += vl;
-            }
+            const vint16m1_t red = __riscv_vwredsum_vs_i8m1_i16m1(sy, zero, vl32);
+            acc += GGML_CPU_FP16_TO_FP32(yb->d) * (float)__riscv_vmv_x_s_i16m1_i16(red);
+        }
 
-            sumf += d0 * d1 * sumi;
-        }
+        sumf += d0 * acc;
     }
 
     *s = sumf;
-#else
+}
 
-    UNUSED(nb);
-    UNUSED(x);
-    UNUSED(y);
+static NOINLINE void ggml_vec_dot_q1_0_q8_0_vl128(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) {
+    const int qk = QK1_0;
+    const int nb = n / qk;
+    assert(n % qk == 0);
+
+    const block_q1_0 * GGML_RESTRICT x = vx;
+    const block_q8_0 * GGML_RESTRICT y = vy;
+
+    //LMUL = 2, VLMAX = 256
+    const size_t vl32 = __riscv_vsetvl_e8m2(32);
+    assert(vl32 == 32);
+
+    const vuint8m2_t sel = __riscv_vle8_v_u8m2(q1_byte_sel_32, vl32);
+    const vuint8m2_t mask = __riscv_vle8_v_u8m2(q1_bit_mask_32, vl32);
+
+    const vint16m1_t zero = __riscv_vmv_v_x_i16m1(0, 1);
+
+    float sumf = 0;
+
+    for (int ib = 0; ib < nb; ++ib) {
+        const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
+
+        float acc = 0;
+
+        for (int k = 0; k < 4; ++k) {
+            const block_q8_0 * GGML_RESTRICT yb = &y[ib * 4 + k];
+            const vuint8m2_t qx4 = __riscv_vle8_v_u8m2(x[ib].qs + 4 * k, 4);
+            const vuint8m2_t qbyte = __riscv_vrgather_vv_u8m2(qx4, sel, vl32);
+            const vuint8m2_t bit = __riscv_vand_vv_u8m2(qbyte, mask, vl32);
+
+            const vbool4_t is_zero =__riscv_vmseq_vx_u8m2_b4(bit, 0, vl32);
+            const vint8m2_t qy = __riscv_vle8_v_i8m2(yb->qs, vl32);
+            const vint8m2_t neg_qy =__riscv_vneg_v_i8m2(qy, vl32);
+            const vint8m2_t sy = __riscv_vmerge_vvm_i8m2(qy, neg_qy, is_zero, vl32);
+
+            const vint16m1_t red = __riscv_vwredsum_vs_i8m2_i16m1(sy, zero, vl32);
+            acc += GGML_CPU_FP16_TO_FP32(yb->d) * (float)__riscv_vmv_x_s_i16m1_i16(red);
+        }
+
+        sumf += d0 * acc;
+    }
+
+    *s = sumf;
+}
+#endif
+
+void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+#if defined(__riscv_v)
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    if (__riscv_vlenb() * 8 >= 256) {
+        ggml_vec_dot_q1_0_q8_0_vl256(n, s, vx, vy);
+    } else {
+        ggml_vec_dot_q1_0_q8_0_vl128(n, s, vx, vy);
+    }
+#else
     ggml_vec_dot_q1_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
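A note on the expansion both fixed-vl kernels above share: lane j gathers source byte j / 8 through q1_byte_sel_32 and isolates bit j % 8 through q1_bit_mask_32, turning each packed sign bit into one byte lane. A minimal standalone sketch of the LMUL = 1 case, reusing the tables and intrinsics from the diffs (the helper name, and the vmsne polarity that yields the set-bit mask rather than is_zero, are illustrative):

static inline vbool8_t q1_expand_signs_m1(const uint8_t * bits) {
    const size_t vl = __riscv_vsetvl_e8m1(32);                       // wants VLEN >= 256 at LMUL = 1
    const vuint8m1_t sel   = __riscv_vle8_v_u8m1(q1_byte_sel_32, vl);
    const vuint8m1_t mask  = __riscv_vle8_v_u8m1(q1_bit_mask_32, vl);
    const vuint8m1_t qx4   = __riscv_vle8_v_u8m1(bits, 4);           // only 4 source bytes
    const vuint8m1_t qbyte = __riscv_vrgather_vv_u8m1(qx4, sel, vl); // lane j <- bits[j / 8]
    const vuint8m1_t bit   = __riscv_vand_vv_u8m1(qbyte, mask, vl);  // keep bit j % 8
    return __riscv_vmsne_vx_u8m1_b8(bit, 0, vl);                     // 1 where the sign bit is set
}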
From 9ec88056e29ba1a98bb890ec6b1b2627dbefce28 Mon Sep 17 00:00:00 2001
From: pl752
Date: Tue, 5 May 2026 11:31:00 +0500
Subject: [PATCH 3/5] Accounted for VLEN=64 even though min VLEN for V ext is 128

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 ggml/src/ggml-cpu/arch/riscv/quants.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/ggml/src/ggml-cpu/arch/riscv/quants.c b/ggml/src/ggml-cpu/arch/riscv/quants.c
index 8f59322158f..4d2d0aa4285 100644
--- a/ggml/src/ggml-cpu/arch/riscv/quants.c
+++ b/ggml/src/ggml-cpu/arch/riscv/quants.c
@@ -590,15 +590,15 @@ static NOINLINE void ggml_vec_dot_q1_0_q8_0_vl128(const int n, float * GGML_REST
 void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
 #if defined(__riscv_v)
     assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
 
-    if (__riscv_vlenb() * 8 >= 256) {
+    const size_t vlen_bits = __riscv_vlenb() * 8;
+
+    if (vlen_bits >= 256) {
         ggml_vec_dot_q1_0_q8_0_vl256(n, s, vx, vy);
-    } else {
+    } else if (vlen_bits >= 128) {
         ggml_vec_dot_q1_0_q8_0_vl128(n, s, vx, vy);
+    } else {
+        ggml_vec_dot_q1_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
     }
 #else
     ggml_vec_dot_q1_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
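One identity worth spelling out before the next patch, which drops this masking scheme entirely: the AVX2-style comment in the kernels, sy = (qy ^ sm) - sm, is conditional two's-complement negation. With sm = 0xFF the xor is a bitwise NOT and subtracting 0xFF (that is, -1) adds 1, giving exactly -qy; with sm = 0x00 both steps are identities. A one-line sketch (hypothetical helper, with the same qy = -128 overflow behavior as vneg):

// sm must be 0 (keep) or -1 (negate); e.g. qy = 5: (5 ^ -1) - (-1) = ~5 + 1 = -5
static inline int8_t q1_cond_negate(int8_t qy, int8_t sm) {
    return (int8_t)((qy ^ sm) - sm);
}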
From 76f6e63868ee243f8e213f527355600cee2728c3 Mon Sep 17 00:00:00 2001
From: pl752
Date: Wed, 6 May 2026 19:31:41 +0500
Subject: [PATCH 4/5] Replaced AVX2 like masking with vlm op

---
 ggml/src/ggml-cpu/arch/riscv/quants.c | 34 ++++-----------------------
 1 file changed, 4 insertions(+), 30 deletions(-)

diff --git a/ggml/src/ggml-cpu/arch/riscv/quants.c b/ggml/src/ggml-cpu/arch/riscv/quants.c
index 4d2d0aa4285..bb18ef1481d 100644
--- a/ggml/src/ggml-cpu/arch/riscv/quants.c
+++ b/ggml/src/ggml-cpu/arch/riscv/quants.c
@@ -482,20 +482,6 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
 }
 
 #if defined(__riscv_v)
-alignas(32) static const uint8_t q1_byte_sel_32[32] = {
-    0, 0, 0, 0, 0, 0, 0, 0,
-    1, 1, 1, 1, 1, 1, 1, 1,
-    2, 2, 2, 2, 2, 2, 2, 2,
-    3, 3, 3, 3, 3, 3, 3, 3,
-};
-
-alignas(32) static const uint8_t q1_bit_mask_32[32] = {
-    1, 2, 4, 8, 16, 32, 64, 128,
-    1, 2, 4, 8, 16, 32, 64, 128,
-    1, 2, 4, 8, 16, 32, 64, 128,
-    1, 2, 4, 8, 16, 32, 64, 128,
-};
-
 static NOINLINE void ggml_vec_dot_q1_0_q8_0_vl256(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) {
     const int qk = QK1_0;
     const int nb = n / qk;
@@ -508,9 +494,6 @@ static NOINLINE void ggml_vec_dot_q1_0_q8_0_vl256(const int n, float * GGML_REST
     const size_t vl32 = __riscv_vsetvl_e8m1(32);
     assert(vl32 == 32);
 
-    const vuint8m1_t sel = __riscv_vle8_v_u8m1(q1_byte_sel_32, vl32);
-    const vuint8m1_t mask = __riscv_vle8_v_u8m1(q1_bit_mask_32, vl32);
-
     const vint16m1_t zero = __riscv_vmv_v_x_i16m1(0, 1);
 
     float sumf = 0;
@@ -522,14 +505,11 @@ static NOINLINE void ggml_vec_dot_q1_0_q8_0_vl256(const int n, float * GGML_REST
 
         for (int k = 0; k < 4; ++k) {
             const block_q8_0 * GGML_RESTRICT yb = &y[ib * 4 + k];
-            const vuint8m1_t qx4 = __riscv_vle8_v_u8m1(x[ib].qs + 4 * k, 4);
-            const vuint8m1_t qbyte = __riscv_vrgather_vv_u8m1(qx4, sel, vl32);
-            const vuint8m1_t bit = __riscv_vand_vv_u8m1(qbyte, mask, vl32);
+            const vbool8_t is_not_zero = __riscv_vlm_v_b8(x[ib].qs + 4 * k, vl32);
 
-            const vbool8_t is_zero = __riscv_vmseq_vx_u8m1_b8(bit, 0, vl32);
             const vint8m1_t qy = __riscv_vle8_v_i8m1(yb->qs, vl32);
             const vint8m1_t neg_qy = __riscv_vneg_v_i8m1(qy, vl32);
-            const vint8m1_t sy = __riscv_vmerge_vvm_i8m1(qy, neg_qy, is_zero, vl32);
+            const vint8m1_t sy = __riscv_vmerge_vvm_i8m1(neg_qy, qy, is_not_zero, vl32);
 
             const vint16m1_t red = __riscv_vwredsum_vs_i8m1_i16m1(sy, zero, vl32);
             acc += GGML_CPU_FP16_TO_FP32(yb->d) * (float)__riscv_vmv_x_s_i16m1_i16(red);
@@ -553,9 +533,6 @@ static NOINLINE void ggml_vec_dot_q1_0_q8_0_vl128(const int n, float * GGML_REST
     const size_t vl32 = __riscv_vsetvl_e8m2(32);
     assert(vl32 == 32);
 
-    const vuint8m2_t sel = __riscv_vle8_v_u8m2(q1_byte_sel_32, vl32);
-    const vuint8m2_t mask = __riscv_vle8_v_u8m2(q1_bit_mask_32, vl32);
-
     const vint16m1_t zero = __riscv_vmv_v_x_i16m1(0, 1);
 
     float sumf = 0;
@@ -567,14 +544,11 @@ static NOINLINE void ggml_vec_dot_q1_0_q8_0_vl128(const int n, float * GGML_REST
 
         for (int k = 0; k < 4; ++k) {
             const block_q8_0 * GGML_RESTRICT yb = &y[ib * 4 + k];
-            const vuint8m2_t qx4 = __riscv_vle8_v_u8m2(x[ib].qs + 4 * k, 4);
-            const vuint8m2_t qbyte = __riscv_vrgather_vv_u8m2(qx4, sel, vl32);
-            const vuint8m2_t bit = __riscv_vand_vv_u8m2(qbyte, mask, vl32);
+            const vbool4_t is_not_zero = __riscv_vlm_v_b4(x[ib].qs + 4 * k, vl32);
 
-            const vbool4_t is_zero =__riscv_vmseq_vx_u8m2_b4(bit, 0, vl32);
             const vint8m2_t qy = __riscv_vle8_v_i8m2(yb->qs, vl32);
             const vint8m2_t neg_qy =__riscv_vneg_v_i8m2(qy, vl32);
-            const vint8m2_t sy = __riscv_vmerge_vvm_i8m2(qy, neg_qy, is_zero, vl32);
+            const vint8m2_t sy = __riscv_vmerge_vvm_i8m2(neg_qy, qy, is_not_zero, vl32);
 
             const vint16m1_t red = __riscv_vwredsum_vs_i8m2_i16m1(sy, zero, vl32);
             acc += GGML_CPU_FP16_TO_FP32(yb->d) * (float)__riscv_vmv_x_s_i16m1_i16(red);

From 113ed38df126cd21553dcc9a5f5e592d6276e7f1 Mon Sep 17 00:00:00 2001
From: pl752
Date: Wed, 6 May 2026 19:49:37 +0500
Subject: [PATCH 5/5] Corrected comments about vlmax

---
 ggml/src/ggml-cpu/arch/riscv/quants.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-cpu/arch/riscv/quants.c b/ggml/src/ggml-cpu/arch/riscv/quants.c
index bb18ef1481d..0d04cac3bd1 100644
--- a/ggml/src/ggml-cpu/arch/riscv/quants.c
+++ b/ggml/src/ggml-cpu/arch/riscv/quants.c
@@ -490,7 +490,7 @@ static NOINLINE void ggml_vec_dot_q1_0_q8_0_vl256(const int n, float * GGML_REST
     const block_q1_0 * GGML_RESTRICT x = vx;
     const block_q8_0 * GGML_RESTRICT y = vy;
 
-    //LMUL = 1, VLMAX = 256
+    //LMUL = 1, VLMAX = 32
     const size_t vl32 = __riscv_vsetvl_e8m1(32);
     assert(vl32 == 32);
 
@@ -529,7 +529,7 @@ static NOINLINE void ggml_vec_dot_q1_0_q8_0_vl128(const int n, float * GGML_REST
     const block_q1_0 * GGML_RESTRICT x = vx;
     const block_q8_0 * GGML_RESTRICT y = vy;
 
-    //LMUL = 2, VLMAX = 256
+    //LMUL = 2, VLMAX = 32
     const size_t vl32 = __riscv_vsetvl_e8m2(32);
     assert(vl32 == 32);
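The end state of the series in one self-contained piece: vlm.v reads packed mask bits LSB-first within each byte, the same order the q1_bit_mask_32 table encoded, so the 4 sign bytes of a sub-block can be loaded straight into a mask register and the whole gather/and/compare expansion disappears. A minimal sketch of the sign application on the LMUL = 1 path, assuming VLEN >= 256 so that 32 e8 lanes fit at m1 (the helper name is illustrative, not part of the patch):

static inline vint8m1_t q1_apply_signs_vl256(const uint8_t * bits, const int8_t * qy) {
    const size_t vl = __riscv_vsetvl_e8m1(32);
    const vbool8_t is_not_zero = __riscv_vlm_v_b8(bits, vl); // mask bit j = (bits[j / 8] >> (j % 8)) & 1
    const vint8m1_t y  = __riscv_vle8_v_i8m1(qy, vl);
    const vint8m1_t ny = __riscv_vneg_v_i8m1(y, vl);
    return __riscv_vmerge_vvm_i8m1(ny, y, is_not_zero, vl);  // bit == 1 keeps, bit == 0 negates
}

The widening reduction (vwredsum) and the per-block d0 * d1 scaling then proceed exactly as in the kernels above.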