Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion common/chat-auto-parser-generator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,11 @@ common_peg_arena autoparser::build_parser(const generation_params & inputs) cons
} else {
parser = content.build_parser(ctx);
}
return pure_content ? p.prefix(inputs.generation_prompt, reasoning.start) + parser : p.prefix(inputs.generation_prompt, reasoning.start) << parser;
// When thinking is off the generation prompt ends with the suppression block (e.g. "<think>\n\n</think>").
// Use the full string as a literal prefix so the suppression block isn't matched by the optional
// reasoning parser, which would otherwise emit spurious reasoning_content.
const std::string prefix_delim = inputs.enable_thinking ? reasoning.start : std::string();
return pure_content ? p.prefix(inputs.generation_prompt, prefix_delim) + parser : p.prefix(inputs.generation_prompt, prefix_delim) << parser;
});
}

Expand Down
54 changes: 54 additions & 0 deletions ggml/src/ggml-sycl/convert.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -598,6 +598,52 @@ static void convert_unary_sycl(const void * vx, dst_t * y, const int64_t k, dpct
}



// =========================================================================
// Q1_0 and Q1_0_g128 SYCL dequantize functions
// block_q1_0: { ggml_half d; uint8_t qs[4]; } 32 elements @ 1 bit
// block_q1_0_g128: { ggml_half d; uint8_t qs[16]; } 128 elements @ 1 bit
// Bit=1 -> +d, Bit=0 -> -d
// =========================================================================
template <typename dst_t>
static void dequantize_row_q1_0_sycl(const void * vx, dst_t * y, const int64_t k, dpct::queue_ptr stream) {
const int64_t num_threads = SYCL_DEQUANTIZE_BLOCK_SIZE;
const int64_t num_blocks = (k + num_threads - 1) / num_threads;
stream->parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, num_threads),
sycl::range<3>(1, 1, num_threads)),
[=](sycl::nd_item<3> item_ct1) {
const int64_t i = item_ct1.get_group(2) * item_ct1.get_local_range(2) + item_ct1.get_local_id(2);
if (i >= k) return;
const block_q1_0 * x = (const block_q1_0 *) vx;
const int64_t ib = i / QK1_0;
const int bit = i % QK1_0;
const float d = (float)(x[ib].d);
y[i] = static_cast<dst_t>(((x[ib].qs[bit / 8] >> (bit % 8)) & 1) ? d : -d);
});
}

template <typename dst_t>
static void dequantize_row_q1_0_g128_sycl(const void * vx, dst_t * y, const int64_t k, dpct::queue_ptr stream) {
const int64_t num_threads = SYCL_DEQUANTIZE_BLOCK_SIZE;
const int64_t num_blocks = (k + num_threads - 1) / num_threads;
stream->parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, num_threads),
sycl::range<3>(1, 1, num_threads)),
[=](sycl::nd_item<3> item_ct1) {
const int64_t i = item_ct1.get_group(2) * item_ct1.get_local_range(2) + item_ct1.get_local_id(2);
if (i >= k) return;
const block_q1_0_g128 * x = (const block_q1_0_g128 *) vx;
const int64_t ib = i / QK1_0_g128;
const int bit = i % QK1_0_g128;
const float d = (float)(x[ib].d);
y[i] = static_cast<dst_t>(((x[ib].qs[bit / 8] >> (bit % 8)) & 1) ? d : -d);
});
}
// =========================================================================
// end Q1_0 SYCL dequantize
// =========================================================================

to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
switch (type) {
case GGML_TYPE_Q4_0:
Expand Down Expand Up @@ -661,6 +707,10 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
case GGML_TYPE_BF16:
return convert_unary_sycl<sycl::ext::oneapi::bfloat16>;
#endif
case GGML_TYPE_Q1_0:
return dequantize_row_q1_0_sycl;
case GGML_TYPE_Q1_0_g128:
return dequantize_row_q1_0_g128_sycl;
default:
GGML_ABORT("fatal error: unsupport data type=%s\n", ggml_type_name(type));
return nullptr;
Expand Down Expand Up @@ -731,6 +781,10 @@ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
case GGML_TYPE_BF16:
return convert_unary_sycl<sycl::ext::oneapi::bfloat16>;
#endif
case GGML_TYPE_Q1_0:
return dequantize_row_q1_0_sycl;
case GGML_TYPE_Q1_0_g128:
return dequantize_row_q1_0_g128_sycl;
default:
GGML_ABORT("fatal error: unsupport data type=%s\n", ggml_type_name(type));
return nullptr;
Expand Down
38 changes: 38 additions & 0 deletions ggml/src/ggml-sycl/mmvq.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1058,6 +1058,37 @@ static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy,
}
}


static void mul_mat_vec_q1_0_q8_1_sycl(const void * vx, const void * vy, float * dst,
const int ncols, const int nrows, dpct::queue_ptr stream) {
GGML_ASSERT(ncols % QK1_0 == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
stream->submit([&](sycl::handler & cgh) {
cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_vec_q<QK1_0, QI1_0, block_q1_0, VDR_Q1_0_Q8_1_MMVQ, vec_dot_q1_0_q8_1>(
vx, vy, dst, ncols, nrows, item_ct1);
});
});
}

static void mul_mat_vec_q1_0_g128_q8_1_sycl(const void * vx, const void * vy, float * dst,
const int ncols, const int nrows, dpct::queue_ptr stream) {
GGML_ASSERT(ncols % QK1_0_g128 == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
stream->submit([&](sycl::handler & cgh) {
cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_vec_q<QK1_0_g128, QI1_0_g128, block_q1_0_g128, VDR_Q1_0_g128_Q8_1_MMVQ, vec_dot_q1_0_g128_q8_1>(
vx, vy, dst, ncols, nrows, item_ct1);
});
});
}

void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1,
ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low,
Expand Down Expand Up @@ -1165,6 +1196,13 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
case GGML_TYPE_NVFP4:
mul_mat_vec_nvfp4_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
break;
case GGML_TYPE_Q1_0:
mul_mat_vec_q1_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
break;
case GGML_TYPE_Q1_0_g128:
mul_mat_vec_q1_0_g128_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
break;

default:
GGML_ABORT("fatal error: unsupport data type=%s\n", ggml_type_name(src0->type));
}
Expand Down
45 changes: 45 additions & 0 deletions ggml/src/ggml-sycl/vecdotq.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1413,4 +1413,49 @@ vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
#endif
}


#define VDR_Q1_0_Q8_1_MMVQ 1
#define VDR_Q1_0_g128_Q8_1_MMVQ 1

static __dpct_inline__ float vec_dot_q1_0_q8_1(
    const void * __restrict__ vbq,
    const block_q8_1 * __restrict__ bq8_1,
    const int & iqs) {
    // Dot product of one Q1_0 block (32 sign bits, one fp16 scale) with the
    // matching Q8_1 block: accumulate +/- q8 values, then apply both scales.
    // A Q1_0 block lines up with exactly one Q8_1 block, so iqs is not needed here.
    const block_q1_0 * blk = (const block_q1_0 *) vbq;
    const sycl::float2 scales8 = bq8_1[0].ds.convert<float, sycl::rounding_mode::automatic>();
    const int8_t * q8 = bq8_1[0].qs;
    float acc = 0.0f;
    for (int b = 0; b < QK1_0; ++b) {
        // Bit set -> weight is +1, bit clear -> weight is -1.
        const float s = ((blk->qs[b / 8] >> (b % 8)) & 1) ? 1.0f : -1.0f;
        acc += s * (float) q8[b];
    }
    return (float)(blk->d) * scales8.x() * acc;
}

static __dpct_inline__ float vec_dot_q1_0_g128_q8_1(
    const void * __restrict__ vbq,
    const block_q8_1 * __restrict__ bq8_1,
    const int & iqs) {
    // Dot product of a 32-element slice of one Q1_0_g128 block (128 sign bits,
    // one fp16 scale) with the corresponding Q8_1 block. iqs in 0..3 picks which
    // of the four aligned Q8_1 blocks (and which 32-bit slice) to use.
    const block_q1_0_g128 * blk = (const block_q1_0_g128 *) vbq;
    const sycl::float2 scales8 = bq8_1[iqs].ds.convert<float, sycl::rounding_mode::automatic>();
    const int8_t * q8 = bq8_1[iqs].qs;
    const int first_bit = iqs * 32; // offset of this slice inside the 128-bit block
    float acc = 0.0f;
    for (int b = 0; b < 32; ++b) {
        const int bit_idx = first_bit + b;
        // Bit set -> weight is +1, bit clear -> weight is -1.
        const float s = ((blk->qs[bit_idx / 8] >> (bit_idx % 8)) & 1) ? 1.0f : -1.0f;
        acc += s * (float) q8[b];
    }
    return (float)(blk->d) * scales8.x() * acc;
}

#endif // GGML_SYCL_VECDOTQ_HPP