diff --git a/common/chat-auto-parser-generator.cpp b/common/chat-auto-parser-generator.cpp
index 60b269c42de..95dcd6ffa0c 100644
--- a/common/chat-auto-parser-generator.cpp
+++ b/common/chat-auto-parser-generator.cpp
@@ -216,7 +216,11 @@ common_peg_arena autoparser::build_parser(const generation_params & inputs) cons
} else {
parser = content.build_parser(ctx);
}
- return pure_content ? p.prefix(inputs.generation_prompt, reasoning.start) + parser : p.prefix(inputs.generation_prompt, reasoning.start) << parser;
+ // When thinking is off the generation prompt ends with the suppression block (e.g. "\n\n").
+ // Use the full string as a literal prefix so the suppression block isn't matched by the optional
+ // reasoning parser, which would otherwise emit spurious reasoning_content.
+ const std::string prefix_delim = inputs.enable_thinking ? reasoning.start : std::string();
+ return pure_content ? p.prefix(inputs.generation_prompt, prefix_delim) + parser : p.prefix(inputs.generation_prompt, prefix_delim) << parser;
});
}
diff --git a/ggml/src/ggml-sycl/convert.cpp b/ggml/src/ggml-sycl/convert.cpp
index d7f60cbc9ea..244de8d875f 100644
--- a/ggml/src/ggml-sycl/convert.cpp
+++ b/ggml/src/ggml-sycl/convert.cpp
@@ -598,6 +598,52 @@ static void convert_unary_sycl(const void * vx, dst_t * y, const int64_t k, dpct
}
+
+// =========================================================================
+// Q1_0 and Q1_0_g128 SYCL dequantize functions
+// block_q1_0:      { ggml_half d; uint8_t qs[4];  }  32 elements @ 1 bit
+// block_q1_0_g128: { ggml_half d; uint8_t qs[16]; } 128 elements @ 1 bit
+// Bit=1 -> +d, Bit=0 -> -d  (bits are read LSB-first within each qs byte)
+// =========================================================================
+template <typename dst_t>
+static void dequantize_row_q1_0_sycl(const void * vx, dst_t * y, const int64_t k, dpct::queue_ptr stream) {
+    const int64_t num_threads = SYCL_DEQUANTIZE_BLOCK_SIZE;
+    const int64_t num_blocks  = (k + num_threads - 1) / num_threads;  // one work-item per element, rounded up
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, num_threads),
+                          sycl::range<3>(1, 1, num_threads)),
+        [=](sycl::nd_item<3> item_ct1) {
+            const int64_t i = item_ct1.get_group(2) * item_ct1.get_local_range(2) + item_ct1.get_local_id(2);
+            if (i >= k) return;  // guard the rounded-up tail
+            const block_q1_0 * x = (const block_q1_0 *) vx;
+            const int64_t ib  = i / QK1_0;
+            const int     bit = i % QK1_0;
+            const float   d   = (float)(x[ib].d);
+            y[i] = static_cast<dst_t>(((x[ib].qs[bit / 8] >> (bit % 8)) & 1) ? d : -d);
+        });
+}
+
+template <typename dst_t>
+static void dequantize_row_q1_0_g128_sycl(const void * vx, dst_t * y, const int64_t k, dpct::queue_ptr stream) {
+    const int64_t num_threads = SYCL_DEQUANTIZE_BLOCK_SIZE;
+    const int64_t num_blocks  = (k + num_threads - 1) / num_threads;  // one work-item per element, rounded up
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, num_threads),
+                          sycl::range<3>(1, 1, num_threads)),
+        [=](sycl::nd_item<3> item_ct1) {
+            const int64_t i = item_ct1.get_group(2) * item_ct1.get_local_range(2) + item_ct1.get_local_id(2);
+            if (i >= k) return;  // guard the rounded-up tail
+            const block_q1_0_g128 * x = (const block_q1_0_g128 *) vx;
+            const int64_t ib  = i / QK1_0_g128;
+            const int     bit = i % QK1_0_g128;
+            const float   d   = (float)(x[ib].d);
+            y[i] = static_cast<dst_t>(((x[ib].qs[bit / 8] >> (bit % 8)) & 1) ? d : -d);
+        });
+}
+// =========================================================================
+// end Q1_0 SYCL dequantize
+// =========================================================================
+
to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
switch (type) {
case GGML_TYPE_Q4_0:
@@ -661,6 +707,10 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
case GGML_TYPE_BF16:
return convert_unary_sycl;
#endif
+ case GGML_TYPE_Q1_0:
+ return dequantize_row_q1_0_sycl;
+ case GGML_TYPE_Q1_0_g128:
+ return dequantize_row_q1_0_g128_sycl;
default:
GGML_ABORT("fatal error: unsupport data type=%s\n", ggml_type_name(type));
return nullptr;
@@ -731,6 +781,10 @@ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
case GGML_TYPE_BF16:
return convert_unary_sycl;
#endif
+ case GGML_TYPE_Q1_0:
+ return dequantize_row_q1_0_sycl;
+ case GGML_TYPE_Q1_0_g128:
+ return dequantize_row_q1_0_g128_sycl;
default:
GGML_ABORT("fatal error: unsupport data type=%s\n", ggml_type_name(type));
return nullptr;
diff --git a/ggml/src/ggml-sycl/mmvq.cpp b/ggml/src/ggml-sycl/mmvq.cpp
index 5abc50fabfe..c587493a6fc 100644
--- a/ggml/src/ggml-sycl/mmvq.cpp
+++ b/ggml/src/ggml-sycl/mmvq.cpp
@@ -1058,6 +1058,37 @@ static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy,
}
}
+
+static void mul_mat_vec_q1_0_q8_1_sycl(const void * vx, const void * vy, float * dst,
+                                       const int ncols, const int nrows, dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK1_0 == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    stream->submit([&](sycl::handler & cgh) {
+        cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                         [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                             mul_mat_vec_q<QK1_0, QI1_0, block_q1_0, VDR_Q1_0_Q8_1_MMVQ, vec_dot_q1_0_q8_1>(
+                                 vx, vy, dst, ncols, nrows, item_ct1);
+                         });
+    });
+}
+
+static void mul_mat_vec_q1_0_g128_q8_1_sycl(const void * vx, const void * vy, float * dst,
+                                            const int ncols, const int nrows, dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK1_0_g128 == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    stream->submit([&](sycl::handler & cgh) {
+        cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                         [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                             mul_mat_vec_q<QK1_0_g128, QI1_0_g128, block_q1_0_g128, VDR_Q1_0_g128_Q8_1_MMVQ, vec_dot_q1_0_g128_q8_1>(
+                                 vx, vy, dst, ncols, nrows, item_ct1);
+                         });
+    });
+}
+
void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1,
ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low,
@@ -1165,6 +1196,12 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
         case GGML_TYPE_NVFP4:
             mul_mat_vec_nvfp4_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
             break;
+        case GGML_TYPE_Q1_0:
+            mul_mat_vec_q1_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q1_0_g128:
+            mul_mat_vec_q1_0_g128_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+            break;
default:
GGML_ABORT("fatal error: unsupport data type=%s\n", ggml_type_name(src0->type));
}
diff --git a/ggml/src/ggml-sycl/vecdotq.hpp b/ggml/src/ggml-sycl/vecdotq.hpp
index eab9850aed7..0b02f62d3de 100644
--- a/ggml/src/ggml-sycl/vecdotq.hpp
+++ b/ggml/src/ggml-sycl/vecdotq.hpp
@@ -1413,4 +1413,49 @@ vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
#endif
}
+
+#define VDR_Q1_0_Q8_1_MMVQ 1
+#define VDR_Q1_0_g128_Q8_1_MMVQ 1
+
+static __dpct_inline__ float vec_dot_q1_0_q8_1(
+    const void * __restrict__ vbq,
+    const block_q8_1 * __restrict__ bq8_1,
+    const int & iqs) {
+    // Q1_0: 32 weights per block, 1 Q8_1 block aligns per X block (iqs unused)
+    // result = d_q1 * d_q8 * sum(sign * q8_raw); only ds.x() (d) is needed since weights are +/-1
+    const block_q1_0 * bq = (const block_q1_0 *) vbq;
+    const float d_q1 = (float)(bq->d);
+    const sycl::float2 ds8 = bq8_1[0].ds.convert<float, sycl::rounding_mode::automatic>();
+    const float d_q8 = ds8.x();
+    const int8_t * q8 = bq8_1[0].qs;
+    float sum = 0.0f;
+    for (int bit = 0; bit < QK1_0; bit++) {
+        const float sign = ((bq->qs[bit / 8] >> (bit % 8)) & 1) ? 1.0f : -1.0f;
+        sum += sign * (float)q8[bit];
+    }
+    return d_q1 * d_q8 * sum;
+}
+
+static __dpct_inline__ float vec_dot_q1_0_g128_q8_1(
+    const void * __restrict__ vbq,
+    const block_q8_1 * __restrict__ bq8_1,
+    const int & iqs) {
+    // Q1_0_g128: 128 weights per block, 4 Q8_1 blocks align per X block
+    // iqs = 0..3 selects which Q8_1 block (32 activations each)
+    // result = d_q1 * d_q8[iqs] * sum(sign * q8_raw)
+    const block_q1_0_g128 * bq = (const block_q1_0_g128 *) vbq;
+    const float d_q1 = (float)(bq->d);
+    const sycl::float2 ds8 = bq8_1[iqs].ds.convert<float, sycl::rounding_mode::automatic>();
+    const float d_q8 = ds8.x();
+    const int8_t * q8 = bq8_1[iqs].qs;
+    const int base_bit = iqs * 32;  // bit offset of this Q8_1 group within the 128-bit qs[]
+    float sum = 0.0f;
+    for (int bit = 0; bit < 32; bit++) {
+        const int abs_bit = base_bit + bit;
+        const float sign = ((bq->qs[abs_bit / 8] >> (abs_bit % 8)) & 1) ? 1.0f : -1.0f;
+        sum += sign * (float)q8[bit];
+    }
+    return d_q1 * d_q8 * sum;
+}
+
#endif // GGML_SYCL_VECDOTQ_HPP