diff --git a/common/chat-auto-parser-generator.cpp b/common/chat-auto-parser-generator.cpp
index 60b269c42de..95dcd6ffa0c 100644
--- a/common/chat-auto-parser-generator.cpp
+++ b/common/chat-auto-parser-generator.cpp
@@ -216,7 +216,11 @@ common_peg_arena autoparser::build_parser(const generation_params & inputs) cons
         } else {
             parser = content.build_parser(ctx);
         }
-        return pure_content ? p.prefix(inputs.generation_prompt, reasoning.start) + parser : p.prefix(inputs.generation_prompt, reasoning.start) << parser;
+        // When thinking is off the generation prompt ends with the suppression block (e.g. "<think>\n\n</think>").
+        // Use the full string as a literal prefix so the suppression block isn't matched by the optional
+        // reasoning parser, which would otherwise emit spurious reasoning_content.
+        const std::string prefix_delim = inputs.enable_thinking ? reasoning.start : std::string();
+        return pure_content ? p.prefix(inputs.generation_prompt, prefix_delim) + parser : p.prefix(inputs.generation_prompt, prefix_delim) << parser;
     });
 }
 
diff --git a/ggml/src/ggml-sycl/convert.cpp b/ggml/src/ggml-sycl/convert.cpp
index d7f60cbc9ea..244de8d875f 100644
--- a/ggml/src/ggml-sycl/convert.cpp
+++ b/ggml/src/ggml-sycl/convert.cpp
@@ -598,6 +598,52 @@ static void convert_unary_sycl(const void * vx, dst_t * y, const int64_t k, dpct
 }
 
 
+
+// =========================================================================
+// Q1_0 and Q1_0_g128 SYCL dequantize functions
+// block_q1_0: { ggml_half d; uint8_t qs[4]; } 32 elements @ 1 bit
+// block_q1_0_g128: { ggml_half d; uint8_t qs[16]; } 128 elements @ 1 bit
+// Bit=1 -> +d, Bit=0 -> -d
+// =========================================================================
+template <typename dst_t>
+static void dequantize_row_q1_0_sycl(const void * vx, dst_t * y, const int64_t k, dpct::queue_ptr stream) {
+    const int64_t num_threads = SYCL_DEQUANTIZE_BLOCK_SIZE;
+    const int64_t num_blocks = (k + num_threads - 1) / num_threads;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, num_threads),
+                          sycl::range<3>(1, 1, num_threads)),
+        [=](sycl::nd_item<3> item_ct1) {
+            const int64_t i = item_ct1.get_group(2) * item_ct1.get_local_range(2) + item_ct1.get_local_id(2);
+            if (i >= k) return;
+            const block_q1_0 * x = (const block_q1_0 *) vx;
+            const int64_t ib = i / QK1_0;
+            const int bit = i % QK1_0;
+            const float d = (float)(x[ib].d);
+            y[i] = static_cast<dst_t>(((x[ib].qs[bit / 8] >> (bit % 8)) & 1) ? d : -d);
+        });
+}
+
+template <typename dst_t>
+static void dequantize_row_q1_0_g128_sycl(const void * vx, dst_t * y, const int64_t k, dpct::queue_ptr stream) {
+    const int64_t num_threads = SYCL_DEQUANTIZE_BLOCK_SIZE;
+    const int64_t num_blocks = (k + num_threads - 1) / num_threads;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, num_threads),
+                          sycl::range<3>(1, 1, num_threads)),
+        [=](sycl::nd_item<3> item_ct1) {
+            const int64_t i = item_ct1.get_group(2) * item_ct1.get_local_range(2) + item_ct1.get_local_id(2);
+            if (i >= k) return;
+            const block_q1_0_g128 * x = (const block_q1_0_g128 *) vx;
+            const int64_t ib = i / QK1_0_g128;
+            const int bit = i % QK1_0_g128;
+            const float d = (float)(x[ib].d);
+            y[i] = static_cast<dst_t>(((x[ib].qs[bit / 8] >> (bit % 8)) & 1) ? d : -d);
+        });
+}
+// =========================================================================
+// end Q1_0 SYCL dequantize
+// =========================================================================
+
 to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
     switch (type) {
         case GGML_TYPE_Q4_0:
@@ -661,6 +707,10 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
         case GGML_TYPE_BF16:
             return convert_unary_sycl<sycl::half>;
 #endif
+        case GGML_TYPE_Q1_0:
+            return dequantize_row_q1_0_sycl<sycl::half>;
+        case GGML_TYPE_Q1_0_g128:
+            return dequantize_row_q1_0_g128_sycl<sycl::half>;
         default:
             GGML_ABORT("fatal error: unsupport data type=%s\n", ggml_type_name(type));
             return nullptr;
@@ -731,6 +781,10 @@ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
         case GGML_TYPE_BF16:
             return convert_unary_sycl<float>;
 #endif
+        case GGML_TYPE_Q1_0:
+            return dequantize_row_q1_0_sycl<float>;
+        case GGML_TYPE_Q1_0_g128:
+            return dequantize_row_q1_0_g128_sycl<float>;
         default:
             GGML_ABORT("fatal error: unsupport data type=%s\n", ggml_type_name(type));
             return nullptr;
diff --git a/ggml/src/ggml-sycl/mmvq.cpp b/ggml/src/ggml-sycl/mmvq.cpp
index 5abc50fabfe..c587493a6fc 100644
--- a/ggml/src/ggml-sycl/mmvq.cpp
+++ b/ggml/src/ggml-sycl/mmvq.cpp
@@ -1058,6 +1058,37 @@ static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy,
     }
 }
+
+static void mul_mat_vec_q1_0_q8_1_sycl(const void * vx, const void * vy, float * dst,
+                                       const int ncols, const int nrows, dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK1_0 == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    stream->submit([&](sycl::handler & cgh) {
+        cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                         [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                             mul_mat_vec_q<QK1_0, QI1_0, block_q1_0, VDR_Q1_0_Q8_1_MMVQ, vec_dot_q1_0_q8_1>(
+                                 vx, vy, dst, ncols, nrows, item_ct1);
+                         });
+    });
+}
+
+static void mul_mat_vec_q1_0_g128_q8_1_sycl(const void * vx, const void * vy, float * dst,
+                                            const int ncols, const int nrows, dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK1_0_g128 == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    stream->submit([&](sycl::handler & cgh) {
+        cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                         [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                             mul_mat_vec_q<QK1_0_g128, QI1_0_g128, block_q1_0_g128, VDR_Q1_0_g128_Q8_1_MMVQ, vec_dot_q1_0_g128_q8_1>(
+                                 vx, vy, dst, ncols, nrows, item_ct1);
+                         });
+    });
+}
+
 void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tensor * src0,
                                 const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i,
                                 const float * src1_ddf_i, const char * src1_ddq_i, float * dst_dd_i,
                                 const int64_t row_low,
@@ -1165,6 +1196,13 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
         case GGML_TYPE_NVFP4:
             mul_mat_vec_nvfp4_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
             break;
+        case GGML_TYPE_Q1_0:
+            mul_mat_vec_q1_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q1_0_g128:
+            mul_mat_vec_q1_0_g128_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+            break;
+
         default:
             GGML_ABORT("fatal error: unsupport data type=%s\n", ggml_type_name(src0->type));
     }
diff --git a/ggml/src/ggml-sycl/vecdotq.hpp b/ggml/src/ggml-sycl/vecdotq.hpp
index eab9850aed7..0b02f62d3de 100644
--- a/ggml/src/ggml-sycl/vecdotq.hpp
+++ b/ggml/src/ggml-sycl/vecdotq.hpp
@@ -1413,4 +1413,49 @@ vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
 #endif
 }
+
+#define VDR_Q1_0_Q8_1_MMVQ 1
+#define VDR_Q1_0_g128_Q8_1_MMVQ 1
+
+static __dpct_inline__ float vec_dot_q1_0_q8_1(
+    const void * __restrict__ vbq,
+    const block_q8_1 * __restrict__ bq8_1,
+    const int & iqs) {
+    // Q1_0: 32 weights per block, 1 Q8_1 block aligns per X block
+    // result = d_q1 * d_q8 * sum(sign * q8_raw)
+    const block_q1_0 * bq = (const block_q1_0 *) vbq;
+    const float d_q1 = (float)(bq->d);
+    const sycl::float2 ds8 = bq8_1[0].ds.convert<float, sycl::rounding_mode::automatic>();
+    const float d_q8 = ds8.x();
+    const int8_t * q8 = bq8_1[0].qs;
+    float sum = 0.0f;
+    for (int bit = 0; bit < QK1_0; bit++) {
+        const float sign = ((bq->qs[bit / 8] >> (bit % 8)) & 1) ? 1.0f : -1.0f;
+        sum += sign * (float)q8[bit];
+    }
+    return d_q1 * d_q8 * sum;
+}
+
+static __dpct_inline__ float vec_dot_q1_0_g128_q8_1(
+    const void * __restrict__ vbq,
+    const block_q8_1 * __restrict__ bq8_1,
+    const int & iqs) {
+    // Q1_0_g128: 128 weights per block, 4 Q8_1 blocks align per X block
+    // iqs = 0..3 selects which Q8_1 block (32 activations each)
+    // result = d_q1 * d_q8[iqs] * sum(sign * q8_raw)
+    const block_q1_0_g128 * bq = (const block_q1_0_g128 *) vbq;
+    const float d_q1 = (float)(bq->d);
+    const sycl::float2 ds8 = bq8_1[iqs].ds.convert<float, sycl::rounding_mode::automatic>();
+    const float d_q8 = ds8.x();
+    const int8_t * q8 = bq8_1[iqs].qs;
+    const int base_bit = iqs * 32;
+    float sum = 0.0f;
+    for (int bit = 0; bit < 32; bit++) {
+        const int abs_bit = base_bit + bit;
+        const float sign = ((bq->qs[abs_bit / 8] >> (abs_bit % 8)) & 1) ? 1.0f : -1.0f;
+        sum += sign * (float)q8[bit];
+    }
+    return d_q1 * d_q8 * sum;
+}
+
 
 #endif // GGML_SYCL_VECDOTQ_HPP