Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion common/chat-auto-parser-generator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,11 @@ common_peg_arena autoparser::build_parser(const generation_params & inputs) cons
} else {
parser = content.build_parser(ctx);
}
return pure_content ? p.prefix(inputs.generation_prompt, reasoning.start) + parser : p.prefix(inputs.generation_prompt, reasoning.start) << parser;
// When thinking is off the generation prompt ends with the suppression block (e.g. "<think>\n\n</think>").
// Use the full string as a literal prefix so the suppression block isn't matched by the optional
// reasoning parser, which would otherwise emit spurious reasoning_content.
const std::string prefix_delim = inputs.enable_thinking ? reasoning.start : std::string();
return pure_content ? p.prefix(inputs.generation_prompt, prefix_delim) + parser : p.prefix(inputs.generation_prompt, prefix_delim) << parser;
});
}

Expand Down
54 changes: 54 additions & 0 deletions ggml/src/ggml-sycl/convert.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -598,6 +598,52 @@ static void convert_unary_sycl(const void * vx, dst_t * y, const int64_t k, dpct
}



// =========================================================================
// Q1_0 and Q1_0_g128 SYCL dequantize functions
// block_q1_0: { ggml_half d; uint8_t qs[4]; } 32 elements @ 1 bit
// block_q1_0_g128: { ggml_half d; uint8_t qs[16]; } 128 elements @ 1 bit
// Bit=1 -> +d, Bit=0 -> -d
// =========================================================================
template <typename dst_t>
static void dequantize_row_q1_0_sycl(const void * vx, dst_t * y, const int64_t k, dpct::queue_ptr stream) {
const int64_t num_threads = SYCL_DEQUANTIZE_BLOCK_SIZE;
const int64_t num_blocks = (k + num_threads - 1) / num_threads;
stream->parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, num_threads),
sycl::range<3>(1, 1, num_threads)),
[=](sycl::nd_item<3> item_ct1) {
const int64_t i = item_ct1.get_group(2) * item_ct1.get_local_range(2) + item_ct1.get_local_id(2);
if (i >= k) return;
const block_q1_0 * x = (const block_q1_0 *) vx;
const int64_t ib = i / QK1_0;
const int bit = i % QK1_0;
const float d = (float)(x[ib].d);
y[i] = static_cast<dst_t>(((x[ib].qs[bit / 8] >> (bit % 8)) & 1) ? d : -d);
});
}

template <typename dst_t>
static void dequantize_row_q1_0_g128_sycl(const void * vx, dst_t * y, const int64_t k, dpct::queue_ptr stream) {
const int64_t num_threads = SYCL_DEQUANTIZE_BLOCK_SIZE;
const int64_t num_blocks = (k + num_threads - 1) / num_threads;
stream->parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, num_threads),
sycl::range<3>(1, 1, num_threads)),
[=](sycl::nd_item<3> item_ct1) {
const int64_t i = item_ct1.get_group(2) * item_ct1.get_local_range(2) + item_ct1.get_local_id(2);
if (i >= k) return;
const block_q1_0_g128 * x = (const block_q1_0_g128 *) vx;
const int64_t ib = i / QK1_0_g128;
const int bit = i % QK1_0_g128;
const float d = (float)(x[ib].d);
y[i] = static_cast<dst_t>(((x[ib].qs[bit / 8] >> (bit % 8)) & 1) ? d : -d);
});
}
// =========================================================================
// end Q1_0 SYCL dequantize
// =========================================================================

to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
switch (type) {
case GGML_TYPE_Q4_0:
Expand Down Expand Up @@ -661,6 +707,10 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
case GGML_TYPE_BF16:
return convert_unary_sycl<sycl::ext::oneapi::bfloat16>;
#endif
case GGML_TYPE_Q1_0:
return dequantize_row_q1_0_sycl;
case GGML_TYPE_Q1_0_g128:
return dequantize_row_q1_0_g128_sycl;
default:
GGML_ABORT("fatal error: unsupport data type=%s\n", ggml_type_name(type));
return nullptr;
Expand Down Expand Up @@ -731,6 +781,10 @@ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
case GGML_TYPE_BF16:
return convert_unary_sycl<sycl::ext::oneapi::bfloat16>;
#endif
case GGML_TYPE_Q1_0:
return dequantize_row_q1_0_sycl;
case GGML_TYPE_Q1_0_g128:
return dequantize_row_q1_0_g128_sycl;
default:
GGML_ABORT("fatal error: unsupport data type=%s\n", ggml_type_name(type));
return nullptr;
Expand Down
38 changes: 38 additions & 0 deletions ggml/src/ggml-sycl/mmvq.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1058,6 +1058,37 @@ static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy,
}
}


static void mul_mat_vec_q1_0_q8_1_sycl(const void * vx, const void * vy, float * dst,
const int ncols, const int nrows, dpct::queue_ptr stream) {
GGML_ASSERT(ncols % QK1_0 == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
stream->submit([&](sycl::handler & cgh) {
cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_vec_q<QK1_0, QI1_0, block_q1_0, VDR_Q1_0_Q8_1_MMVQ, vec_dot_q1_0_q8_1>(
vx, vy, dst, ncols, nrows, item_ct1);
});
});
}

static void mul_mat_vec_q1_0_g128_q8_1_sycl(const void * vx, const void * vy, float * dst,
const int ncols, const int nrows, dpct::queue_ptr stream) {
GGML_ASSERT(ncols % QK1_0_g128 == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
stream->submit([&](sycl::handler & cgh) {
cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_vec_q<QK1_0_g128, QI1_0_g128, block_q1_0_g128, VDR_Q1_0_g128_Q8_1_MMVQ, vec_dot_q1_0_g128_q8_1>(
vx, vy, dst, ncols, nrows, item_ct1);
});
});
}

void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1,
ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low,
Expand Down Expand Up @@ -1165,6 +1196,13 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
case GGML_TYPE_NVFP4:
mul_mat_vec_nvfp4_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
break;
case GGML_TYPE_Q1_0:
mul_mat_vec_q1_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
break;
case GGML_TYPE_Q1_0_g128:
mul_mat_vec_q1_0_g128_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
break;

default:
GGML_ABORT("fatal error: unsupport data type=%s\n", ggml_type_name(src0->type));
}
Expand Down
45 changes: 45 additions & 0 deletions ggml/src/ggml-sycl/vecdotq.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1413,4 +1413,49 @@ vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
#endif
}


#define VDR_Q1_0_Q8_1_MMVQ 1
#define VDR_Q1_0_g128_Q8_1_MMVQ 1

static __dpct_inline__ float vec_dot_q1_0_q8_1(
    const void * __restrict__ vbq,
    const block_q8_1 * __restrict__ bq8_1,
    const int & iqs) {
    // Dot product of one Q1_0 block (32 sign bits, one fp16 scale) with the
    // matching Q8_1 block: accumulate +/- q8 values, then apply both scales.
    // A Q1_0 block lines up with exactly one Q8_1 block, so iqs is not needed here.
    const block_q1_0 * blk = (const block_q1_0 *) vbq;
    const sycl::float2 scales8 = bq8_1[0].ds.convert<float, sycl::rounding_mode::automatic>();
    const int8_t * q8 = bq8_1[0].qs;
    float acc = 0.0f;
    for (int b = 0; b < QK1_0; ++b) {
        // Bit set -> weight is +1, bit clear -> weight is -1.
        const float s = ((blk->qs[b / 8] >> (b % 8)) & 1) ? 1.0f : -1.0f;
        acc += s * (float) q8[b];
    }
    return (float)(blk->d) * scales8.x() * acc;
}

static __dpct_inline__ float vec_dot_q1_0_g128_q8_1(
    const void * __restrict__ vbq,
    const block_q8_1 * __restrict__ bq8_1,
    const int & iqs) {
    // Dot product of a 32-element slice of one Q1_0_g128 block (128 sign bits,
    // one fp16 scale) with the corresponding Q8_1 block. iqs in 0..3 picks which
    // of the four aligned Q8_1 blocks (and which 32-bit slice) to use.
    const block_q1_0_g128 * blk = (const block_q1_0_g128 *) vbq;
    const sycl::float2 scales8 = bq8_1[iqs].ds.convert<float, sycl::rounding_mode::automatic>();
    const int8_t * q8 = bq8_1[iqs].qs;
    const int first_bit = iqs * 32; // offset of this slice inside the 128-bit block
    float acc = 0.0f;
    for (int b = 0; b < 32; ++b) {
        const int bit_idx = first_bit + b;
        // Bit set -> weight is +1, bit clear -> weight is -1.
        const float s = ((blk->qs[bit_idx / 8] >> (bit_idx % 8)) & 1) ? 1.0f : -1.0f;
        acc += s * (float) q8[b];
    }
    return (float)(blk->d) * scales8.x() * acc;
}

#endif // GGML_SYCL_VECDOTQ_HPP