From 2cd85621e66c90a2e7b03d4ea2c2a52ec9a66ba6 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Mon, 1 Jun 2026 19:12:02 -0400 Subject: [PATCH 01/25] perf: improve int8 on arm64 CPU --- bitsandbytes/backends/.DS_Store | Bin 0 -> 6148 bytes bitsandbytes/backends/cpu/ops.py | 8 ++++++-- 2 files changed, 6 insertions(+), 2 deletions(-) create mode 100644 bitsandbytes/backends/.DS_Store diff --git a/bitsandbytes/backends/.DS_Store b/bitsandbytes/backends/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..c2102dc454e1d95e549a115218b72225758e63a7 GIT binary patch literal 6148 zcmeHK%SyvQ6rE|SO({Ya3SADkE!g^OaT8+w0VBFlsfj5XjG59jW>E@R>ks)QevkLg zOu%B+BKBUGIrlk}IgoiU#<-h>ea39Ym<0`yqf#O0t_;;oG9t$@(rFOO0IZK-YGQvK z@Z0O`flXN!QsTYFcgU&U=!%H}~gR>iV-Av@WHLgHrc{t7wvs z?d>y}%>5{t%v3=XP9Wv>I!Z#ByE03{RMmPqV708)*zPPANBu!p92^gqU9lMM_qyWn zWUySetexGx(~I$Q{F2BwO(zGom24R-;T;sSnpba@#4>pVPnlh25fTH$05L!etTzMZ zT(H{fO#`i-7$62J7{L8OKtps478=#o0UchSG2TE#0Uh5Gh{B*_u+Rt*5UxuBbtyMb z46e(;FHD|eu+XT>8CNsIIA-SZ@xs;Y;1?>LaYrNd!~ij{%0OM4HlF|I@XJ&_@>fg9 zA_j^d6KoLS6G4Klvd;w;TNs|Bo literal 0 HcmV?d00001 diff --git a/bitsandbytes/backends/cpu/ops.py b/bitsandbytes/backends/cpu/ops.py index ed6803eda..74f3cfe30 100755 --- a/bitsandbytes/backends/cpu/ops.py +++ b/bitsandbytes/backends/cpu/ops.py @@ -3,6 +3,7 @@ import logging import math from math import prod +import platform from typing import Optional import torch @@ -20,8 +21,11 @@ # However, we can overflow if we use this without AVX512_VNNI support. # This is fixed in torch 2.6+, so we set this as the minimum to be safe. # For more information: https://github.com/pytorch/pytorch/pull/136942 -# TODO(matthewdouglas): aarch64? -if torch.__version__ >= (2, 6): +# +# On aarch64, torch._int_mm uses a scalar fallback that is much slower +# than fp32 matmul. Skip it and let the default backend handle this. +_is_arm64 = platform.machine().lower() in ("arm64", "aarch64") +if torch.__version__ >= (2, 6) and not _is_arm64: @register_kernel("bitsandbytes::int8_linear_matmul", "cpu") def _(A: torch.Tensor, B: torch.Tensor): From 62427bf7cf4c636484b4a7d31c20bac2c729dbd3 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Mon, 1 Jun 2026 19:12:24 -0400 Subject: [PATCH 02/25] build: temporary add CPU build verbosity --- .github/scripts/build-cpu.sh | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/.github/scripts/build-cpu.sh b/.github/scripts/build-cpu.sh index 5daeb5ea5..24c731063 100644 --- a/.github/scripts/build-cpu.sh +++ b/.github/scripts/build-cpu.sh @@ -6,10 +6,17 @@ set -xeuo pipefail pip install cmake==3.28.3 +# Temporary: vectorization reporting +if [[ "${build_os}" == windows* ]]; then + EXTRA_CXX_FLAGS="/Qvec-report:2 /Qpar-report:1" +else + EXTRA_CXX_FLAGS="-fopt-info-vec-missed -fopt-info-vec -fopt-info-loop-optimized" +fi + if [ "${build_os:0:5}" == macos ] && [ "${build_arch}" == aarch64 ]; then - cmake -DCMAKE_OSX_ARCHITECTURES=arm64 -DCOMPUTE_BACKEND=cpu . + cmake -DCMAKE_OSX_ARCHITECTURES=arm64 -DCOMPUTE_BACKEND=cpu -DCMAKE_CXX_FLAGS="${EXTRA_CXX_FLAGS}" . else - cmake -DCOMPUTE_BACKEND=cpu . + cmake -DCOMPUTE_BACKEND=cpu -DCMAKE_CXX_FLAGS="${EXTRA_CXX_FLAGS}" . fi cmake --build . --config Release From f7f1cdbe88f03d6db56c55c7b6460a9d212f9017 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Mon, 1 Jun 2026 19:15:15 -0400 Subject: [PATCH 03/25] fix --- .github/scripts/build-cpu.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/scripts/build-cpu.sh b/.github/scripts/build-cpu.sh index 24c731063..520e22eab 100644 --- a/.github/scripts/build-cpu.sh +++ b/.github/scripts/build-cpu.sh @@ -9,6 +9,8 @@ pip install cmake==3.28.3 # Temporary: vectorization reporting if [[ "${build_os}" == windows* ]]; then EXTRA_CXX_FLAGS="/Qvec-report:2 /Qpar-report:1" +elif [[ "${build_os:0:5}" == macos ]]; then + EXTRA_CXX_FLAGS="-Rpass=loop-vectorize -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize" else EXTRA_CXX_FLAGS="-fopt-info-vec-missed -fopt-info-vec -fopt-info-loop-optimized" fi From ec1e76d75f2e1a15b54e1fc812a83d3143025106 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Mon, 1 Jun 2026 21:05:40 -0400 Subject: [PATCH 04/25] cpu: skip _int_mm when not on avx512. --- bitsandbytes/backends/.DS_Store | Bin 6148 -> 0 bytes bitsandbytes/backends/cpu/ops.py | 8 +++----- 2 files changed, 3 insertions(+), 5 deletions(-) delete mode 100644 bitsandbytes/backends/.DS_Store diff --git a/bitsandbytes/backends/.DS_Store b/bitsandbytes/backends/.DS_Store deleted file mode 100644 index c2102dc454e1d95e549a115218b72225758e63a7..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHK%SyvQ6rE|SO({Ya3SADkE!g^OaT8+w0VBFlsfj5XjG59jW>E@R>ks)QevkLg zOu%B+BKBUGIrlk}IgoiU#<-h>ea39Ym<0`yqf#O0t_;;oG9t$@(rFOO0IZK-YGQvK z@Z0O`flXN!QsTYFcgU&U=!%H}~gR>iV-Av@WHLgHrc{t7wvs z?d>y}%>5{t%v3=XP9Wv>I!Z#ByE03{RMmPqV708)*zPPANBu!p92^gqU9lMM_qyWn zWUySetexGx(~I$Q{F2BwO(zGom24R-;T;sSnpba@#4>pVPnlh25fTH$05L!etTzMZ zT(H{fO#`i-7$62J7{L8OKtps478=#o0UchSG2TE#0Uh5Gh{B*_u+Rt*5UxuBbtyMb z46e(;FHD|eu+XT>8CNsIIA-SZ@xs;Y;1?>LaYrNd!~ij{%0OM4HlF|I@XJ&_@>fg9 zA_j^d6KoLS6G4Klvd;w;TNs|Bo diff --git a/bitsandbytes/backends/cpu/ops.py b/bitsandbytes/backends/cpu/ops.py index 74f3cfe30..597511c4b 100755 --- a/bitsandbytes/backends/cpu/ops.py +++ b/bitsandbytes/backends/cpu/ops.py @@ -3,7 +3,6 @@ import logging import math from math import prod -import platform from typing import Optional import torch @@ -22,10 +21,9 @@ # This is fixed in torch 2.6+, so we set this as the minimum to be safe. # For more information: https://github.com/pytorch/pytorch/pull/136942 # -# On aarch64, torch._int_mm uses a scalar fallback that is much slower -# than fp32 matmul. Skip it and let the default backend handle this. -_is_arm64 = platform.machine().lower() in ("arm64", "aarch64") -if torch.__version__ >= (2, 6) and not _is_arm64: +# Without AVX-512 (including aarch64), torch._int_mm uses a scalar fallback +# that is much slower than fp32 matmul. Only use it when AVX-512 is available. +if torch.__version__ >= (2, 6) and _has_avx512: @register_kernel("bitsandbytes::int8_linear_matmul", "cpu") def _(A: torch.Tensor, B: torch.Tensor): From 45831bd0052c6f6d5fdfc3f401d180e19d34a66f Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Mon, 1 Jun 2026 22:00:52 -0400 Subject: [PATCH 05/25] MSVC optimization for CPU ops --- csrc/cpu_ops.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp index 2a8912674..a5fb7b158 100644 --- a/csrc/cpu_ops.cpp +++ b/csrc/cpu_ops.cpp @@ -263,7 +263,7 @@ static inline __m512 set_fp4_lut() { // DATA_TYPE: 1 = FP4, 2 = NF4 template void dequantizeBlockwise4bitCpu( - unsigned char* A, const float* absmax, T* out, long long blocksize, long long m, long long n + unsigned char* __restrict A, const float* __restrict absmax, T* __restrict out, long long blocksize, long long m, long long n ) { static_assert(DATA_TYPE == 1 || DATA_TYPE == 2, "dequantizeBlockwise4bitCpu called with non 4-bit DATA_TYPE"); if (blocksize <= 0 || m < 0 || n <= 0) @@ -408,7 +408,7 @@ void dequantizeBlockwise4bitCpu( template void dequantizeBlockwise8bitCpu( - float* code, unsigned char* A, const float* absmax, T* out, long long blocksize, long long n + float* __restrict code, unsigned char* __restrict A, const float* __restrict absmax, T* __restrict out, long long blocksize, long long n ) { if (blocksize <= 0 || n <= 0) return; @@ -518,7 +518,7 @@ static inline uint16_t norm_to_lut_index(float val) { } template -void quantize_cpu_impl(float* code, const T* A, float* absmax, unsigned char* out, long long blocksize, long long n) { +void quantize_cpu_impl(float* __restrict code, const T* __restrict A, float* __restrict absmax, unsigned char* __restrict out, long long blocksize, long long n) { if (blocksize <= 0 || n <= 0) return; From 4e2b6116356ee39e4602cc35b5d2fa5009d26a5e Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Mon, 1 Jun 2026 22:37:56 -0400 Subject: [PATCH 06/25] msvc improvement --- csrc/cpu_ops.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp index a5fb7b158..272858838 100644 --- a/csrc/cpu_ops.cpp +++ b/csrc/cpu_ops.cpp @@ -263,7 +263,7 @@ static inline __m512 set_fp4_lut() { // DATA_TYPE: 1 = FP4, 2 = NF4 template void dequantizeBlockwise4bitCpu( - unsigned char* __restrict A, const float* __restrict absmax, T* __restrict out, long long blocksize, long long m, long long n + unsigned char* A, const float* absmax, T* out, long long blocksize, long long m, long long n ) { static_assert(DATA_TYPE == 1 || DATA_TYPE == 2, "dequantizeBlockwise4bitCpu called with non 4-bit DATA_TYPE"); if (blocksize <= 0 || m < 0 || n <= 0) @@ -408,7 +408,7 @@ void dequantizeBlockwise4bitCpu( template void dequantizeBlockwise8bitCpu( - float* __restrict code, unsigned char* __restrict A, const float* __restrict absmax, T* __restrict out, long long blocksize, long long n + float* code, unsigned char* A, const float* absmax, T* out, long long blocksize, long long n ) { if (blocksize <= 0 || n <= 0) return; @@ -418,6 +418,9 @@ void dequantizeBlockwise8bitCpu( long long valid_items = (n - block_idx >= blocksize ? blocksize : n - block_idx); long long block_end = block_idx + valid_items; float scale = absmax[block_idx / blocksize]; +#ifdef _MSC_VER +#pragma loop(ivdep) +#endif for (long long i = block_idx; i < block_end; ++i) { float v = code[A[i]] * scale; if constexpr (std::is_same::value) { @@ -518,7 +521,7 @@ static inline uint16_t norm_to_lut_index(float val) { } template -void quantize_cpu_impl(float* __restrict code, const T* __restrict A, float* __restrict absmax, unsigned char* __restrict out, long long blocksize, long long n) { +void quantize_cpu_impl(float* code, const T* A, float* absmax, unsigned char* out, long long blocksize, long long n) { if (blocksize <= 0 || n <= 0) return; From 6755f0387838ad49718c2d3ec279eabf26462bcb Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Tue, 2 Jun 2026 14:40:02 -0400 Subject: [PATCH 07/25] cpu: enable openmp:experimental on windows; add back avx2/fma for linux-x64 --- .github/scripts/build-cpu.sh | 6 +++++- CMakeLists.txt | 5 +++++ csrc/cpu_ops.cpp | 2 +- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/.github/scripts/build-cpu.sh b/.github/scripts/build-cpu.sh index 520e22eab..f75325505 100644 --- a/.github/scripts/build-cpu.sh +++ b/.github/scripts/build-cpu.sh @@ -4,7 +4,11 @@ declare build_os set -xeuo pipefail -pip install cmake==3.28.3 +if [[ "${build_os}" == windows* ]]; then + pip install cmake==3.30.9 +else + pip install cmake==3.28.3 +fi # Temporary: vectorization reporting if [[ "${build_os}" == windows* ]]; then diff --git a/CMakeLists.txt b/CMakeLists.txt index a787866f6..19e8ae0b1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -118,6 +118,11 @@ if (BUILD_CPU) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" HOST_ARCH) + if(MSVC) + # Use the experimental OpenMP runtime for persistent thread pool support. + # Requires CMake 3.30+; silently ignored on older CMake versions. + set(OpenMP_RUNTIME_MSVC "experimental") + endif() find_package(OpenMP) endif() diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp index 272858838..f6f2f59b4 100644 --- a/csrc/cpu_ops.cpp +++ b/csrc/cpu_ops.cpp @@ -439,7 +439,7 @@ void dequantizeBlockwise8bitCpu( // which would SIGILL on non-AVX512 CPUs like Zen3. These functions are scalar C++ and don't need AVX512. #if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) #pragma GCC push_options -#pragma GCC target("no-avx512f") +#pragma GCC target("avx2,fma,no-avx512f") #endif // Precomputed direct lookup table: maps quantized uint16 index [0..65535] to codebook index. From 44e6da6448e5d0522183c81f90f334bffc75845d Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Tue, 2 Jun 2026 15:59:17 -0400 Subject: [PATCH 08/25] improve optim test perf --- tests/test_optim.py | 44 +++++++++++++++++--------------------------- 1 file changed, 17 insertions(+), 27 deletions(-) diff --git a/tests/test_optim.py b/tests/test_optim.py index dbfb9d469..d96bdc169 100644 --- a/tests/test_optim.py +++ b/tests/test_optim.py @@ -1,9 +1,6 @@ -import os -from os.path import join -import shutil +import io import sys import time -import uuid from lion_pytorch import Lion import pytest @@ -27,15 +24,6 @@ def assert_most_approx_close(a, b, rtol=1e-3, atol=1e-3, max_error_count=0): torch.testing.assert_close(a, b, rtol=rtol, atol=atol) -def get_temp_dir(): - path = f"/tmp/autoswap/{uuid.uuid4()}" - os.makedirs(path, exist_ok=True) - return path - - -def rm_path(path): - shutil.rmtree(path) - str2optimizers = {} @@ -223,13 +211,13 @@ def test_optimizer32bit(dim1, dim2, gtype, optim_name, device): assert_most_approx_close(p1, p2.float(), atol=atol, rtol=rtol, max_error_count=15) if i % (k // 5) == 0 and i > 0: - path = get_temp_dir() - torch.save(bnb_optimizer.state_dict(), join(path, "opt.pt")) + buf = io.BytesIO() + torch.save(bnb_optimizer.state_dict(), buf) del bnb_optimizer bnb_optimizer = None bnb_optimizer = str2optimizers[optim_name][1]([p2]) - bnb_optimizer.load_state_dict(torch.load(join(path, "opt.pt"))) - rm_path(path) + buf.seek(0) + bnb_optimizer.load_state_dict(torch.load(buf)) # since Lion can have pretty noisy updates where things lie at the boundary # allow up to 10 errors for Lion assert_most_approx_close(p1, p2.float(), atol=atol, rtol=rtol, max_error_count=10) @@ -441,13 +429,13 @@ def test_optimizer8bit(dim1, dim2, gtype, optim_name, device): raws1cpy = bnb_optimizer.state[p2][name2].clone() qmap1 = bnb_optimizer.state[p2][qmap].clone() - path = get_temp_dir() - torch.save(bnb_optimizer.state_dict(), join(path, "opt.pt")) + buf = io.BytesIO() + torch.save(bnb_optimizer.state_dict(), buf) del bnb_optimizer bnb_optimizer = None bnb_optimizer = str2optimizers[optim_name][1]([p2]) - bnb_optimizer.load_state_dict(torch.load(join(path, "opt.pt"))) - rm_path(path) + buf.seek(0) + bnb_optimizer.load_state_dict(torch.load(buf)) torch.testing.assert_close(raws1cpy, bnb_optimizer.state[p2][name2]) torch.testing.assert_close(qmap1, bnb_optimizer.state[p2][qmap]) @@ -577,16 +565,18 @@ def test_ademamix_state_dict_no_nan(optim_name, optim_factory, device): # Save state model_sd = {k: v.clone() for k, v in model.state_dict().items()} opt_sd = opt.state_dict() - path = get_temp_dir() - torch.save(opt_sd, join(path, "opt.pt")) - torch.save(model_sd, join(path, "model.pt")) + opt_buf = io.BytesIO() + model_buf = io.BytesIO() + torch.save(opt_sd, opt_buf) + torch.save(model_sd, model_buf) # Create fresh model and optimizer, load state model2 = nn.Linear(256, 64).to(device) - model2.load_state_dict(torch.load(join(path, "model.pt"))) + model_buf.seek(0) + model2.load_state_dict(torch.load(model_buf)) opt2 = optim_factory(model2.parameters()) - opt2.load_state_dict(torch.load(join(path, "opt.pt"))) - rm_path(path) + opt_buf.seek(0) + opt2.load_state_dict(torch.load(opt_buf)) # Verify loaded state matches original byte-for-byte orig_params = list(model.parameters()) From c5df7fade172992a9b02e21d56fe157298985877 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Tue, 2 Jun 2026 22:54:25 -0400 Subject: [PATCH 09/25] cpu perf: improvements for arm64 8bit blockwise quant/dequant (neon) --- csrc/cpu_ops.cpp | 152 +++++++++++++++++++++++++++++++---------------- 1 file changed, 102 insertions(+), 50 deletions(-) diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp index f6f2f59b4..9ea661615 100644 --- a/csrc/cpu_ops.cpp +++ b/csrc/cpu_ops.cpp @@ -173,27 +173,50 @@ static inline void neon_f32_to_fp16x4(const float32x4_t src, fp16_t* dst) { vst1_u16(reinterpret_cast(dst), vreinterpret_u16_f16(half)); } -// NEON-optimized absmax computation for a block of float32 -static inline float neon_absmax_f32(const float* data, long long n) { +// NEON-optimized FP16 to float conversion (4 values at a time) +static inline float32x4_t neon_fp16x4_to_f32(const fp16_t* src) { + uint16x4_t raw = vld1_u16(reinterpret_cast(src)); + return vcvt_f32_f16(vreinterpret_f16_u16(raw)); +} + +// NEON-optimized absmax computation for a block of float32, bf16, or fp16. +template +static inline float neon_absmax(const T* data, long long n) { float32x4_t vmax = vdupq_n_f32(0.0f); long long i = 0; - // Process 16 elements per iteration for better throughput for (; i + 16 <= n; i += 16) { - float32x4_t v0 = vabsq_f32(vld1q_f32(data + i)); - float32x4_t v1 = vabsq_f32(vld1q_f32(data + i + 4)); - float32x4_t v2 = vabsq_f32(vld1q_f32(data + i + 8)); - float32x4_t v3 = vabsq_f32(vld1q_f32(data + i + 12)); - vmax = vmaxq_f32(vmax, vmaxq_f32(vmaxq_f32(v0, v1), vmaxq_f32(v2, v3))); + float32x4_t v0, v1, v2, v3; + if constexpr (std::is_same::value) { + const float* p = reinterpret_cast(data + i); + v0 = vld1q_f32(p); v1 = vld1q_f32(p + 4); + v2 = vld1q_f32(p + 8); v3 = vld1q_f32(p + 12); + } else if constexpr (std::is_same::value) { + v0 = neon_bf16x4_to_f32(data + i); v1 = neon_bf16x4_to_f32(data + i + 4); + v2 = neon_bf16x4_to_f32(data + i + 8); v3 = neon_bf16x4_to_f32(data + i + 12); + } else { + v0 = neon_fp16x4_to_f32(data + i); v1 = neon_fp16x4_to_f32(data + i + 4); + v2 = neon_fp16x4_to_f32(data + i + 8); v3 = neon_fp16x4_to_f32(data + i + 12); + } + vmax = vmaxq_f32(vmax, vmaxq_f32(vmaxq_f32(vabsq_f32(v0), vabsq_f32(v1)), + vmaxq_f32(vabsq_f32(v2), vabsq_f32(v3)))); } for (; i + 4 <= n; i += 4) { - float32x4_t v = vld1q_f32(data + i); + float32x4_t v; + if constexpr (std::is_same::value) + v = vld1q_f32(reinterpret_cast(data + i)); + else if constexpr (std::is_same::value) + v = neon_bf16x4_to_f32(data + i); + else + v = neon_fp16x4_to_f32(data + i); vmax = vmaxq_f32(vmax, vabsq_f32(v)); } - // Horizontal max float result = vmaxvq_f32(vmax); - // Handle remainder for (; i < n; ++i) { - result = std::max(result, std::fabs(data[i])); + float val; + if constexpr (std::is_same::value) val = data[i]; + else if constexpr (std::is_same::value) val = bf16_to_float(data[i].v); + else val = fp16_to_float(data[i].v); + result = std::max(result, std::fabs(val)); } return result; } @@ -418,6 +441,31 @@ void dequantizeBlockwise8bitCpu( long long valid_items = (n - block_idx >= blocksize ? blocksize : n - block_idx); long long block_end = block_idx + valid_items; float scale = absmax[block_idx / blocksize]; +#if defined(_M_ARM64) || defined(__aarch64__) + { + float32x4_t vscale = vdupq_n_f32(scale); + long long i = block_idx; + for (; i + 4 <= block_end; i += 4) { + float tmp[4] = { code[A[i]], code[A[i+1]], code[A[i+2]], code[A[i+3]] }; + float32x4_t v = vmulq_f32(vld1q_f32(tmp), vscale); + if constexpr (std::is_same::value) + vst1q_f32(reinterpret_cast(out + i), v); + else if constexpr (std::is_same::value) + neon_f32_to_bf16x4(v, out + i); + else + neon_f32_to_fp16x4(v, out + i); + } + for (; i < block_end; ++i) { + float v = code[A[i]] * scale; + if constexpr (std::is_same::value) + out[i] = float_to_bf16(v); + else if constexpr (std::is_same::value) + out[i] = float_to_fp16(v); + else + out[i] = static_cast(v); + } + } +#else #ifdef _MSC_VER #pragma loop(ivdep) #endif @@ -431,6 +479,7 @@ void dequantizeBlockwise8bitCpu( out[i] = static_cast(v); } } +#endif } } @@ -540,24 +589,19 @@ void quantize_cpu_impl(float* code, const T* A, float* absmax, unsigned char* ou float absmax_block = 0.0f; #if defined(_M_ARM64) || defined(__aarch64__) - if constexpr (std::is_same::value) { - // Use NEON-optimized absmax for float32 - absmax_block = neon_absmax_f32(reinterpret_cast(A + block_start), block_len); - } else -#endif - { - for (long long i = block_start; i < block_end; ++i) { - float val; - if constexpr (std::is_same::value) { - val = A[i]; - } else if constexpr (std::is_same::value) { - val = bf16_to_float(A[i].v); - } else if constexpr (std::is_same::value) { - val = fp16_to_float(A[i].v); - } - absmax_block = std::max(absmax_block, std::fabs(val)); - } + absmax_block = neon_absmax(A + block_start, block_len); +#else + for (long long i = block_start; i < block_end; ++i) { + float val; + if constexpr (std::is_same::value) + val = A[i]; + else if constexpr (std::is_same::value) + val = bf16_to_float(A[i].v); + else + val = fp16_to_float(A[i].v); + absmax_block = std::max(absmax_block, std::fabs(val)); } +#endif absmax[b] = absmax_block; @@ -571,41 +615,49 @@ void quantize_cpu_impl(float* code, const T* A, float* absmax, unsigned char* ou const float inv_absmax = 1.0f / absmax_block; #if defined(_M_ARM64) || defined(__aarch64__) - if constexpr (std::is_same::value) { - // NEON-optimized normalize + LUT index for float32 - const float* src = A + block_start; + { long long i = 0; float32x4_t vinv = vdupq_n_f32(inv_absmax); for (; i + 4 <= block_len; i += 4) { - float32x4_t v = vmulq_f32(vld1q_f32(src + i), vinv); + float32x4_t v; + if constexpr (std::is_same::value) + v = vld1q_f32(reinterpret_cast(A + block_start + i)); + else if constexpr (std::is_same::value) + v = neon_bf16x4_to_f32(A + block_start + i); + else + v = neon_fp16x4_to_f32(A + block_start + i); + v = vmulq_f32(v, vinv); uint16x4_t indices = neon_norm_to_lut_index_x4(v); uint16_t idx_arr[4]; vst1_u16(idx_arr, indices); - out[block_start + i] = lut[idx_arr[0]]; + out[block_start + i] = lut[idx_arr[0]]; out[block_start + i + 1] = lut[idx_arr[1]]; out[block_start + i + 2] = lut[idx_arr[2]]; out[block_start + i + 3] = lut[idx_arr[3]]; } for (; i < block_len; ++i) { - float normed_value = src[i] * inv_absmax; - out[block_start + i] = lut[norm_to_lut_index(normed_value)]; - } - } else -#endif - { - for (long long i = block_start; i < block_end; ++i) { float val; - if constexpr (std::is_same::value) { - val = A[i]; - } else if constexpr (std::is_same::value) { - val = bf16_to_float(A[i].v); - } else if constexpr (std::is_same::value) { - val = fp16_to_float(A[i].v); - } - float normed_value = val * inv_absmax; - out[i] = lut[norm_to_lut_index(normed_value)]; + if constexpr (std::is_same::value) + val = A[block_start + i]; + else if constexpr (std::is_same::value) + val = bf16_to_float(A[block_start + i].v); + else + val = fp16_to_float(A[block_start + i].v); + out[block_start + i] = lut[norm_to_lut_index(val * inv_absmax)]; } } +#else + for (long long i = block_start; i < block_end; ++i) { + float val; + if constexpr (std::is_same::value) + val = A[i]; + else if constexpr (std::is_same::value) + val = bf16_to_float(A[i].v); + else + val = fp16_to_float(A[i].v); + out[i] = lut[norm_to_lut_index(val * inv_absmax)]; + } +#endif } } From 6223b78c5ff9392948948e1c7eab6d7923d6f7a2 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Wed, 3 Jun 2026 15:37:31 -0400 Subject: [PATCH 10/25] cpu perf: ARM64 NEON improvements for blockwise quantization --- csrc/cpu_ops.cpp | 188 ++++++++++++++++++++++++----------------------- 1 file changed, 98 insertions(+), 90 deletions(-) diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp index 9ea661615..bda400e10 100644 --- a/csrc/cpu_ops.cpp +++ b/csrc/cpu_ops.cpp @@ -102,42 +102,28 @@ static inline void // interleaved.val[0] has elements 0-7, interleaved.val[1] has elements 8-15 uint8x16_t indices = vcombine_u8(interleaved.val[0], interleaved.val[1]); - // Use flat LUT for fast indexed access - // Store LUT as flat float array on stack (likely in L1 cache) - float flat_lut[16]; - vst1q_f32(flat_lut, lut[0]); - vst1q_f32(flat_lut + 4, lut[1]); - vst1q_f32(flat_lut + 8, lut[2]); - vst1q_f32(flat_lut + 12, lut[3]); - - // Extract indices and do lookups in groups of 4 for NEON multiply - uint8_t idx_arr[16]; - vst1q_u8(idx_arr, indices); - + // Reinterpret float LUT as 64-byte table for vqtbl4q_u8 lookup. + // Each 4-bit index i maps to bytes [i*4 .. i*4+3] in the table. + uint8x16x4_t lut_bytes = { + vreinterpretq_u8_f32(lut[0]), vreinterpretq_u8_f32(lut[1]), vreinterpretq_u8_f32(lut[2]), + vreinterpretq_u8_f32(lut[3]) + }; + // Multiply each index by 4 to get byte offset (max 15*4=60 < 64, safe) + uint8x16_t base = vshlq_n_u8(indices, 2); + // Expand each base offset to 4 consecutive bytes via zip + static const uint8x16_t off = {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}; + uint8x8_t lo = vget_low_u8(base), hi = vget_high_u8(base); + uint8x8x2_t z0 = vzip_u8(lo, lo); + uint8x8x2_t z1 = vzip_u8(hi, hi); + uint8x8x2_t zlo = vzip_u8(z0.val[0], z0.val[0]); + uint8x8x2_t zhi = vzip_u8(z0.val[1], z0.val[1]); + uint8x8x2_t zlo2 = vzip_u8(z1.val[0], z1.val[0]); + uint8x8x2_t zhi2 = vzip_u8(z1.val[1], z1.val[1]); float32x4_t vscale = vdupq_n_f32(scale); - - // Process 4 values at a time with NEON - load from temp buffer - float tmp_vals[16]; - tmp_vals[0] = flat_lut[idx_arr[0]]; - tmp_vals[1] = flat_lut[idx_arr[1]]; - tmp_vals[2] = flat_lut[idx_arr[2]]; - tmp_vals[3] = flat_lut[idx_arr[3]]; - tmp_vals[4] = flat_lut[idx_arr[4]]; - tmp_vals[5] = flat_lut[idx_arr[5]]; - tmp_vals[6] = flat_lut[idx_arr[6]]; - tmp_vals[7] = flat_lut[idx_arr[7]]; - tmp_vals[8] = flat_lut[idx_arr[8]]; - tmp_vals[9] = flat_lut[idx_arr[9]]; - tmp_vals[10] = flat_lut[idx_arr[10]]; - tmp_vals[11] = flat_lut[idx_arr[11]]; - tmp_vals[12] = flat_lut[idx_arr[12]]; - tmp_vals[13] = flat_lut[idx_arr[13]]; - tmp_vals[14] = flat_lut[idx_arr[14]]; - tmp_vals[15] = flat_lut[idx_arr[15]]; - float32x4_t v0 = vld1q_f32(tmp_vals); - float32x4_t v1 = vld1q_f32(tmp_vals + 4); - float32x4_t v2 = vld1q_f32(tmp_vals + 8); - float32x4_t v3 = vld1q_f32(tmp_vals + 12); + float32x4_t v0 = vreinterpretq_f32_u8(vqtbl4q_u8(lut_bytes, vaddq_u8(vcombine_u8(zlo.val[0], zlo.val[1]), off))); + float32x4_t v1 = vreinterpretq_f32_u8(vqtbl4q_u8(lut_bytes, vaddq_u8(vcombine_u8(zhi.val[0], zhi.val[1]), off))); + float32x4_t v2 = vreinterpretq_f32_u8(vqtbl4q_u8(lut_bytes, vaddq_u8(vcombine_u8(zlo2.val[0], zlo2.val[1]), off))); + float32x4_t v3 = vreinterpretq_f32_u8(vqtbl4q_u8(lut_bytes, vaddq_u8(vcombine_u8(zhi2.val[0], zhi2.val[1]), off))); vst1q_f32(out, vmulq_f32(v0, vscale)); vst1q_f32(out + 4, vmulq_f32(v1, vscale)); @@ -180,25 +166,31 @@ static inline float32x4_t neon_fp16x4_to_f32(const fp16_t* src) { } // NEON-optimized absmax computation for a block of float32, bf16, or fp16. -template -static inline float neon_absmax(const T* data, long long n) { +template static inline float neon_absmax(const T* data, long long n) { float32x4_t vmax = vdupq_n_f32(0.0f); long long i = 0; for (; i + 16 <= n; i += 16) { float32x4_t v0, v1, v2, v3; if constexpr (std::is_same::value) { const float* p = reinterpret_cast(data + i); - v0 = vld1q_f32(p); v1 = vld1q_f32(p + 4); - v2 = vld1q_f32(p + 8); v3 = vld1q_f32(p + 12); + v0 = vld1q_f32(p); + v1 = vld1q_f32(p + 4); + v2 = vld1q_f32(p + 8); + v3 = vld1q_f32(p + 12); } else if constexpr (std::is_same::value) { - v0 = neon_bf16x4_to_f32(data + i); v1 = neon_bf16x4_to_f32(data + i + 4); - v2 = neon_bf16x4_to_f32(data + i + 8); v3 = neon_bf16x4_to_f32(data + i + 12); + v0 = neon_bf16x4_to_f32(data + i); + v1 = neon_bf16x4_to_f32(data + i + 4); + v2 = neon_bf16x4_to_f32(data + i + 8); + v3 = neon_bf16x4_to_f32(data + i + 12); } else { - v0 = neon_fp16x4_to_f32(data + i); v1 = neon_fp16x4_to_f32(data + i + 4); - v2 = neon_fp16x4_to_f32(data + i + 8); v3 = neon_fp16x4_to_f32(data + i + 12); + v0 = neon_fp16x4_to_f32(data + i); + v1 = neon_fp16x4_to_f32(data + i + 4); + v2 = neon_fp16x4_to_f32(data + i + 8); + v3 = neon_fp16x4_to_f32(data + i + 12); } - vmax = vmaxq_f32(vmax, vmaxq_f32(vmaxq_f32(vabsq_f32(v0), vabsq_f32(v1)), - vmaxq_f32(vabsq_f32(v2), vabsq_f32(v3)))); + vmax = vmaxq_f32( + vmax, vmaxq_f32(vmaxq_f32(vabsq_f32(v0), vabsq_f32(v1)), vmaxq_f32(vabsq_f32(v2), vabsq_f32(v3))) + ); } for (; i + 4 <= n; i += 4) { float32x4_t v; @@ -213,9 +205,12 @@ static inline float neon_absmax(const T* data, long long n) { float result = vmaxvq_f32(vmax); for (; i < n; ++i) { float val; - if constexpr (std::is_same::value) val = data[i]; - else if constexpr (std::is_same::value) val = bf16_to_float(data[i].v); - else val = fp16_to_float(data[i].v); + if constexpr (std::is_same::value) + val = data[i]; + else if constexpr (std::is_same::value) + val = bf16_to_float(data[i].v); + else + val = fp16_to_float(data[i].v); result = std::max(result, std::fabs(val)); } return result; @@ -296,50 +291,63 @@ void dequantizeBlockwise4bitCpu( { long long dim_0 = m; long long dim_1 = n; - long long input_dim_1 = dim_1 >> 1; - long long absmax_dim_1 = dim_1 / blocksize; - // NEON path: process 16 output values at a time (8 packed bytes) - // Only use when blocksize evenly divides dim_1 to ensure correct scale indexing - constexpr long long VEC_LEN = 16; - if (dim_1 % VEC_LEN == 0 && blocksize >= VEC_LEN && (dim_1 % blocksize == 0)) { - float32x4_t lut[4]; - if constexpr (DATA_TYPE == 1) { - neon_fp4_lut(lut); - } else { - neon_nf4_lut(lut); + long long input_dim_1 = (dim_1 + 1) >> 1; // ceil(dim_1/2): handles odd dim_1 + long long absmax_dim_1 = (dim_1 + blocksize - 1) / blocksize; + float32x4_t lut[4]; + if constexpr (DATA_TYPE == 1) { + neon_fp4_lut(lut); + } else { + neon_nf4_lut(lut); + } + constexpr long long k_step = 8; // 8 packed bytes = 16 output values + BNB_OMP_PARALLEL_FOR + for (long long block_idx = 0; block_idx < dim_0; ++block_idx) { + long long k = 0; + for (; k + k_step <= input_dim_1; k += k_step) { + long long scale_idx = k * 2 / blocksize; + float scale = absmax[block_idx * absmax_dim_1 + scale_idx]; + const uint8_t* p = &A[block_idx * input_dim_1 + k]; + float tmp_f32[16]; + neon_dequant_4bit_16values(p, scale, lut, tmp_f32); + T* pout = &out[block_idx * dim_1 + k * 2]; + if constexpr (std::is_same()) { + std::memcpy(pout, tmp_f32, 16 * sizeof(float)); + } else if constexpr (std::is_same()) { + neon_f32_to_bf16x4(vld1q_f32(tmp_f32), pout); + neon_f32_to_bf16x4(vld1q_f32(tmp_f32 + 4), pout + 4); + neon_f32_to_bf16x4(vld1q_f32(tmp_f32 + 8), pout + 8); + neon_f32_to_bf16x4(vld1q_f32(tmp_f32 + 12), pout + 12); + } else { + neon_f32_to_fp16x4(vld1q_f32(tmp_f32), pout); + neon_f32_to_fp16x4(vld1q_f32(tmp_f32 + 4), pout + 4); + neon_f32_to_fp16x4(vld1q_f32(tmp_f32 + 8), pout + 8); + neon_f32_to_fp16x4(vld1q_f32(tmp_f32 + 12), pout + 12); + } } - constexpr long long k_step = VEC_LEN / 2; // 8 bytes per iteration - BNB_OMP_PARALLEL_FOR - for (long long block_idx = 0; block_idx < dim_0; ++block_idx) { - for (long long k = 0; k < input_dim_1; k += k_step) { - long long scale_idx = k * 2 / blocksize; - float scale = absmax[block_idx * absmax_dim_1 + scale_idx]; - const uint8_t* p = &A[block_idx * input_dim_1 + k]; - - // Dequantize 16 values into a temp float buffer - float tmp_f32[16]; - neon_dequant_4bit_16values(p, scale, lut, tmp_f32); - - // Store results (convert to output type using NEON) - T* pout = &out[block_idx * dim_1 + k * 2]; - if constexpr (std::is_same()) { - // Direct copy - already float - std::memcpy(pout, tmp_f32, 16 * sizeof(float)); - } else if constexpr (std::is_same()) { - neon_f32_to_bf16x4(vld1q_f32(tmp_f32), pout); - neon_f32_to_bf16x4(vld1q_f32(tmp_f32 + 4), pout + 4); - neon_f32_to_bf16x4(vld1q_f32(tmp_f32 + 8), pout + 8); - neon_f32_to_bf16x4(vld1q_f32(tmp_f32 + 12), pout + 12); - } else if constexpr (std::is_same()) { - neon_f32_to_fp16x4(vld1q_f32(tmp_f32), pout); - neon_f32_to_fp16x4(vld1q_f32(tmp_f32 + 4), pout + 4); - neon_f32_to_fp16x4(vld1q_f32(tmp_f32 + 8), pout + 8); - neon_f32_to_fp16x4(vld1q_f32(tmp_f32 + 12), pout + 12); - } + // Scalar remainder for dim_1 not divisible by 16, and last nibble when dim_1 is odd + for (; k < input_dim_1; ++k) { + long long out_base = block_idx * dim_1 + k * 2; + long long scale_idx = k * 2 / blocksize; + float scale = absmax[block_idx * absmax_dim_1 + scale_idx]; + unsigned char byte = A[block_idx * input_dim_1 + k]; + float v0 = (DATA_TYPE == 1 ? dDequantizeFP4(byte >> 4) : dDequantizeNF4(byte >> 4)) * scale; + float v1 = (DATA_TYPE == 1 ? dDequantizeFP4(byte & 0x0F) : dDequantizeNF4(byte & 0x0F)) * scale; + if constexpr (std::is_same()) { + out[out_base] = v0; + if (k * 2 + 1 < dim_1) + out[out_base + 1] = v1; + } else if constexpr (std::is_same()) { + out[out_base] = float_to_bf16(v0); + if (k * 2 + 1 < dim_1) + out[out_base + 1] = float_to_bf16(v1); + } else { + out[out_base] = float_to_fp16(v0); + if (k * 2 + 1 < dim_1) + out[out_base + 1] = float_to_fp16(v1); } } - return; } + return; } #endif // _M_ARM64 || __aarch64__ @@ -446,7 +454,7 @@ void dequantizeBlockwise8bitCpu( float32x4_t vscale = vdupq_n_f32(scale); long long i = block_idx; for (; i + 4 <= block_end; i += 4) { - float tmp[4] = { code[A[i]], code[A[i+1]], code[A[i+2]], code[A[i+3]] }; + float tmp[4] = {code[A[i]], code[A[i + 1]], code[A[i + 2]], code[A[i + 3]]}; float32x4_t v = vmulq_f32(vld1q_f32(tmp), vscale); if constexpr (std::is_same::value) vst1q_f32(reinterpret_cast(out + i), v); @@ -630,7 +638,7 @@ void quantize_cpu_impl(float* code, const T* A, float* absmax, unsigned char* ou uint16x4_t indices = neon_norm_to_lut_index_x4(v); uint16_t idx_arr[4]; vst1_u16(idx_arr, indices); - out[block_start + i] = lut[idx_arr[0]]; + out[block_start + i] = lut[idx_arr[0]]; out[block_start + i + 1] = lut[idx_arr[1]]; out[block_start + i + 2] = lut[idx_arr[2]]; out[block_start + i + 3] = lut[idx_arr[3]]; From 2adea9975a6e15ea41c5e8e2dc420ba3fe807ec8 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Wed, 3 Jun 2026 15:51:09 -0400 Subject: [PATCH 11/25] fix msvc arm64 build --- csrc/cpu_ops.cpp | 2 +- csrc/cpu_ops.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp index bda400e10..94bf6c57b 100644 --- a/csrc/cpu_ops.cpp +++ b/csrc/cpu_ops.cpp @@ -111,7 +111,7 @@ static inline void // Multiply each index by 4 to get byte offset (max 15*4=60 < 64, safe) uint8x16_t base = vshlq_n_u8(indices, 2); // Expand each base offset to 4 consecutive bytes via zip - static const uint8x16_t off = {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}; + const uint8x16_t off = vreinterpretq_u8_u32(vdupq_n_u32(0x03020100)); uint8x8_t lo = vget_low_u8(base), hi = vget_high_u8(base); uint8x8x2_t z0 = vzip_u8(lo, lo); uint8x8x2_t z1 = vzip_u8(hi, hi); diff --git a/csrc/cpu_ops.h b/csrc/cpu_ops.h index 14df69921..2f7a8f873 100644 --- a/csrc/cpu_ops.h +++ b/csrc/cpu_ops.h @@ -32,7 +32,7 @@ template inline int get_cache_blocks(int chunk_size) { } // forced unroll for perf critical path -#if __has_attribute(always_inline) +#if defined(__has_attribute) && __has_attribute(always_inline) #define ALWAYS_INLINE __attribute__((__always_inline__)) inline #else #define ALWAYS_INLINE inline From 144edc780a2d081c852ff0b1fa2b3aec97b54891 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Wed, 3 Jun 2026 16:50:47 -0400 Subject: [PATCH 12/25] fix --- csrc/cpu_ops.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp index 94bf6c57b..8053b0832 100644 --- a/csrc/cpu_ops.cpp +++ b/csrc/cpu_ops.cpp @@ -288,7 +288,9 @@ void dequantizeBlockwise4bitCpu( return; #if defined(_M_ARM64) || defined(__aarch64__) - { + // n % blocksize == 0: absmax is organized by flat element blocks; row and block + // boundaries must align or the 2D absmax indexing gives wrong scale values. + if (n % blocksize == 0) { long long dim_0 = m; long long dim_1 = n; long long input_dim_1 = (dim_1 + 1) >> 1; // ceil(dim_1/2): handles odd dim_1 From 4021345363ff8190dfcb10c7dd85348d8ed9cd19 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Wed, 3 Jun 2026 18:17:26 -0400 Subject: [PATCH 13/25] remove dead code --- csrc/cpu_ops.cpp | 29 +++-------------------------- 1 file changed, 3 insertions(+), 26 deletions(-) diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp index 8053b0832..8ffb93d99 100644 --- a/csrc/cpu_ops.cpp +++ b/csrc/cpu_ops.cpp @@ -293,8 +293,8 @@ void dequantizeBlockwise4bitCpu( if (n % blocksize == 0) { long long dim_0 = m; long long dim_1 = n; - long long input_dim_1 = (dim_1 + 1) >> 1; // ceil(dim_1/2): handles odd dim_1 - long long absmax_dim_1 = (dim_1 + blocksize - 1) / blocksize; + long long input_dim_1 = dim_1 >> 1; + long long absmax_dim_1 = dim_1 / blocksize; float32x4_t lut[4]; if constexpr (DATA_TYPE == 1) { neon_fp4_lut(lut); @@ -304,8 +304,7 @@ void dequantizeBlockwise4bitCpu( constexpr long long k_step = 8; // 8 packed bytes = 16 output values BNB_OMP_PARALLEL_FOR for (long long block_idx = 0; block_idx < dim_0; ++block_idx) { - long long k = 0; - for (; k + k_step <= input_dim_1; k += k_step) { + for (long long k = 0; k < input_dim_1; k += k_step) { long long scale_idx = k * 2 / blocksize; float scale = absmax[block_idx * absmax_dim_1 + scale_idx]; const uint8_t* p = &A[block_idx * input_dim_1 + k]; @@ -326,28 +325,6 @@ void dequantizeBlockwise4bitCpu( neon_f32_to_fp16x4(vld1q_f32(tmp_f32 + 12), pout + 12); } } - // Scalar remainder for dim_1 not divisible by 16, and last nibble when dim_1 is odd - for (; k < input_dim_1; ++k) { - long long out_base = block_idx * dim_1 + k * 2; - long long scale_idx = k * 2 / blocksize; - float scale = absmax[block_idx * absmax_dim_1 + scale_idx]; - unsigned char byte = A[block_idx * input_dim_1 + k]; - float v0 = (DATA_TYPE == 1 ? dDequantizeFP4(byte >> 4) : dDequantizeNF4(byte >> 4)) * scale; - float v1 = (DATA_TYPE == 1 ? dDequantizeFP4(byte & 0x0F) : dDequantizeNF4(byte & 0x0F)) * scale; - if constexpr (std::is_same()) { - out[out_base] = v0; - if (k * 2 + 1 < dim_1) - out[out_base + 1] = v1; - } else if constexpr (std::is_same()) { - out[out_base] = float_to_bf16(v0); - if (k * 2 + 1 < dim_1) - out[out_base + 1] = float_to_bf16(v1); - } else { - out[out_base] = float_to_fp16(v0); - if (k * 2 + 1 < dim_1) - out[out_base + 1] = float_to_fp16(v1); - } - } } return; } From ed1db523062a1c479a1a4eeedd94119236ad2224 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Wed, 3 Jun 2026 18:25:12 -0400 Subject: [PATCH 14/25] x86-64 cpu perf improvement --- CMakeLists.txt | 1 + csrc/cpu_ops.cpp | 58 +++++++++++++++++++++++++++++++++++++++++++----- csrc/cpu_ops.h | 15 +++++++++++++ 3 files changed, 68 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 19e8ae0b1..9714f9946 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -380,6 +380,7 @@ if (BUILD_CPU) -mprefer-vector-width=256 -mfma -mavx2 + -mf16c -mlzcnt -mbmi -mbmi2 diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp index 8ffb93d99..6ad62f012 100644 --- a/csrc/cpu_ops.cpp +++ b/csrc/cpu_ops.cpp @@ -231,7 +231,6 @@ static inline uint16x4_t neon_norm_to_lut_index_x4(float32x4_t vals) { #endif // _M_ARM64 || __aarch64__ #if defined(__AVX512F__) -#include inline __m256i cvt_fp32_to_fp16(const __m512 src) { return _mm512_cvtps_ph(src, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); @@ -277,6 +276,29 @@ static inline __m512 set_fp4_lut() { } #endif +static constexpr float fp4_lut[16] = { + 0.0f, 0.005208333333f, 0.66666667f, 1.0f, 0.33333333f, 0.5f, 0.16666667f, 0.25f, + -0.0f, -0.005208333333f, -0.66666667f, -1.0f, -0.33333333f, -0.5f, -0.16666667f, -0.25f, +}; +static constexpr float nf4_lut[16] = { + -1.0f, + -0.6961928009986877f, + -0.5250730514526367f, + -0.39491748809814453f, + -0.28444138169288635f, + -0.18477343022823334f, + -0.09105003625154495f, + 0.0f, + 0.07958029955625534f, + 0.16093020141124725f, + 0.24611230194568634f, + 0.33791524171829224f, + 0.44070982933044434f, + 0.5626170039176941f, + 0.7229568362236023f, + 1.0f, +}; + // 4-bit (FP4 / NF4) dequantization helper extracted from the original else branch. // DATA_TYPE: 1 = FP4, 2 = NF4 template @@ -293,8 +315,8 @@ void dequantizeBlockwise4bitCpu( if (n % blocksize == 0) { long long dim_0 = m; long long dim_1 = n; - long long input_dim_1 = dim_1 >> 1; - long long absmax_dim_1 = dim_1 / blocksize; + long long input_dim_1 = (dim_1 + 1) >> 1; // ceil(dim_1/2): handles odd dim_1 + long long absmax_dim_1 = (dim_1 + blocksize - 1) / blocksize; float32x4_t lut[4]; if constexpr (DATA_TYPE == 1) { neon_fp4_lut(lut); @@ -304,7 +326,8 @@ void dequantizeBlockwise4bitCpu( constexpr long long k_step = 8; // 8 packed bytes = 16 output values BNB_OMP_PARALLEL_FOR for (long long block_idx = 0; block_idx < dim_0; ++block_idx) { - for (long long k = 0; k < input_dim_1; k += k_step) { + long long k = 0; + for (; k + k_step <= input_dim_1; k += k_step) { long long scale_idx = k * 2 / blocksize; float scale = absmax[block_idx * absmax_dim_1 + scale_idx]; const uint8_t* p = &A[block_idx * input_dim_1 + k]; @@ -325,6 +348,28 @@ void dequantizeBlockwise4bitCpu( neon_f32_to_fp16x4(vld1q_f32(tmp_f32 + 12), pout + 12); } } + // Scalar remainder for dim_1 not divisible by 16, and last nibble when dim_1 is odd + for (; k < input_dim_1; ++k) { + long long out_base = block_idx * dim_1 + k * 2; + long long scale_idx = k * 2 / blocksize; + float scale = absmax[block_idx * absmax_dim_1 + scale_idx]; + unsigned char byte = A[block_idx * input_dim_1 + k]; + float v0 = lut[byte >> 4] * scale; + float v1 = lut[byte & 0x0F] * scale; + if constexpr (std::is_same()) { + out[out_base] = v0; + if (k * 2 + 1 < dim_1) + out[out_base + 1] = v1; + } else if constexpr (std::is_same()) { + out[out_base] = float_to_bf16(v0); + if (k * 2 + 1 < dim_1) + out[out_base + 1] = float_to_bf16(v1); + } else { + out[out_base] = float_to_fp16(v0); + if (k * 2 + 1 < dim_1) + out[out_base + 1] = float_to_fp16(v1); + } + } } return; } @@ -381,6 +426,7 @@ void dequantizeBlockwise4bitCpu( } #endif // Scalar fallback branch + const float* lut = DATA_TYPE == 1 ? fp4_lut : nf4_lut; long long total = m * n; BNB_OMP_PARALLEL_FOR for (long long block_idx = 0; block_idx < total; block_idx += blocksize) { @@ -391,9 +437,9 @@ void dequantizeBlockwise4bitCpu( unsigned char byte = A[byte_index]; // High nibble first (matches previous code logic) - float v0 = (DATA_TYPE == 1 ? dDequantizeFP4(byte >> 4) : dDequantizeNF4(byte >> 4)) * scale; + float v0 = lut[byte >> 4] * scale; // Low nibble second - float v1 = (DATA_TYPE == 1 ? dDequantizeFP4(byte & 0x0F) : dDequantizeNF4(byte & 0x0F)) * scale; + float v1 = lut[byte & 0x0F] * scale; if constexpr (std::is_same::value) { out[block_idx + i] = float_to_bf16(v0); diff --git a/csrc/cpu_ops.h b/csrc/cpu_ops.h index 2f7a8f873..3a1ec40a8 100644 --- a/csrc/cpu_ops.h +++ b/csrc/cpu_ops.h @@ -13,6 +13,10 @@ #include #endif +#if defined(__x86_64__) || defined(_M_X64) +#include +#endif + // amx-bf16 #define TILE_M 16 #define TILE_N 16 @@ -147,6 +151,12 @@ static float bf16_to_float(uint16_t bf16) { } static inline fp16_t float_to_fp16(float x) { +#if defined(__AVX2__) + // F16C is guaranteed on all AVX2 CPUs; matches CUDA round-to-nearest-even behavior + return fp16_t{ + (uint16_t)_mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC), 0) + }; +#else uint32_t bits; std::memcpy(&bits, &x, 4); uint32_t sign = (bits >> 31) & 0x1; @@ -186,9 +196,13 @@ static inline fp16_t float_to_fp16(float x) { h = (sign << 15) | ((uint16_t)exp_h << 10) | ((uint16_t)(mant_rounded >> 13)); } return fp16_t{h}; +#endif } static inline float fp16_to_float(uint16_t h) { +#if defined(__AVX2__) + return _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(h))); +#else uint32_t sign = (h >> 15) & 0x1; uint32_t exp = (h >> 10) & 0x1F; uint32_t mant = h & 0x3FF; @@ -216,6 +230,7 @@ static inline float fp16_to_float(uint16_t h) { float f; std::memcpy(&f, &bits, sizeof(f)); return f; +#endif } inline float dDequantizeFP4(unsigned char val) { From e54ccf8c638695dc5c546ed7d9084dae39784483 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Wed, 3 Jun 2026 18:31:17 -0400 Subject: [PATCH 15/25] fix --- csrc/cpu_ops.cpp | 37 +++++++------------------------------ 1 file changed, 7 insertions(+), 30 deletions(-) diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp index 6ad62f012..cbbb79a30 100644 --- a/csrc/cpu_ops.cpp +++ b/csrc/cpu_ops.cpp @@ -315,24 +315,23 @@ void dequantizeBlockwise4bitCpu( if (n % blocksize == 0) { long long dim_0 = m; long long dim_1 = n; - long long input_dim_1 = (dim_1 + 1) >> 1; // ceil(dim_1/2): handles odd dim_1 - long long absmax_dim_1 = (dim_1 + blocksize - 1) / blocksize; - float32x4_t lut[4]; + long long input_dim_1 = dim_1 >> 1; + long long absmax_dim_1 = dim_1 / blocksize; + float32x4_t neon_lut[4]; if constexpr (DATA_TYPE == 1) { - neon_fp4_lut(lut); + neon_fp4_lut(neon_lut); } else { - neon_nf4_lut(lut); + neon_nf4_lut(neon_lut); } constexpr long long k_step = 8; // 8 packed bytes = 16 output values BNB_OMP_PARALLEL_FOR for (long long block_idx = 0; block_idx < dim_0; ++block_idx) { - long long k = 0; - for (; k + k_step <= input_dim_1; k += k_step) { + for (long long k = 0; k < input_dim_1; k += k_step) { long long scale_idx = k * 2 / blocksize; float scale = absmax[block_idx * absmax_dim_1 + scale_idx]; const uint8_t* p = &A[block_idx * input_dim_1 + k]; float tmp_f32[16]; - neon_dequant_4bit_16values(p, scale, lut, tmp_f32); + neon_dequant_4bit_16values(p, scale, neon_lut, tmp_f32); T* pout = &out[block_idx * dim_1 + k * 2]; if constexpr (std::is_same()) { std::memcpy(pout, tmp_f32, 16 * sizeof(float)); @@ -348,28 +347,6 @@ void dequantizeBlockwise4bitCpu( neon_f32_to_fp16x4(vld1q_f32(tmp_f32 + 12), pout + 12); } } - // Scalar remainder for dim_1 not divisible by 16, and last nibble when dim_1 is odd - for (; k < input_dim_1; ++k) { - long long out_base = block_idx * dim_1 + k * 2; - long long scale_idx = k * 2 / blocksize; - float scale = absmax[block_idx * absmax_dim_1 + scale_idx]; - unsigned char byte = A[block_idx * input_dim_1 + k]; - float v0 = lut[byte >> 4] * scale; - float v1 = lut[byte & 0x0F] * scale; - if constexpr (std::is_same()) { - out[out_base] = v0; - if (k * 2 + 1 < dim_1) - out[out_base + 1] = v1; - } else if constexpr (std::is_same()) { - out[out_base] = float_to_bf16(v0); - if (k * 2 + 1 < dim_1) - out[out_base + 1] = float_to_bf16(v1); - } else { - out[out_base] = float_to_fp16(v0); - if (k * 2 + 1 < dim_1) - out[out_base + 1] = float_to_fp16(v1); - } - } } return; } From 8cc47f0383317130563d39756209ed34ab03e2ed Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Wed, 3 Jun 2026 21:04:41 -0400 Subject: [PATCH 16/25] cpu: update tests --- tests/test_functional.py | 4 ---- tests/test_linear4bit.py | 3 --- tests/test_ops.py | 10 ++-------- 3 files changed, 2 insertions(+), 15 deletions(-) diff --git a/tests/test_functional.py b/tests/test_functional.py index 95d8727f7..895c0b436 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -275,10 +275,6 @@ def test_few_bit_quant(self, device, bits, method): @pytest.mark.parametrize("device", get_available_devices()) def test_fp8_quant(self, device): - # TODO - if device == "cpu": - pytest.skip("CPU implementation segfaults") - for e_bits in range(1, 7): p_bits = 7 - e_bits code = F.create_fp8_map(True, e_bits, p_bits).to(device) diff --git a/tests/test_linear4bit.py b/tests/test_linear4bit.py index 12ed0eb27..79ede45b2 100644 --- a/tests/test_linear4bit.py +++ b/tests/test_linear4bit.py @@ -221,9 +221,6 @@ def test_params4bit_torch_chunk_split(device, quant_type): if device == "hpu" and not is_supported_on_hpu(quant_type, torch.float16, torch.uint8): pytest.skip("This configuration is not supported on HPU.") - if device == "cpu": - pytest.skip("CPU quantization causes segfault, skipping CPU test") - original_tensor = torch.randn(8, 4, dtype=torch.float16, device="cpu") params4bit = bnb.nn.Params4bit(data=original_tensor, quant_type=quant_type, requires_grad=False) diff --git a/tests/test_ops.py b/tests/test_ops.py index bd5217748..6cae6706c 100644 --- a/tests/test_ops.py +++ b/tests/test_ops.py @@ -98,12 +98,8 @@ class TestInt8BlockwiseQuantOps: @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype")) @pytest.mark.parametrize("blocksize", [64, 128, 256, 512]) def test_quantize_blockwise(self, device, dtype, blocksize): - if device == "cpu": - if dtype != torch.float32: - pytest.skip("CPU implementation is only available for float32") - - if blocksize != 256: - pytest.skip("CPU implementation is slow; only test blocksize=256") + if device == "cpu" and blocksize != 256: + pytest.skip("CPU implementation is slow; only test blocksize=256") code = bitsandbytes.functional.create_dynamic_map().to(device) A = torch.randn(1024, 1024, dtype=dtype, device=device) @@ -122,8 +118,6 @@ def test_quantize_blockwise(self, device, dtype, blocksize): @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype")) @pytest.mark.parametrize("blocksize", [64, 128, 256, 512]) def test_dequantize_blockwise(self, device, dtype, blocksize): - if device == "cpu" and dtype != torch.float32: - pytest.skip("CPU implementation is only available for float32") A = torch.randint(0, 255, (1024, 1024), dtype=torch.uint8, device=device) code = bitsandbytes.functional.create_dynamic_map().to(device, dtype=torch.float32) From 365c6d829fedcdcb920361df6ff007c388214ad6 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Wed, 3 Jun 2026 22:22:44 -0400 Subject: [PATCH 17/25] x64 avx512 improvements, test improvements --- csrc/cpu_ops.cpp | 22 ++++++++-------------- tests/test_autograd.py | 10 +++------- tests/test_functional.py | 33 +++++++++++++++++++++++++++++++-- 3 files changed, 42 insertions(+), 23 deletions(-) diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp index cbbb79a30..e5938a677 100644 --- a/csrc/cpu_ops.cpp +++ b/csrc/cpu_ops.cpp @@ -366,22 +366,16 @@ void dequantizeBlockwise4bitCpu( BNB_OMP_PARALLEL_FOR for (int block_idx = 0; block_idx < dim_0; ++block_idx) { for (int k = 0; k < input_dim_1; k += k_step) { - // Load 64 bits of nf4 data and a single scale data - uint8_t* p = &A[block_idx * input_dim_1 + k]; - uint64_t packed; - std::memcpy(&packed, p, sizeof(uint64_t)); + const uint8_t* p = &A[block_idx * input_dim_1 + k]; auto scale_idx = k * 2 / blocksize; auto vscales = _mm512_set1_ps((float)absmax[block_idx * absmax_dim_1 + scale_idx]); - // unpack nf4 data to 32-bit integers - uint64_t high = 0; - uint64_t low = 0; - for (int i = 0; i < 4; ++i) { - low |= ((packed >> (2 * i * 4)) & 0xf) << ((2 * i + 1) * 8); - low |= ((packed >> ((2 * i + 1) * 4)) & 0xf) << (2 * i * 8); - high |= ((packed >> (2 * i * 4 + 32)) & 0xf) << ((2 * i + 1) * 8); - high |= ((packed >> ((2 * i + 1) * 4 + 32)) & 0xf) << (2 * i * 8); - } - __m128i packed_128 = _mm_set_epi64x(high, low); + // Unpack 8 packed bytes into 16 nibble indices using SSE. + // Each byte holds two 4-bit values; high nibble is the first output element. + __m128i raw = _mm_loadl_epi64(reinterpret_cast(p)); + __m128i mask4 = _mm_set1_epi8(0x0f); + __m128i hi = _mm_and_si128(_mm_srli_epi16(raw, 4), mask4); + __m128i lo = _mm_and_si128(raw, mask4); + __m128i packed_128 = _mm_unpacklo_epi8(hi, lo); __m512i vint32 = _mm512_cvtepu8_epi32(packed_128); // Table look-up __m512 vout = _mm512_permutexvar_ps(vint32, lut); diff --git a/tests/test_autograd.py b/tests/test_autograd.py index d150f4735..7d273c853 100644 --- a/tests/test_autograd.py +++ b/tests/test_autograd.py @@ -134,7 +134,7 @@ def test_matmullt( @pytest.mark.parametrize("dim2", [64, 0], ids=id_formatter("dim2")) @pytest.mark.parametrize("dim3", [64], ids=id_formatter("dim3")) @pytest.mark.parametrize("dim4", [96], ids=id_formatter("dim4")) -@pytest.mark.parametrize("req_grad", BOOLEAN_TRIPLES, ids=id_formatter("req_grad")) +@pytest.mark.parametrize("req_grad", REQ_GRAD_NO_B_WEIGHT, ids=id_formatter("req_grad")) @pytest.mark.parametrize("transpose_B", TRUE_FALSE, ids=id_formatter("transpose_B")) @pytest.mark.parametrize("has_bias", TRUE_FALSE, ids=id_formatter("has_bias")) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @@ -169,8 +169,8 @@ def test_matmul_4bit( for i in range(3): A = torch.randn(size=dimA, device=device, requires_grad=req_grad[0], dtype=dtype) - B = torch.randn(size=dimB, device=device, requires_grad=req_grad[1], dtype=dtype) - target = torch.randn(size=(dim2, dim4), device=device, requires_grad=req_grad[1], dtype=dtype) + B = torch.randn(size=dimB, device=device, dtype=dtype) + target = torch.randn(size=(dim2, dim4), device=device, dtype=dtype) bias = None bias2 = None if has_bias: @@ -212,9 +212,7 @@ def test_matmul_4bit( loss_bnb = torch.nn.functional.mse_loss(out_bnb, target).mean() loss_bnb.backward() gradA1 = A.grad - gradB1 = B.grad A.grad = None - B.grad = None if has_bias: gradBias1 = bias.grad bias.grad = None @@ -222,9 +220,7 @@ def test_matmul_4bit( loss_torch = torch.nn.functional.mse_loss(out_torch, target).mean() loss_torch.backward() gradA2 = A.grad - gradB2 = B.grad A.grad = None - B.grad = None if has_bias: gradBias2 = bias.grad bias.grad = None diff --git a/tests/test_functional.py b/tests/test_functional.py index 95d8727f7..099024a58 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -896,6 +896,30 @@ def test_gemv_4bit(self, device, dim, dtype, storage_type, double_quant, kind): dim_key = "le512" if dim <= 512 else "gt512" thresholds = gemv_thresholds[dtype][dim_key] + + # On CPU with AVX512BF16, fp16/fp32 inputs are downcast to bf16 for the fused + # kernel for performance. Thresholds calibrated from 100 iterations on CPU. + cpu_bf16_cast = device == "cpu" and F.has_avx512bf16() and dtype in (torch.float16, torch.float32) + if cpu_bf16_cast: + thresholds = { + "le512": { + "err1": (2.72e-4, 9.96e-5), + "relerr1": ( + 1.88e-3 if dtype == torch.float16 else 1.64e-3, + 1.27e-2 if dtype == torch.float16 else 3.61e-3, + ), + "maxerr1": (1.22e-3, 3.80e-4), + }, + "gt512": { + "err1": (1.00e-4, 3.48e-5), + "relerr1": ( + 6.92e-4 if dtype == torch.float16 else 6.31e-4, + 9.21e-4 if dtype == torch.float16 else 4.71e-4, + ), + "maxerr1": (5.16e-4, 1.68e-4), + }, + }[dim_key] + for metric_name, metric_val in [("err1", err1), ("relerr1", relerr1), ("maxerr1", maxerr1)]: mean_val, std_val = thresholds[metric_name] limit = mean_val + N_SIGMA * std_val @@ -906,11 +930,12 @@ def test_gemv_4bit(self, device, dim, dtype, storage_type, double_quant, kind): # Ratios check that gemv_4bit and matmul_4bit produce consistent results. # These are tight bounds on internal consistency, not absolute accuracy. - if dtype == torch.float16: + # On CPU with AVX512BF16, fp16/fp32 use bf16 arithmetic so get bf16-level bounds. + if dtype == torch.float16 and not cpu_bf16_cast: assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 - elif dtype == torch.float32: + elif dtype == torch.float32 and not cpu_bf16_cast: assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.995 assert maxratio < 1.005 and maxratio > 0.995 @@ -918,6 +943,10 @@ def test_gemv_4bit(self, device, dim, dtype, storage_type, double_quant, kind): assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.05 and relratio > 0.96 assert maxratio < 1.05 and maxratio > 0.97 + elif cpu_bf16_cast: + assert absratio < 1.02 and absratio > 0.98 + assert relratio < 1.1 and relratio > 0.90 + assert maxratio < 1.1 and maxratio > 0.90 @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("storage_type", ["nf4", "fp4"], ids=["nf4", "fp4"]) From d8550100f8672397b7198be2b16d4159c8fc87db Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Fri, 5 Jun 2026 15:47:38 -0400 Subject: [PATCH 18/25] update build flags --- CMakeLists.txt | 58 +++++++++++++++++++++++++++++++------------------- 1 file changed, 36 insertions(+), 22 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9714f9946..dd984a98e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -355,36 +355,50 @@ set_source_files_properties(${CPP_FILES} PROPERTIES LANGUAGE CXX) add_library(bitsandbytes SHARED ${SRC_FILES}) target_compile_features(bitsandbytes PUBLIC cxx_std_17) target_include_directories(bitsandbytes PUBLIC csrc) +set_target_properties(bitsandbytes PROPERTIES VISIBILITY_INLINES_HIDDEN ON) if (BUILD_CPU) + include(CheckIPOSupported) + check_ipo_supported(RESULT ipo_supported OUTPUT ipo_output) + if (ipo_supported) + set_property(TARGET bitsandbytes PROPERTY INTERPROCEDURAL_OPTIMIZATION TRUE) + endif() + if (OpenMP_CXX_FOUND) target_link_libraries(bitsandbytes PRIVATE OpenMP::OpenMP_CXX) add_definitions(-DHAS_OPENMP) endif() - if ((HOST_ARCH MATCHES "x86_64|amd64") AND (NOT MSVC)) - include(CheckCXXCompilerFlag) - check_cxx_compiler_flag(-mavx512f HAS_AVX512F_FLAG) - check_cxx_compiler_flag(-mavx512bf16 HAS_AVX512BF16_FLAG) - if (HAS_AVX512F_FLAG) - target_compile_options(bitsandbytes PRIVATE -mavx512f) - target_compile_options(bitsandbytes PRIVATE -mavx512dq) - target_compile_options(bitsandbytes PRIVATE -mavx512bw) - target_compile_options(bitsandbytes PRIVATE -mavx512vl) - endif() - if (HAS_AVX512BF16_FLAG) - target_compile_options(bitsandbytes PRIVATE -mavx512bf16) + if (NOT MSVC) + target_compile_options(bitsandbytes PRIVATE -fno-semantic-interposition) + + if (HOST_ARCH MATCHES "x86_64|amd64") + include(CheckCXXCompilerFlag) + check_cxx_compiler_flag(-mavx512f HAS_AVX512F_FLAG) + check_cxx_compiler_flag(-mavx512bf16 HAS_AVX512BF16_FLAG) + if (HAS_AVX512F_FLAG) + target_compile_options( + bitsandbytes PRIVATE + -mavx512f + -mavx512dw + -mavx512bw + -mavx512vl + ) + endif() + if (HAS_AVX512BF16_FLAG) + target_compile_options(bitsandbytes PRIVATE -mavx512bf16) + endif() + target_compile_options( + bitsandbytes PRIVATE + -mprefer-vector-width=256 + -mfma + -mavx2 + -mf16c + -mlzcnt + -mbmi + -mbmi2 + ) endif() - target_compile_options( - bitsandbytes PRIVATE - -mprefer-vector-width=256 - -mfma - -mavx2 - -mf16c - -mlzcnt - -mbmi - -mbmi2 - ) endif() endif() From b8b328cba72ba02a8c94e5c287bfe80079010170 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Fri, 5 Jun 2026 15:51:13 -0400 Subject: [PATCH 19/25] update build flags --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index dd984a98e..ede840b7c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -380,8 +380,8 @@ if (BUILD_CPU) target_compile_options( bitsandbytes PRIVATE -mavx512f - -mavx512dw -mavx512bw + -mavx512dq -mavx512vl ) endif() From 2ebf82158b5a71109baf042f0e57a670186af8c1 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Fri, 5 Jun 2026 16:13:22 -0400 Subject: [PATCH 20/25] fix windows --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ede840b7c..326f7d46f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -360,7 +360,7 @@ set_target_properties(bitsandbytes PROPERTIES VISIBILITY_INLINES_HIDDEN ON) if (BUILD_CPU) include(CheckIPOSupported) check_ipo_supported(RESULT ipo_supported OUTPUT ipo_output) - if (ipo_supported) + if (ipo_supported AND NOT MSVC) set_property(TARGET bitsandbytes PROPERTY INTERPROCEDURAL_OPTIMIZATION TRUE) endif() From 7dad0e6c4d8ae3e87b117f5d3d3a7327c07f649f Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Mon, 8 Jun 2026 16:00:53 -0400 Subject: [PATCH 21/25] Update build flag --- CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 326f7d46f..3d420edb1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -370,7 +370,9 @@ if (BUILD_CPU) endif() if (NOT MSVC) - target_compile_options(bitsandbytes PRIVATE -fno-semantic-interposition) + if (CMAKE_SYSTEM_NAME STREQUAL "Linux") + target_compile_options(bitsandbytes PRIVATE -fno-semantic-interposition) + endif() if (HOST_ARCH MATCHES "x86_64|amd64") include(CheckCXXCompilerFlag) From 77ae5facae153b3c7e5219a23fdd9e8bf69bdf94 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Mon, 8 Jun 2026 16:53:54 -0400 Subject: [PATCH 22/25] Update omp simd hints --- csrc/cpu_ops.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp index e5938a677..327fca531 100644 --- a/csrc/cpu_ops.cpp +++ b/csrc/cpu_ops.cpp @@ -473,6 +473,7 @@ void dequantizeBlockwise8bitCpu( #ifdef _MSC_VER #pragma loop(ivdep) #endif +#pragma omp simd for (long long i = block_idx; i < block_end; ++i) { float v = code[A[i]] * scale; if constexpr (std::is_same::value) { @@ -595,6 +596,7 @@ void quantize_cpu_impl(float* code, const T* A, float* absmax, unsigned char* ou #if defined(_M_ARM64) || defined(__aarch64__) absmax_block = neon_absmax(A + block_start, block_len); #else +#pragma omp simd reduction(max:absmax_block) for (long long i = block_start; i < block_end; ++i) { float val; if constexpr (std::is_same::value) From aebfb02ddd70ed2bdfa8252b58c349edd501f538 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Mon, 8 Jun 2026 17:02:49 -0400 Subject: [PATCH 23/25] fix msvc --- csrc/cpu_ops.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp index 327fca531..91cd01168 100644 --- a/csrc/cpu_ops.cpp +++ b/csrc/cpu_ops.cpp @@ -470,9 +470,6 @@ void dequantizeBlockwise8bitCpu( } } #else -#ifdef _MSC_VER -#pragma loop(ivdep) -#endif #pragma omp simd for (long long i = block_idx; i < block_end; ++i) { float v = code[A[i]] * scale; From efc6f4bf632b088a8c74d515529b1454bf33d304 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Tue, 9 Jun 2026 14:28:03 -0400 Subject: [PATCH 24/25] fix lint --- csrc/cpu_ops.cpp | 2 +- tests/test_ops.py | 1 - tests/test_optim.py | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp index 91cd01168..dfb9046ac 100644 --- a/csrc/cpu_ops.cpp +++ b/csrc/cpu_ops.cpp @@ -593,7 +593,7 @@ void quantize_cpu_impl(float* code, const T* A, float* absmax, unsigned char* ou #if defined(_M_ARM64) || defined(__aarch64__) absmax_block = neon_absmax(A + block_start, block_len); #else -#pragma omp simd reduction(max:absmax_block) +#pragma omp simd reduction(max : absmax_block) for (long long i = block_start; i < block_end; ++i) { float val; if constexpr (std::is_same::value) diff --git a/tests/test_ops.py b/tests/test_ops.py index 6cae6706c..3550c0b6f 100644 --- a/tests/test_ops.py +++ b/tests/test_ops.py @@ -118,7 +118,6 @@ def test_quantize_blockwise(self, device, dtype, blocksize): @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype")) @pytest.mark.parametrize("blocksize", [64, 128, 256, 512]) def test_dequantize_blockwise(self, device, dtype, blocksize): - A = torch.randint(0, 255, (1024, 1024), dtype=torch.uint8, device=device) code = bitsandbytes.functional.create_dynamic_map().to(device, dtype=torch.float32) diff --git a/tests/test_optim.py b/tests/test_optim.py index d96bdc169..0a4b3d6af 100644 --- a/tests/test_optim.py +++ b/tests/test_optim.py @@ -24,7 +24,6 @@ def assert_most_approx_close(a, b, rtol=1e-3, atol=1e-3, max_error_count=0): torch.testing.assert_close(a, b, rtol=rtol, atol=atol) - str2optimizers = {} ## TODO: maybe remove these three. From fbbf23e1e80052a941570c1180249e87fd6ed80e Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Tue, 9 Jun 2026 14:32:07 -0400 Subject: [PATCH 25/25] update build script --- .github/scripts/build-cpu.sh | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/.github/scripts/build-cpu.sh b/.github/scripts/build-cpu.sh index f75325505..5db76ecce 100644 --- a/.github/scripts/build-cpu.sh +++ b/.github/scripts/build-cpu.sh @@ -10,19 +10,10 @@ else pip install cmake==3.28.3 fi -# Temporary: vectorization reporting -if [[ "${build_os}" == windows* ]]; then - EXTRA_CXX_FLAGS="/Qvec-report:2 /Qpar-report:1" -elif [[ "${build_os:0:5}" == macos ]]; then - EXTRA_CXX_FLAGS="-Rpass=loop-vectorize -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize" -else - EXTRA_CXX_FLAGS="-fopt-info-vec-missed -fopt-info-vec -fopt-info-loop-optimized" -fi - if [ "${build_os:0:5}" == macos ] && [ "${build_arch}" == aarch64 ]; then - cmake -DCMAKE_OSX_ARCHITECTURES=arm64 -DCOMPUTE_BACKEND=cpu -DCMAKE_CXX_FLAGS="${EXTRA_CXX_FLAGS}" . + cmake -DCMAKE_OSX_ARCHITECTURES=arm64 -DCOMPUTE_BACKEND=cpu . else - cmake -DCOMPUTE_BACKEND=cpu -DCMAKE_CXX_FLAGS="${EXTRA_CXX_FLAGS}" . + cmake -DCOMPUTE_BACKEND=cpu . fi cmake --build . --config Release