From 2cd85621e66c90a2e7b03d4ea2c2a52ec9a66ba6 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Mon, 1 Jun 2026 19:12:02 -0400
Subject: [PATCH 01/25] perf: improve int8 on arm64 CPU

---
 bitsandbytes/backends/.DS_Store  | Bin 0 -> 6148 bytes
 bitsandbytes/backends/cpu/ops.py |   8 ++++++--
 2 files changed, 6 insertions(+), 2 deletions(-)
 create mode 100644 bitsandbytes/backends/.DS_Store

diff --git a/bitsandbytes/backends/.DS_Store b/bitsandbytes/backends/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..c2102dc454e1d95e549a115218b72225758e63a7
GIT binary patch
literal 6148
zcmeHK%SyvQ6rE|SO({Ya3SADkE!g^OaT8+w0VBFlsfj5XjG59jW>E@R>ks)QevkLg
zOu%B+BKBUGIrlk}IgoiU#<-h>ea39Ym<0`yqf#O0t_;;oG9t$@(rFOO0IZK-YGQvK
z@Z0O`flXN!<lnzPj?#Ir_sMJZ#>QsTYFcgU&U=!%H}~gR>iV-Av@WHLgHrc{t7wvs
z?d>y}%>5{t%v3=XP9Wv>I!Z#ByE03{RMmPqV708)*zPPANBu!p92^gqU9lMM_qyWn
zWUySetexGx(~I$Q{F2BwO(zGom24R-;T;sSnpba@#4>pVPnlh25fTH$05L!etTzMZ
zT(H{fO#`i-7$62J7{L8OKtps478=#o0UchSG2TE#0Uh5Gh{B*_u+Rt*5UxuBbtyMb
z46e(;FHD|eu+XT>8CNsIIA-SZ@xs;Y;1?>LaYrNd!~ij{%0OM4HlF|I@XJ&_@>fg9
zA_j<of5rfBjJ%Nxi!x{HkLBT6E1=y&L&3ZP6%f!@E&*WRKGIP^9T%uWo@21kh@+ri
Rl>^d6KoLS6G4Klvd;w;TNs|Bo

literal 0
HcmV?d00001

diff --git a/bitsandbytes/backends/cpu/ops.py b/bitsandbytes/backends/cpu/ops.py
index ed6803eda..74f3cfe30 100755
--- a/bitsandbytes/backends/cpu/ops.py
+++ b/bitsandbytes/backends/cpu/ops.py
@@ -3,6 +3,7 @@
 import logging
 import math
 from math import prod
+import platform
 from typing import Optional
 
 import torch
@@ -20,8 +21,11 @@
 # However, we can overflow if we use this without AVX512_VNNI support.
 # This is fixed in torch 2.6+, so we set this as the minimum to be safe.
 # For more information: https://github.com/pytorch/pytorch/pull/136942
-# TODO(matthewdouglas): aarch64?
-if torch.__version__ >= (2, 6):
+#
+# On aarch64, torch._int_mm uses a scalar fallback that is much slower
+# than fp32 matmul. Skip it and let the default backend handle this.
+_is_arm64 = platform.machine().lower() in ("arm64", "aarch64")
+if torch.__version__ >= (2, 6) and not _is_arm64:
 
     @register_kernel("bitsandbytes::int8_linear_matmul", "cpu")
     def _(A: torch.Tensor, B: torch.Tensor):

From 62427bf7cf4c636484b4a7d31c20bac2c729dbd3 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Mon, 1 Jun 2026 19:12:24 -0400
Subject: [PATCH 02/25] build: temporary add CPU build verbosity

---
 .github/scripts/build-cpu.sh | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/.github/scripts/build-cpu.sh b/.github/scripts/build-cpu.sh
index 5daeb5ea5..24c731063 100644
--- a/.github/scripts/build-cpu.sh
+++ b/.github/scripts/build-cpu.sh
@@ -6,10 +6,17 @@ set -xeuo pipefail
 
 pip install cmake==3.28.3
 
+# Temporary: vectorization reporting
+if [[ "${build_os}" == windows* ]]; then
+    EXTRA_CXX_FLAGS="/Qvec-report:2 /Qpar-report:1"
+else
+    EXTRA_CXX_FLAGS="-fopt-info-vec-missed -fopt-info-vec -fopt-info-loop-optimized"
+fi
+
 if [ "${build_os:0:5}" == macos ] && [ "${build_arch}" == aarch64 ]; then
-	cmake -DCMAKE_OSX_ARCHITECTURES=arm64 -DCOMPUTE_BACKEND=cpu .
+	cmake -DCMAKE_OSX_ARCHITECTURES=arm64 -DCOMPUTE_BACKEND=cpu -DCMAKE_CXX_FLAGS="${EXTRA_CXX_FLAGS}" .
 else
-	cmake -DCOMPUTE_BACKEND=cpu .
+	cmake -DCOMPUTE_BACKEND=cpu -DCMAKE_CXX_FLAGS="${EXTRA_CXX_FLAGS}" .
 fi
 cmake --build . --config Release
 

From f7f1cdbe88f03d6db56c55c7b6460a9d212f9017 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Mon, 1 Jun 2026 19:15:15 -0400
Subject: [PATCH 03/25] fix

---
 .github/scripts/build-cpu.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/scripts/build-cpu.sh b/.github/scripts/build-cpu.sh
index 24c731063..520e22eab 100644
--- a/.github/scripts/build-cpu.sh
+++ b/.github/scripts/build-cpu.sh
@@ -9,6 +9,8 @@ pip install cmake==3.28.3
 # Temporary: vectorization reporting
 if [[ "${build_os}" == windows* ]]; then
     EXTRA_CXX_FLAGS="/Qvec-report:2 /Qpar-report:1"
+elif [[ "${build_os:0:5}" == macos ]]; then
+    EXTRA_CXX_FLAGS="-Rpass=loop-vectorize -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize"
 else
     EXTRA_CXX_FLAGS="-fopt-info-vec-missed -fopt-info-vec -fopt-info-loop-optimized"
 fi

From ec1e76d75f2e1a15b54e1fc812a83d3143025106 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Mon, 1 Jun 2026 21:05:40 -0400
Subject: [PATCH 04/25] cpu: skip _int_mm when not on avx512.

---
 bitsandbytes/backends/.DS_Store  | Bin 6148 -> 0 bytes
 bitsandbytes/backends/cpu/ops.py |   8 +++-----
 2 files changed, 3 insertions(+), 5 deletions(-)
 delete mode 100644 bitsandbytes/backends/.DS_Store

diff --git a/bitsandbytes/backends/.DS_Store b/bitsandbytes/backends/.DS_Store
deleted file mode 100644
index c2102dc454e1d95e549a115218b72225758e63a7..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 6148
zcmeHK%SyvQ6rE|SO({Ya3SADkE!g^OaT8+w0VBFlsfj5XjG59jW>E@R>ks)QevkLg
zOu%B+BKBUGIrlk}IgoiU#<-h>ea39Ym<0`yqf#O0t_;;oG9t$@(rFOO0IZK-YGQvK
z@Z0O`flXN!<lnzPj?#Ir_sMJZ#>QsTYFcgU&U=!%H}~gR>iV-Av@WHLgHrc{t7wvs
z?d>y}%>5{t%v3=XP9Wv>I!Z#ByE03{RMmPqV708)*zPPANBu!p92^gqU9lMM_qyWn
zWUySetexGx(~I$Q{F2BwO(zGom24R-;T;sSnpba@#4>pVPnlh25fTH$05L!etTzMZ
zT(H{fO#`i-7$62J7{L8OKtps478=#o0UchSG2TE#0Uh5Gh{B*_u+Rt*5UxuBbtyMb
z46e(;FHD|eu+XT>8CNsIIA-SZ@xs;Y;1?>LaYrNd!~ij{%0OM4HlF|I@XJ&_@>fg9
zA_j<of5rfBjJ%Nxi!x{HkLBT6E1=y&L&3ZP6%f!@E&*WRKGIP^9T%uWo@21kh@+ri
Rl>^d6KoLS6G4Klvd;w;TNs|Bo

diff --git a/bitsandbytes/backends/cpu/ops.py b/bitsandbytes/backends/cpu/ops.py
index 74f3cfe30..597511c4b 100755
--- a/bitsandbytes/backends/cpu/ops.py
+++ b/bitsandbytes/backends/cpu/ops.py
@@ -3,7 +3,6 @@
 import logging
 import math
 from math import prod
-import platform
 from typing import Optional
 
 import torch
@@ -22,10 +21,9 @@
 # This is fixed in torch 2.6+, so we set this as the minimum to be safe.
 # For more information: https://github.com/pytorch/pytorch/pull/136942
 #
-# On aarch64, torch._int_mm uses a scalar fallback that is much slower
-# than fp32 matmul. Skip it and let the default backend handle this.
-_is_arm64 = platform.machine().lower() in ("arm64", "aarch64")
-if torch.__version__ >= (2, 6) and not _is_arm64:
+# Without AVX-512 (including aarch64), torch._int_mm uses a scalar fallback
+# that is much slower than fp32 matmul. Only use it when AVX-512 is available.
+if torch.__version__ >= (2, 6) and _has_avx512:
 
     @register_kernel("bitsandbytes::int8_linear_matmul", "cpu")
     def _(A: torch.Tensor, B: torch.Tensor):

From 45831bd0052c6f6d5fdfc3f401d180e19d34a66f Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Mon, 1 Jun 2026 22:00:52 -0400
Subject: [PATCH 05/25] MSVC optimization for CPU ops

---
 csrc/cpu_ops.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp
index 2a8912674..a5fb7b158 100644
--- a/csrc/cpu_ops.cpp
+++ b/csrc/cpu_ops.cpp
@@ -263,7 +263,7 @@ static inline __m512 set_fp4_lut() {
 // DATA_TYPE: 1 = FP4, 2 = NF4
 template <typename T, int DATA_TYPE>
 void dequantizeBlockwise4bitCpu(
-    unsigned char* A, const float* absmax, T* out, long long blocksize, long long m, long long n
+    unsigned char* __restrict A, const float* __restrict absmax, T* __restrict out, long long blocksize, long long m, long long n
 ) {
     static_assert(DATA_TYPE == 1 || DATA_TYPE == 2, "dequantizeBlockwise4bitCpu called with non 4-bit DATA_TYPE");
     if (blocksize <= 0 || m < 0 || n <= 0)
@@ -408,7 +408,7 @@ void dequantizeBlockwise4bitCpu(
 
 template <typename T>
 void dequantizeBlockwise8bitCpu(
-    float* code, unsigned char* A, const float* absmax, T* out, long long blocksize, long long n
+    float* __restrict code, unsigned char* __restrict A, const float* __restrict absmax, T* __restrict out, long long blocksize, long long n
 ) {
     if (blocksize <= 0 || n <= 0)
         return;
@@ -518,7 +518,7 @@ static inline uint16_t norm_to_lut_index(float val) {
 }
 
 template <typename T>
-void quantize_cpu_impl(float* code, const T* A, float* absmax, unsigned char* out, long long blocksize, long long n) {
+void quantize_cpu_impl(float* __restrict code, const T* __restrict A, float* __restrict absmax, unsigned char* __restrict out, long long blocksize, long long n) {
     if (blocksize <= 0 || n <= 0)
         return;
 

From 4e2b6116356ee39e4602cc35b5d2fa5009d26a5e Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Mon, 1 Jun 2026 22:37:56 -0400
Subject: [PATCH 06/25] msvc improvement

---
 csrc/cpu_ops.cpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp
index a5fb7b158..272858838 100644
--- a/csrc/cpu_ops.cpp
+++ b/csrc/cpu_ops.cpp
@@ -263,7 +263,7 @@ static inline __m512 set_fp4_lut() {
 // DATA_TYPE: 1 = FP4, 2 = NF4
 template <typename T, int DATA_TYPE>
 void dequantizeBlockwise4bitCpu(
-    unsigned char* __restrict A, const float* __restrict absmax, T* __restrict out, long long blocksize, long long m, long long n
+    unsigned char* A, const float* absmax, T* out, long long blocksize, long long m, long long n
 ) {
     static_assert(DATA_TYPE == 1 || DATA_TYPE == 2, "dequantizeBlockwise4bitCpu called with non 4-bit DATA_TYPE");
     if (blocksize <= 0 || m < 0 || n <= 0)
@@ -408,7 +408,7 @@ void dequantizeBlockwise4bitCpu(
 
 template <typename T>
 void dequantizeBlockwise8bitCpu(
-    float* __restrict code, unsigned char* __restrict A, const float* __restrict absmax, T* __restrict out, long long blocksize, long long n
+    float* code, unsigned char* A, const float* absmax, T* out, long long blocksize, long long n
 ) {
     if (blocksize <= 0 || n <= 0)
         return;
@@ -418,6 +418,9 @@ void dequantizeBlockwise8bitCpu(
         long long valid_items = (n - block_idx >= blocksize ? blocksize : n - block_idx);
         long long block_end = block_idx + valid_items;
         float scale = absmax[block_idx / blocksize];
+#ifdef _MSC_VER
+#pragma loop(ivdep)
+#endif
         for (long long i = block_idx; i < block_end; ++i) {
             float v = code[A[i]] * scale;
             if constexpr (std::is_same<T, bf16_t>::value) {
@@ -518,7 +521,7 @@ static inline uint16_t norm_to_lut_index(float val) {
 }
 
 template <typename T>
-void quantize_cpu_impl(float* __restrict code, const T* __restrict A, float* __restrict absmax, unsigned char* __restrict out, long long blocksize, long long n) {
+void quantize_cpu_impl(float* code, const T* A, float* absmax, unsigned char* out, long long blocksize, long long n) {
     if (blocksize <= 0 || n <= 0)
         return;
 

From 6755f0387838ad49718c2d3ec279eabf26462bcb Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Tue, 2 Jun 2026 14:40:02 -0400
Subject: [PATCH 07/25] cpu: enable openmp:experimental on windows; add back
 avx2/fma for linux-x64

---
 .github/scripts/build-cpu.sh | 6 +++++-
 CMakeLists.txt               | 5 +++++
 csrc/cpu_ops.cpp             | 2 +-
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/.github/scripts/build-cpu.sh b/.github/scripts/build-cpu.sh
index 520e22eab..f75325505 100644
--- a/.github/scripts/build-cpu.sh
+++ b/.github/scripts/build-cpu.sh
@@ -4,7 +4,11 @@ declare build_os
 
 set -xeuo pipefail
 
-pip install cmake==3.28.3
+if [[ "${build_os}" == windows* ]]; then
+    pip install cmake==3.30.9
+else
+    pip install cmake==3.28.3
+fi
 
 # Temporary: vectorization reporting
 if [[ "${build_os}" == windows* ]]; then
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a787866f6..19e8ae0b1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -118,6 +118,11 @@ if (BUILD_CPU)
     set(CMAKE_CXX_STANDARD 17)
     set(CMAKE_CXX_STANDARD_REQUIRED ON)
     string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" HOST_ARCH)
+    if(MSVC)
+        # Use the experimental OpenMP runtime for persistent thread pool support.
+        # Requires CMake 3.30+; silently ignored on older CMake versions.
+        set(OpenMP_RUNTIME_MSVC "experimental")
+    endif()
     find_package(OpenMP)
 endif()
 
diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp
index 272858838..f6f2f59b4 100644
--- a/csrc/cpu_ops.cpp
+++ b/csrc/cpu_ops.cpp
@@ -439,7 +439,7 @@ void dequantizeBlockwise8bitCpu(
 // which would SIGILL on non-AVX512 CPUs like Zen3. These functions are scalar C++ and don't need AVX512.
 #if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
 #pragma GCC push_options
-#pragma GCC target("no-avx512f")
+#pragma GCC target("avx2,fma,no-avx512f")
 #endif
 
 // Precomputed direct lookup table: maps quantized uint16 index [0..65535] to codebook index.

From 44e6da6448e5d0522183c81f90f334bffc75845d Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Tue, 2 Jun 2026 15:59:17 -0400
Subject: [PATCH 08/25] improve optim test perf

---
 tests/test_optim.py | 44 +++++++++++++++++---------------------------
 1 file changed, 17 insertions(+), 27 deletions(-)

diff --git a/tests/test_optim.py b/tests/test_optim.py
index dbfb9d469..d96bdc169 100644
--- a/tests/test_optim.py
+++ b/tests/test_optim.py
@@ -1,9 +1,6 @@
-import os
-from os.path import join
-import shutil
+import io
 import sys
 import time
-import uuid
 
 from lion_pytorch import Lion
 import pytest
@@ -27,15 +24,6 @@ def assert_most_approx_close(a, b, rtol=1e-3, atol=1e-3, max_error_count=0):
         torch.testing.assert_close(a, b, rtol=rtol, atol=atol)
 
 
-def get_temp_dir():
-    path = f"/tmp/autoswap/{uuid.uuid4()}"
-    os.makedirs(path, exist_ok=True)
-    return path
-
-
-def rm_path(path):
-    shutil.rmtree(path)
-
 
 str2optimizers = {}
 
@@ -223,13 +211,13 @@ def test_optimizer32bit(dim1, dim2, gtype, optim_name, device):
         assert_most_approx_close(p1, p2.float(), atol=atol, rtol=rtol, max_error_count=15)
 
         if i % (k // 5) == 0 and i > 0:
-            path = get_temp_dir()
-            torch.save(bnb_optimizer.state_dict(), join(path, "opt.pt"))
+            buf = io.BytesIO()
+            torch.save(bnb_optimizer.state_dict(), buf)
             del bnb_optimizer
             bnb_optimizer = None
             bnb_optimizer = str2optimizers[optim_name][1]([p2])
-            bnb_optimizer.load_state_dict(torch.load(join(path, "opt.pt")))
-            rm_path(path)
+            buf.seek(0)
+            bnb_optimizer.load_state_dict(torch.load(buf))
             # since Lion can have pretty noisy updates where things lie at the boundary
             # allow up to 10 errors for Lion
             assert_most_approx_close(p1, p2.float(), atol=atol, rtol=rtol, max_error_count=10)
@@ -441,13 +429,13 @@ def test_optimizer8bit(dim1, dim2, gtype, optim_name, device):
                 raws1cpy = bnb_optimizer.state[p2][name2].clone()
                 qmap1 = bnb_optimizer.state[p2][qmap].clone()
 
-                path = get_temp_dir()
-                torch.save(bnb_optimizer.state_dict(), join(path, "opt.pt"))
+                buf = io.BytesIO()
+                torch.save(bnb_optimizer.state_dict(), buf)
                 del bnb_optimizer
                 bnb_optimizer = None
                 bnb_optimizer = str2optimizers[optim_name][1]([p2])
-                bnb_optimizer.load_state_dict(torch.load(join(path, "opt.pt")))
-                rm_path(path)
+                buf.seek(0)
+                bnb_optimizer.load_state_dict(torch.load(buf))
                 torch.testing.assert_close(raws1cpy, bnb_optimizer.state[p2][name2])
                 torch.testing.assert_close(qmap1, bnb_optimizer.state[p2][qmap])
 
@@ -577,16 +565,18 @@ def test_ademamix_state_dict_no_nan(optim_name, optim_factory, device):
     # Save state
     model_sd = {k: v.clone() for k, v in model.state_dict().items()}
     opt_sd = opt.state_dict()
-    path = get_temp_dir()
-    torch.save(opt_sd, join(path, "opt.pt"))
-    torch.save(model_sd, join(path, "model.pt"))
+    opt_buf = io.BytesIO()
+    model_buf = io.BytesIO()
+    torch.save(opt_sd, opt_buf)
+    torch.save(model_sd, model_buf)
 
     # Create fresh model and optimizer, load state
     model2 = nn.Linear(256, 64).to(device)
-    model2.load_state_dict(torch.load(join(path, "model.pt")))
+    model_buf.seek(0)
+    model2.load_state_dict(torch.load(model_buf))
     opt2 = optim_factory(model2.parameters())
-    opt2.load_state_dict(torch.load(join(path, "opt.pt")))
-    rm_path(path)
+    opt_buf.seek(0)
+    opt2.load_state_dict(torch.load(opt_buf))
 
     # Verify loaded state matches original byte-for-byte
     orig_params = list(model.parameters())

From c5df7fade172992a9b02e21d56fe157298985877 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Tue, 2 Jun 2026 22:54:25 -0400
Subject: [PATCH 09/25] cpu perf: improvements for arm64 8bit blockwise
 quant/dequant (neon)

---
 csrc/cpu_ops.cpp | 152 +++++++++++++++++++++++++++++++----------------
 1 file changed, 102 insertions(+), 50 deletions(-)

diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp
index f6f2f59b4..9ea661615 100644
--- a/csrc/cpu_ops.cpp
+++ b/csrc/cpu_ops.cpp
@@ -173,27 +173,50 @@ static inline void neon_f32_to_fp16x4(const float32x4_t src, fp16_t* dst) {
     vst1_u16(reinterpret_cast<uint16_t*>(dst), vreinterpret_u16_f16(half));
 }
 
-// NEON-optimized absmax computation for a block of float32
-static inline float neon_absmax_f32(const float* data, long long n) {
+// NEON-optimized FP16 to float conversion (4 values at a time)
+static inline float32x4_t neon_fp16x4_to_f32(const fp16_t* src) {
+    uint16x4_t raw = vld1_u16(reinterpret_cast<const uint16_t*>(src));
+    return vcvt_f32_f16(vreinterpret_f16_u16(raw));
+}
+
+// NEON-optimized absmax computation for a block of float32, bf16, or fp16.
+template <typename T>
+static inline float neon_absmax(const T* data, long long n) {
     float32x4_t vmax = vdupq_n_f32(0.0f);
     long long i = 0;
-    // Process 16 elements per iteration for better throughput
     for (; i + 16 <= n; i += 16) {
-        float32x4_t v0 = vabsq_f32(vld1q_f32(data + i));
-        float32x4_t v1 = vabsq_f32(vld1q_f32(data + i + 4));
-        float32x4_t v2 = vabsq_f32(vld1q_f32(data + i + 8));
-        float32x4_t v3 = vabsq_f32(vld1q_f32(data + i + 12));
-        vmax = vmaxq_f32(vmax, vmaxq_f32(vmaxq_f32(v0, v1), vmaxq_f32(v2, v3)));
+        float32x4_t v0, v1, v2, v3;
+        if constexpr (std::is_same<T, float>::value) {
+            const float* p = reinterpret_cast<const float*>(data + i);
+            v0 = vld1q_f32(p);      v1 = vld1q_f32(p + 4);
+            v2 = vld1q_f32(p + 8);  v3 = vld1q_f32(p + 12);
+        } else if constexpr (std::is_same<T, bf16_t>::value) {
+            v0 = neon_bf16x4_to_f32(data + i);     v1 = neon_bf16x4_to_f32(data + i + 4);
+            v2 = neon_bf16x4_to_f32(data + i + 8); v3 = neon_bf16x4_to_f32(data + i + 12);
+        } else {
+            v0 = neon_fp16x4_to_f32(data + i);     v1 = neon_fp16x4_to_f32(data + i + 4);
+            v2 = neon_fp16x4_to_f32(data + i + 8); v3 = neon_fp16x4_to_f32(data + i + 12);
+        }
+        vmax = vmaxq_f32(vmax, vmaxq_f32(vmaxq_f32(vabsq_f32(v0), vabsq_f32(v1)),
+                                         vmaxq_f32(vabsq_f32(v2), vabsq_f32(v3))));
     }
     for (; i + 4 <= n; i += 4) {
-        float32x4_t v = vld1q_f32(data + i);
+        float32x4_t v;
+        if constexpr (std::is_same<T, float>::value)
+            v = vld1q_f32(reinterpret_cast<const float*>(data + i));
+        else if constexpr (std::is_same<T, bf16_t>::value)
+            v = neon_bf16x4_to_f32(data + i);
+        else
+            v = neon_fp16x4_to_f32(data + i);
         vmax = vmaxq_f32(vmax, vabsq_f32(v));
     }
-    // Horizontal max
     float result = vmaxvq_f32(vmax);
-    // Handle remainder
     for (; i < n; ++i) {
-        result = std::max(result, std::fabs(data[i]));
+        float val;
+        if constexpr (std::is_same<T, float>::value)       val = data[i];
+        else if constexpr (std::is_same<T, bf16_t>::value) val = bf16_to_float(data[i].v);
+        else                                                val = fp16_to_float(data[i].v);
+        result = std::max(result, std::fabs(val));
     }
     return result;
 }
@@ -418,6 +441,31 @@ void dequantizeBlockwise8bitCpu(
         long long valid_items = (n - block_idx >= blocksize ? blocksize : n - block_idx);
         long long block_end = block_idx + valid_items;
         float scale = absmax[block_idx / blocksize];
+#if defined(_M_ARM64) || defined(__aarch64__)
+        {
+            float32x4_t vscale = vdupq_n_f32(scale);
+            long long i = block_idx;
+            for (; i + 4 <= block_end; i += 4) {
+                float tmp[4] = { code[A[i]], code[A[i+1]], code[A[i+2]], code[A[i+3]] };
+                float32x4_t v = vmulq_f32(vld1q_f32(tmp), vscale);
+                if constexpr (std::is_same<T, float>::value)
+                    vst1q_f32(reinterpret_cast<float*>(out + i), v);
+                else if constexpr (std::is_same<T, bf16_t>::value)
+                    neon_f32_to_bf16x4(v, out + i);
+                else
+                    neon_f32_to_fp16x4(v, out + i);
+            }
+            for (; i < block_end; ++i) {
+                float v = code[A[i]] * scale;
+                if constexpr (std::is_same<T, bf16_t>::value)
+                    out[i] = float_to_bf16(v);
+                else if constexpr (std::is_same<T, fp16_t>::value)
+                    out[i] = float_to_fp16(v);
+                else
+                    out[i] = static_cast<T>(v);
+            }
+        }
+#else
 #ifdef _MSC_VER
 #pragma loop(ivdep)
 #endif
@@ -431,6 +479,7 @@ void dequantizeBlockwise8bitCpu(
                 out[i] = static_cast<T>(v);
             }
         }
+#endif
     }
 }
 
@@ -540,24 +589,19 @@ void quantize_cpu_impl(float* code, const T* A, float* absmax, unsigned char* ou
         float absmax_block = 0.0f;
 
 #if defined(_M_ARM64) || defined(__aarch64__)
-        if constexpr (std::is_same<T, float>::value) {
-            // Use NEON-optimized absmax for float32
-            absmax_block = neon_absmax_f32(reinterpret_cast<const float*>(A + block_start), block_len);
-        } else
-#endif
-        {
-            for (long long i = block_start; i < block_end; ++i) {
-                float val;
-                if constexpr (std::is_same<T, float>::value) {
-                    val = A[i];
-                } else if constexpr (std::is_same<T, bf16_t>::value) {
-                    val = bf16_to_float(A[i].v);
-                } else if constexpr (std::is_same<T, fp16_t>::value) {
-                    val = fp16_to_float(A[i].v);
-                }
-                absmax_block = std::max(absmax_block, std::fabs(val));
-            }
+        absmax_block = neon_absmax<T>(A + block_start, block_len);
+#else
+        for (long long i = block_start; i < block_end; ++i) {
+            float val;
+            if constexpr (std::is_same<T, float>::value)
+                val = A[i];
+            else if constexpr (std::is_same<T, bf16_t>::value)
+                val = bf16_to_float(A[i].v);
+            else
+                val = fp16_to_float(A[i].v);
+            absmax_block = std::max(absmax_block, std::fabs(val));
         }
+#endif
 
         absmax[b] = absmax_block;
 
@@ -571,41 +615,49 @@ void quantize_cpu_impl(float* code, const T* A, float* absmax, unsigned char* ou
         const float inv_absmax = 1.0f / absmax_block;
 
 #if defined(_M_ARM64) || defined(__aarch64__)
-        if constexpr (std::is_same<T, float>::value) {
-            // NEON-optimized normalize + LUT index for float32
-            const float* src = A + block_start;
+        {
             long long i = 0;
             float32x4_t vinv = vdupq_n_f32(inv_absmax);
             for (; i + 4 <= block_len; i += 4) {
-                float32x4_t v = vmulq_f32(vld1q_f32(src + i), vinv);
+                float32x4_t v;
+                if constexpr (std::is_same<T, float>::value)
+                    v = vld1q_f32(reinterpret_cast<const float*>(A + block_start + i));
+                else if constexpr (std::is_same<T, bf16_t>::value)
+                    v = neon_bf16x4_to_f32(A + block_start + i);
+                else
+                    v = neon_fp16x4_to_f32(A + block_start + i);
+                v = vmulq_f32(v, vinv);
                 uint16x4_t indices = neon_norm_to_lut_index_x4(v);
                 uint16_t idx_arr[4];
                 vst1_u16(idx_arr, indices);
-                out[block_start + i] = lut[idx_arr[0]];
+                out[block_start + i]     = lut[idx_arr[0]];
                 out[block_start + i + 1] = lut[idx_arr[1]];
                 out[block_start + i + 2] = lut[idx_arr[2]];
                 out[block_start + i + 3] = lut[idx_arr[3]];
             }
             for (; i < block_len; ++i) {
-                float normed_value = src[i] * inv_absmax;
-                out[block_start + i] = lut[norm_to_lut_index(normed_value)];
-            }
-        } else
-#endif
-        {
-            for (long long i = block_start; i < block_end; ++i) {
                 float val;
-                if constexpr (std::is_same<T, float>::value) {
-                    val = A[i];
-                } else if constexpr (std::is_same<T, bf16_t>::value) {
-                    val = bf16_to_float(A[i].v);
-                } else if constexpr (std::is_same<T, fp16_t>::value) {
-                    val = fp16_to_float(A[i].v);
-                }
-                float normed_value = val * inv_absmax;
-                out[i] = lut[norm_to_lut_index(normed_value)];
+                if constexpr (std::is_same<T, float>::value)
+                    val = A[block_start + i];
+                else if constexpr (std::is_same<T, bf16_t>::value)
+                    val = bf16_to_float(A[block_start + i].v);
+                else
+                    val = fp16_to_float(A[block_start + i].v);
+                out[block_start + i] = lut[norm_to_lut_index(val * inv_absmax)];
             }
         }
+#else
+        for (long long i = block_start; i < block_end; ++i) {
+            float val;
+            if constexpr (std::is_same<T, float>::value)
+                val = A[i];
+            else if constexpr (std::is_same<T, bf16_t>::value)
+                val = bf16_to_float(A[i].v);
+            else
+                val = fp16_to_float(A[i].v);
+            out[i] = lut[norm_to_lut_index(val * inv_absmax)];
+        }
+#endif
     }
 }
 

From 6223b78c5ff9392948948e1c7eab6d7923d6f7a2 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Wed, 3 Jun 2026 15:37:31 -0400
Subject: [PATCH 10/25] cpu perf: ARM64 NEON improvements for blockwise
 quantization

---
 csrc/cpu_ops.cpp | 188 ++++++++++++++++++++++++-----------------------
 1 file changed, 98 insertions(+), 90 deletions(-)

diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp
index 9ea661615..bda400e10 100644
--- a/csrc/cpu_ops.cpp
+++ b/csrc/cpu_ops.cpp
@@ -102,42 +102,28 @@ static inline void
     // interleaved.val[0] has elements 0-7, interleaved.val[1] has elements 8-15
     uint8x16_t indices = vcombine_u8(interleaved.val[0], interleaved.val[1]);
 
-    // Use flat LUT for fast indexed access
-    // Store LUT as flat float array on stack (likely in L1 cache)
-    float flat_lut[16];
-    vst1q_f32(flat_lut, lut[0]);
-    vst1q_f32(flat_lut + 4, lut[1]);
-    vst1q_f32(flat_lut + 8, lut[2]);
-    vst1q_f32(flat_lut + 12, lut[3]);
-
-    // Extract indices and do lookups in groups of 4 for NEON multiply
-    uint8_t idx_arr[16];
-    vst1q_u8(idx_arr, indices);
-
+    // Reinterpret float LUT as 64-byte table for vqtbl4q_u8 lookup.
+    // Each 4-bit index i maps to bytes [i*4 .. i*4+3] in the table.
+    uint8x16x4_t lut_bytes = {
+        vreinterpretq_u8_f32(lut[0]), vreinterpretq_u8_f32(lut[1]), vreinterpretq_u8_f32(lut[2]),
+        vreinterpretq_u8_f32(lut[3])
+    };
+    // Multiply each index by 4 to get byte offset (max 15*4=60 < 64, safe)
+    uint8x16_t base = vshlq_n_u8(indices, 2);
+    // Expand each base offset to 4 consecutive bytes via zip
+    static const uint8x16_t off = {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3};
+    uint8x8_t lo = vget_low_u8(base), hi = vget_high_u8(base);
+    uint8x8x2_t z0 = vzip_u8(lo, lo);
+    uint8x8x2_t z1 = vzip_u8(hi, hi);
+    uint8x8x2_t zlo = vzip_u8(z0.val[0], z0.val[0]);
+    uint8x8x2_t zhi = vzip_u8(z0.val[1], z0.val[1]);
+    uint8x8x2_t zlo2 = vzip_u8(z1.val[0], z1.val[0]);
+    uint8x8x2_t zhi2 = vzip_u8(z1.val[1], z1.val[1]);
     float32x4_t vscale = vdupq_n_f32(scale);
-
-    // Process 4 values at a time with NEON - load from temp buffer
-    float tmp_vals[16];
-    tmp_vals[0] = flat_lut[idx_arr[0]];
-    tmp_vals[1] = flat_lut[idx_arr[1]];
-    tmp_vals[2] = flat_lut[idx_arr[2]];
-    tmp_vals[3] = flat_lut[idx_arr[3]];
-    tmp_vals[4] = flat_lut[idx_arr[4]];
-    tmp_vals[5] = flat_lut[idx_arr[5]];
-    tmp_vals[6] = flat_lut[idx_arr[6]];
-    tmp_vals[7] = flat_lut[idx_arr[7]];
-    tmp_vals[8] = flat_lut[idx_arr[8]];
-    tmp_vals[9] = flat_lut[idx_arr[9]];
-    tmp_vals[10] = flat_lut[idx_arr[10]];
-    tmp_vals[11] = flat_lut[idx_arr[11]];
-    tmp_vals[12] = flat_lut[idx_arr[12]];
-    tmp_vals[13] = flat_lut[idx_arr[13]];
-    tmp_vals[14] = flat_lut[idx_arr[14]];
-    tmp_vals[15] = flat_lut[idx_arr[15]];
-    float32x4_t v0 = vld1q_f32(tmp_vals);
-    float32x4_t v1 = vld1q_f32(tmp_vals + 4);
-    float32x4_t v2 = vld1q_f32(tmp_vals + 8);
-    float32x4_t v3 = vld1q_f32(tmp_vals + 12);
+    float32x4_t v0 = vreinterpretq_f32_u8(vqtbl4q_u8(lut_bytes, vaddq_u8(vcombine_u8(zlo.val[0], zlo.val[1]), off)));
+    float32x4_t v1 = vreinterpretq_f32_u8(vqtbl4q_u8(lut_bytes, vaddq_u8(vcombine_u8(zhi.val[0], zhi.val[1]), off)));
+    float32x4_t v2 = vreinterpretq_f32_u8(vqtbl4q_u8(lut_bytes, vaddq_u8(vcombine_u8(zlo2.val[0], zlo2.val[1]), off)));
+    float32x4_t v3 = vreinterpretq_f32_u8(vqtbl4q_u8(lut_bytes, vaddq_u8(vcombine_u8(zhi2.val[0], zhi2.val[1]), off)));
 
     vst1q_f32(out, vmulq_f32(v0, vscale));
     vst1q_f32(out + 4, vmulq_f32(v1, vscale));
@@ -180,25 +166,31 @@ static inline float32x4_t neon_fp16x4_to_f32(const fp16_t* src) {
 }
 
 // NEON-optimized absmax computation for a block of float32, bf16, or fp16.
-template <typename T>
-static inline float neon_absmax(const T* data, long long n) {
+template <typename T> static inline float neon_absmax(const T* data, long long n) {
     float32x4_t vmax = vdupq_n_f32(0.0f);
     long long i = 0;
     for (; i + 16 <= n; i += 16) {
         float32x4_t v0, v1, v2, v3;
         if constexpr (std::is_same<T, float>::value) {
             const float* p = reinterpret_cast<const float*>(data + i);
-            v0 = vld1q_f32(p);      v1 = vld1q_f32(p + 4);
-            v2 = vld1q_f32(p + 8);  v3 = vld1q_f32(p + 12);
+            v0 = vld1q_f32(p);
+            v1 = vld1q_f32(p + 4);
+            v2 = vld1q_f32(p + 8);
+            v3 = vld1q_f32(p + 12);
         } else if constexpr (std::is_same<T, bf16_t>::value) {
-            v0 = neon_bf16x4_to_f32(data + i);     v1 = neon_bf16x4_to_f32(data + i + 4);
-            v2 = neon_bf16x4_to_f32(data + i + 8); v3 = neon_bf16x4_to_f32(data + i + 12);
+            v0 = neon_bf16x4_to_f32(data + i);
+            v1 = neon_bf16x4_to_f32(data + i + 4);
+            v2 = neon_bf16x4_to_f32(data + i + 8);
+            v3 = neon_bf16x4_to_f32(data + i + 12);
         } else {
-            v0 = neon_fp16x4_to_f32(data + i);     v1 = neon_fp16x4_to_f32(data + i + 4);
-            v2 = neon_fp16x4_to_f32(data + i + 8); v3 = neon_fp16x4_to_f32(data + i + 12);
+            v0 = neon_fp16x4_to_f32(data + i);
+            v1 = neon_fp16x4_to_f32(data + i + 4);
+            v2 = neon_fp16x4_to_f32(data + i + 8);
+            v3 = neon_fp16x4_to_f32(data + i + 12);
         }
-        vmax = vmaxq_f32(vmax, vmaxq_f32(vmaxq_f32(vabsq_f32(v0), vabsq_f32(v1)),
-                                         vmaxq_f32(vabsq_f32(v2), vabsq_f32(v3))));
+        vmax = vmaxq_f32(
+            vmax, vmaxq_f32(vmaxq_f32(vabsq_f32(v0), vabsq_f32(v1)), vmaxq_f32(vabsq_f32(v2), vabsq_f32(v3)))
+        );
     }
     for (; i + 4 <= n; i += 4) {
         float32x4_t v;
@@ -213,9 +205,12 @@ static inline float neon_absmax(const T* data, long long n) {
     float result = vmaxvq_f32(vmax);
     for (; i < n; ++i) {
         float val;
-        if constexpr (std::is_same<T, float>::value)       val = data[i];
-        else if constexpr (std::is_same<T, bf16_t>::value) val = bf16_to_float(data[i].v);
-        else                                                val = fp16_to_float(data[i].v);
+        if constexpr (std::is_same<T, float>::value)
+            val = data[i];
+        else if constexpr (std::is_same<T, bf16_t>::value)
+            val = bf16_to_float(data[i].v);
+        else
+            val = fp16_to_float(data[i].v);
         result = std::max(result, std::fabs(val));
     }
     return result;
@@ -296,50 +291,63 @@ void dequantizeBlockwise4bitCpu(
     {
         long long dim_0 = m;
         long long dim_1 = n;
-        long long input_dim_1 = dim_1 >> 1;
-        long long absmax_dim_1 = dim_1 / blocksize;
-        // NEON path: process 16 output values at a time (8 packed bytes)
-        // Only use when blocksize evenly divides dim_1 to ensure correct scale indexing
-        constexpr long long VEC_LEN = 16;
-        if (dim_1 % VEC_LEN == 0 && blocksize >= VEC_LEN && (dim_1 % blocksize == 0)) {
-            float32x4_t lut[4];
-            if constexpr (DATA_TYPE == 1) {
-                neon_fp4_lut(lut);
-            } else {
-                neon_nf4_lut(lut);
+        long long input_dim_1 = (dim_1 + 1) >> 1; // ceil(dim_1/2): handles odd dim_1
+        long long absmax_dim_1 = (dim_1 + blocksize - 1) / blocksize;
+        float32x4_t lut[4];
+        if constexpr (DATA_TYPE == 1) {
+            neon_fp4_lut(lut);
+        } else {
+            neon_nf4_lut(lut);
+        }
+        constexpr long long k_step = 8; // 8 packed bytes = 16 output values
+        BNB_OMP_PARALLEL_FOR
+        for (long long block_idx = 0; block_idx < dim_0; ++block_idx) {
+            long long k = 0;
+            for (; k + k_step <= input_dim_1; k += k_step) {
+                long long scale_idx = k * 2 / blocksize;
+                float scale = absmax[block_idx * absmax_dim_1 + scale_idx];
+                const uint8_t* p = &A[block_idx * input_dim_1 + k];
+                float tmp_f32[16];
+                neon_dequant_4bit_16values(p, scale, lut, tmp_f32);
+                T* pout = &out[block_idx * dim_1 + k * 2];
+                if constexpr (std::is_same<T, float>()) {
+                    std::memcpy(pout, tmp_f32, 16 * sizeof(float));
+                } else if constexpr (std::is_same<T, bf16_t>()) {
+                    neon_f32_to_bf16x4(vld1q_f32(tmp_f32), pout);
+                    neon_f32_to_bf16x4(vld1q_f32(tmp_f32 + 4), pout + 4);
+                    neon_f32_to_bf16x4(vld1q_f32(tmp_f32 + 8), pout + 8);
+                    neon_f32_to_bf16x4(vld1q_f32(tmp_f32 + 12), pout + 12);
+                } else {
+                    neon_f32_to_fp16x4(vld1q_f32(tmp_f32), pout);
+                    neon_f32_to_fp16x4(vld1q_f32(tmp_f32 + 4), pout + 4);
+                    neon_f32_to_fp16x4(vld1q_f32(tmp_f32 + 8), pout + 8);
+                    neon_f32_to_fp16x4(vld1q_f32(tmp_f32 + 12), pout + 12);
+                }
             }
-            constexpr long long k_step = VEC_LEN / 2; // 8 bytes per iteration
-            BNB_OMP_PARALLEL_FOR
-            for (long long block_idx = 0; block_idx < dim_0; ++block_idx) {
-                for (long long k = 0; k < input_dim_1; k += k_step) {
-                    long long scale_idx = k * 2 / blocksize;
-                    float scale = absmax[block_idx * absmax_dim_1 + scale_idx];
-                    const uint8_t* p = &A[block_idx * input_dim_1 + k];
-
-                    // Dequantize 16 values into a temp float buffer
-                    float tmp_f32[16];
-                    neon_dequant_4bit_16values(p, scale, lut, tmp_f32);
-
-                    // Store results (convert to output type using NEON)
-                    T* pout = &out[block_idx * dim_1 + k * 2];
-                    if constexpr (std::is_same<T, float>()) {
-                        // Direct copy - already float
-                        std::memcpy(pout, tmp_f32, 16 * sizeof(float));
-                    } else if constexpr (std::is_same<T, bf16_t>()) {
-                        neon_f32_to_bf16x4(vld1q_f32(tmp_f32), pout);
-                        neon_f32_to_bf16x4(vld1q_f32(tmp_f32 + 4), pout + 4);
-                        neon_f32_to_bf16x4(vld1q_f32(tmp_f32 + 8), pout + 8);
-                        neon_f32_to_bf16x4(vld1q_f32(tmp_f32 + 12), pout + 12);
-                    } else if constexpr (std::is_same<T, fp16_t>()) {
-                        neon_f32_to_fp16x4(vld1q_f32(tmp_f32), pout);
-                        neon_f32_to_fp16x4(vld1q_f32(tmp_f32 + 4), pout + 4);
-                        neon_f32_to_fp16x4(vld1q_f32(tmp_f32 + 8), pout + 8);
-                        neon_f32_to_fp16x4(vld1q_f32(tmp_f32 + 12), pout + 12);
-                    }
+            // Scalar remainder for dim_1 not divisible by 16, and last nibble when dim_1 is odd
+            for (; k < input_dim_1; ++k) {
+                long long out_base = block_idx * dim_1 + k * 2;
+                long long scale_idx = k * 2 / blocksize;
+                float scale = absmax[block_idx * absmax_dim_1 + scale_idx];
+                unsigned char byte = A[block_idx * input_dim_1 + k];
+                float v0 = (DATA_TYPE == 1 ? dDequantizeFP4(byte >> 4) : dDequantizeNF4(byte >> 4)) * scale;
+                float v1 = (DATA_TYPE == 1 ? dDequantizeFP4(byte & 0x0F) : dDequantizeNF4(byte & 0x0F)) * scale;
+                if constexpr (std::is_same<T, float>()) {
+                    out[out_base] = v0;
+                    if (k * 2 + 1 < dim_1)
+                        out[out_base + 1] = v1;
+                } else if constexpr (std::is_same<T, bf16_t>()) {
+                    out[out_base] = float_to_bf16(v0);
+                    if (k * 2 + 1 < dim_1)
+                        out[out_base + 1] = float_to_bf16(v1);
+                } else {
+                    out[out_base] = float_to_fp16(v0);
+                    if (k * 2 + 1 < dim_1)
+                        out[out_base + 1] = float_to_fp16(v1);
                 }
             }
-            return;
         }
+        return;
     }
 #endif // _M_ARM64 || __aarch64__
 
@@ -446,7 +454,7 @@ void dequantizeBlockwise8bitCpu(
             float32x4_t vscale = vdupq_n_f32(scale);
             long long i = block_idx;
             for (; i + 4 <= block_end; i += 4) {
-                float tmp[4] = { code[A[i]], code[A[i+1]], code[A[i+2]], code[A[i+3]] };
+                float tmp[4] = {code[A[i]], code[A[i + 1]], code[A[i + 2]], code[A[i + 3]]};
                 float32x4_t v = vmulq_f32(vld1q_f32(tmp), vscale);
                 if constexpr (std::is_same<T, float>::value)
                     vst1q_f32(reinterpret_cast<float*>(out + i), v);
@@ -630,7 +638,7 @@ void quantize_cpu_impl(float* code, const T* A, float* absmax, unsigned char* ou
                 uint16x4_t indices = neon_norm_to_lut_index_x4(v);
                 uint16_t idx_arr[4];
                 vst1_u16(idx_arr, indices);
-                out[block_start + i]     = lut[idx_arr[0]];
+                out[block_start + i] = lut[idx_arr[0]];
                 out[block_start + i + 1] = lut[idx_arr[1]];
                 out[block_start + i + 2] = lut[idx_arr[2]];
                 out[block_start + i + 3] = lut[idx_arr[3]];

From 2adea9975a6e15ea41c5e8e2dc420ba3fe807ec8 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Wed, 3 Jun 2026 15:51:09 -0400
Subject: [PATCH 11/25] fix msvc arm64 build

---
 csrc/cpu_ops.cpp | 2 +-
 csrc/cpu_ops.h   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp
index bda400e10..94bf6c57b 100644
--- a/csrc/cpu_ops.cpp
+++ b/csrc/cpu_ops.cpp
@@ -111,7 +111,7 @@ static inline void
     // Multiply each index by 4 to get byte offset (max 15*4=60 < 64, safe)
     uint8x16_t base = vshlq_n_u8(indices, 2);
     // Expand each base offset to 4 consecutive bytes via zip
-    static const uint8x16_t off = {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3};
+    const uint8x16_t off = vreinterpretq_u8_u32(vdupq_n_u32(0x03020100));
     uint8x8_t lo = vget_low_u8(base), hi = vget_high_u8(base);
     uint8x8x2_t z0 = vzip_u8(lo, lo);
     uint8x8x2_t z1 = vzip_u8(hi, hi);
diff --git a/csrc/cpu_ops.h b/csrc/cpu_ops.h
index 14df69921..2f7a8f873 100644
--- a/csrc/cpu_ops.h
+++ b/csrc/cpu_ops.h
@@ -32,7 +32,7 @@ template <typename T> inline int get_cache_blocks(int chunk_size) {
 }
 
 // forced unroll for perf critical path
-#if __has_attribute(always_inline)
+#if defined(__has_attribute) && __has_attribute(always_inline)
 #define ALWAYS_INLINE __attribute__((__always_inline__)) inline
 #else
 #define ALWAYS_INLINE inline

From 144edc780a2d081c852ff0b1fa2b3aec97b54891 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Wed, 3 Jun 2026 16:50:47 -0400
Subject: [PATCH 12/25] fix

---
 csrc/cpu_ops.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp
index 94bf6c57b..8053b0832 100644
--- a/csrc/cpu_ops.cpp
+++ b/csrc/cpu_ops.cpp
@@ -288,7 +288,9 @@ void dequantizeBlockwise4bitCpu(
         return;
 
 #if defined(_M_ARM64) || defined(__aarch64__)
-    {
+    // n % blocksize == 0: absmax is organized by flat element blocks; row and block
+    // boundaries must align or the 2D absmax indexing gives wrong scale values.
+    if (n % blocksize == 0) {
         long long dim_0 = m;
         long long dim_1 = n;
         long long input_dim_1 = (dim_1 + 1) >> 1; // ceil(dim_1/2): handles odd dim_1

From 4021345363ff8190dfcb10c7dd85348d8ed9cd19 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Wed, 3 Jun 2026 18:17:26 -0400
Subject: [PATCH 13/25] remove dead code

---
 csrc/cpu_ops.cpp | 29 +++--------------------------
 1 file changed, 3 insertions(+), 26 deletions(-)

diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp
index 8053b0832..8ffb93d99 100644
--- a/csrc/cpu_ops.cpp
+++ b/csrc/cpu_ops.cpp
@@ -293,8 +293,8 @@ void dequantizeBlockwise4bitCpu(
     if (n % blocksize == 0) {
         long long dim_0 = m;
         long long dim_1 = n;
-        long long input_dim_1 = (dim_1 + 1) >> 1; // ceil(dim_1/2): handles odd dim_1
-        long long absmax_dim_1 = (dim_1 + blocksize - 1) / blocksize;
+        long long input_dim_1 = dim_1 >> 1;
+        long long absmax_dim_1 = dim_1 / blocksize;
         float32x4_t lut[4];
         if constexpr (DATA_TYPE == 1) {
             neon_fp4_lut(lut);
@@ -304,8 +304,7 @@ void dequantizeBlockwise4bitCpu(
         constexpr long long k_step = 8; // 8 packed bytes = 16 output values
         BNB_OMP_PARALLEL_FOR
         for (long long block_idx = 0; block_idx < dim_0; ++block_idx) {
-            long long k = 0;
-            for (; k + k_step <= input_dim_1; k += k_step) {
+            for (long long k = 0; k < input_dim_1; k += k_step) {
                 long long scale_idx = k * 2 / blocksize;
                 float scale = absmax[block_idx * absmax_dim_1 + scale_idx];
                 const uint8_t* p = &A[block_idx * input_dim_1 + k];
@@ -326,28 +325,6 @@ void dequantizeBlockwise4bitCpu(
                     neon_f32_to_fp16x4(vld1q_f32(tmp_f32 + 12), pout + 12);
                 }
             }
-            // Scalar remainder for dim_1 not divisible by 16, and last nibble when dim_1 is odd
-            for (; k < input_dim_1; ++k) {
-                long long out_base = block_idx * dim_1 + k * 2;
-                long long scale_idx = k * 2 / blocksize;
-                float scale = absmax[block_idx * absmax_dim_1 + scale_idx];
-                unsigned char byte = A[block_idx * input_dim_1 + k];
-                float v0 = (DATA_TYPE == 1 ? dDequantizeFP4(byte >> 4) : dDequantizeNF4(byte >> 4)) * scale;
-                float v1 = (DATA_TYPE == 1 ? dDequantizeFP4(byte & 0x0F) : dDequantizeNF4(byte & 0x0F)) * scale;
-                if constexpr (std::is_same<T, float>()) {
-                    out[out_base] = v0;
-                    if (k * 2 + 1 < dim_1)
-                        out[out_base + 1] = v1;
-                } else if constexpr (std::is_same<T, bf16_t>()) {
-                    out[out_base] = float_to_bf16(v0);
-                    if (k * 2 + 1 < dim_1)
-                        out[out_base + 1] = float_to_bf16(v1);
-                } else {
-                    out[out_base] = float_to_fp16(v0);
-                    if (k * 2 + 1 < dim_1)
-                        out[out_base + 1] = float_to_fp16(v1);
-                }
-            }
         }
         return;
     }

From ed1db523062a1c479a1a4eeedd94119236ad2224 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Wed, 3 Jun 2026 18:25:12 -0400
Subject: [PATCH 14/25] x86-64 cpu perf improvement

---
 CMakeLists.txt   |  1 +
 csrc/cpu_ops.cpp | 58 +++++++++++++++++++++++++++++++++++++++++++-----
 csrc/cpu_ops.h   | 15 +++++++++++++
 3 files changed, 68 insertions(+), 6 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 19e8ae0b1..9714f9946 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -380,6 +380,7 @@ if (BUILD_CPU)
             -mprefer-vector-width=256
             -mfma
             -mavx2
+            -mf16c
             -mlzcnt
             -mbmi
             -mbmi2
diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp
index 8ffb93d99..6ad62f012 100644
--- a/csrc/cpu_ops.cpp
+++ b/csrc/cpu_ops.cpp
@@ -231,7 +231,6 @@ static inline uint16x4_t neon_norm_to_lut_index_x4(float32x4_t vals) {
 #endif // _M_ARM64 || __aarch64__
 
 #if defined(__AVX512F__)
-#include <immintrin.h>
 
 inline __m256i cvt_fp32_to_fp16(const __m512 src) {
     return _mm512_cvtps_ph(src, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
@@ -277,6 +276,29 @@ static inline __m512 set_fp4_lut() {
 }
 #endif
 
+static constexpr float fp4_lut[16] = {
+    0.0f,  0.005208333333f,  0.66666667f,  1.0f,  0.33333333f,  0.5f,  0.16666667f,  0.25f,
+    -0.0f, -0.005208333333f, -0.66666667f, -1.0f, -0.33333333f, -0.5f, -0.16666667f, -0.25f,
+};
+static constexpr float nf4_lut[16] = {
+    -1.0f,
+    -0.6961928009986877f,
+    -0.5250730514526367f,
+    -0.39491748809814453f,
+    -0.28444138169288635f,
+    -0.18477343022823334f,
+    -0.09105003625154495f,
+    0.0f,
+    0.07958029955625534f,
+    0.16093020141124725f,
+    0.24611230194568634f,
+    0.33791524171829224f,
+    0.44070982933044434f,
+    0.5626170039176941f,
+    0.7229568362236023f,
+    1.0f,
+};
+
 // 4-bit (FP4 / NF4) dequantization helper extracted from the original else branch.
 // DATA_TYPE: 1 = FP4, 2 = NF4
 template <typename T, int DATA_TYPE>
@@ -293,8 +315,8 @@ void dequantizeBlockwise4bitCpu(
     if (n % blocksize == 0) {
         long long dim_0 = m;
         long long dim_1 = n;
-        long long input_dim_1 = dim_1 >> 1;
-        long long absmax_dim_1 = dim_1 / blocksize;
+        long long input_dim_1 = (dim_1 + 1) >> 1; // ceil(dim_1/2): handles odd dim_1
+        long long absmax_dim_1 = (dim_1 + blocksize - 1) / blocksize;
         float32x4_t lut[4];
         if constexpr (DATA_TYPE == 1) {
             neon_fp4_lut(lut);
@@ -304,7 +326,8 @@ void dequantizeBlockwise4bitCpu(
         constexpr long long k_step = 8; // 8 packed bytes = 16 output values
         BNB_OMP_PARALLEL_FOR
         for (long long block_idx = 0; block_idx < dim_0; ++block_idx) {
-            for (long long k = 0; k < input_dim_1; k += k_step) {
+            long long k = 0;
+            for (; k + k_step <= input_dim_1; k += k_step) {
                 long long scale_idx = k * 2 / blocksize;
                 float scale = absmax[block_idx * absmax_dim_1 + scale_idx];
                 const uint8_t* p = &A[block_idx * input_dim_1 + k];
@@ -325,6 +348,28 @@ void dequantizeBlockwise4bitCpu(
                     neon_f32_to_fp16x4(vld1q_f32(tmp_f32 + 12), pout + 12);
                 }
             }
+            // Scalar remainder for dim_1 not divisible by 16, and last nibble when dim_1 is odd
+            for (; k < input_dim_1; ++k) {
+                long long out_base = block_idx * dim_1 + k * 2;
+                long long scale_idx = k * 2 / blocksize;
+                float scale = absmax[block_idx * absmax_dim_1 + scale_idx];
+                unsigned char byte = A[block_idx * input_dim_1 + k];
+                float v0 = lut[byte >> 4] * scale;
+                float v1 = lut[byte & 0x0F] * scale;
+                if constexpr (std::is_same<T, float>()) {
+                    out[out_base] = v0;
+                    if (k * 2 + 1 < dim_1)
+                        out[out_base + 1] = v1;
+                } else if constexpr (std::is_same<T, bf16_t>()) {
+                    out[out_base] = float_to_bf16(v0);
+                    if (k * 2 + 1 < dim_1)
+                        out[out_base + 1] = float_to_bf16(v1);
+                } else {
+                    out[out_base] = float_to_fp16(v0);
+                    if (k * 2 + 1 < dim_1)
+                        out[out_base + 1] = float_to_fp16(v1);
+                }
+            }
         }
         return;
     }
@@ -381,6 +426,7 @@ void dequantizeBlockwise4bitCpu(
     }
 #endif
     // Scalar fallback branch
+    const float* lut = DATA_TYPE == 1 ? fp4_lut : nf4_lut;
     long long total = m * n;
     BNB_OMP_PARALLEL_FOR
     for (long long block_idx = 0; block_idx < total; block_idx += blocksize) {
@@ -391,9 +437,9 @@ void dequantizeBlockwise4bitCpu(
             unsigned char byte = A[byte_index];
 
             // High nibble first (matches previous code logic)
-            float v0 = (DATA_TYPE == 1 ? dDequantizeFP4(byte >> 4) : dDequantizeNF4(byte >> 4)) * scale;
+            float v0 = lut[byte >> 4] * scale;
             // Low nibble second
-            float v1 = (DATA_TYPE == 1 ? dDequantizeFP4(byte & 0x0F) : dDequantizeNF4(byte & 0x0F)) * scale;
+            float v1 = lut[byte & 0x0F] * scale;
 
             if constexpr (std::is_same<T, bf16_t>::value) {
                 out[block_idx + i] = float_to_bf16(v0);
diff --git a/csrc/cpu_ops.h b/csrc/cpu_ops.h
index 2f7a8f873..3a1ec40a8 100644
--- a/csrc/cpu_ops.h
+++ b/csrc/cpu_ops.h
@@ -13,6 +13,10 @@
 #include <omp.h>
 #endif
 
+#if defined(__x86_64__) || defined(_M_X64)
+#include <immintrin.h>
+#endif
+
 // amx-bf16
 #define TILE_M 16
 #define TILE_N 16
@@ -147,6 +151,12 @@ static float bf16_to_float(uint16_t bf16) {
 }
 
 static inline fp16_t float_to_fp16(float x) {
+#if defined(__AVX2__)
+    // F16C is guaranteed on all AVX2 CPUs; matches CUDA round-to-nearest-even behavior
+    return fp16_t{
+        (uint16_t)_mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC), 0)
+    };
+#else
     uint32_t bits;
     std::memcpy(&bits, &x, 4);
     uint32_t sign = (bits >> 31) & 0x1;
@@ -186,9 +196,13 @@ static inline fp16_t float_to_fp16(float x) {
         h = (sign << 15) | ((uint16_t)exp_h << 10) | ((uint16_t)(mant_rounded >> 13));
     }
     return fp16_t{h};
+#endif
 }
 
 static inline float fp16_to_float(uint16_t h) {
+#if defined(__AVX2__)
+    return _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(h)));
+#else
     uint32_t sign = (h >> 15) & 0x1;
     uint32_t exp = (h >> 10) & 0x1F;
     uint32_t mant = h & 0x3FF;
@@ -216,6 +230,7 @@ static inline float fp16_to_float(uint16_t h) {
     float f;
     std::memcpy(&f, &bits, sizeof(f));
     return f;
+#endif
 }
 
 inline float dDequantizeFP4(unsigned char val) {

From e54ccf8c638695dc5c546ed7d9084dae39784483 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Wed, 3 Jun 2026 18:31:17 -0400
Subject: [PATCH 15/25] fix

---
 csrc/cpu_ops.cpp | 37 +++++++------------------------------
 1 file changed, 7 insertions(+), 30 deletions(-)

diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp
index 6ad62f012..cbbb79a30 100644
--- a/csrc/cpu_ops.cpp
+++ b/csrc/cpu_ops.cpp
@@ -315,24 +315,23 @@ void dequantizeBlockwise4bitCpu(
     if (n % blocksize == 0) {
         long long dim_0 = m;
         long long dim_1 = n;
-        long long input_dim_1 = (dim_1 + 1) >> 1; // ceil(dim_1/2): handles odd dim_1
-        long long absmax_dim_1 = (dim_1 + blocksize - 1) / blocksize;
-        float32x4_t lut[4];
+        long long input_dim_1 = dim_1 >> 1;
+        long long absmax_dim_1 = dim_1 / blocksize;
+        float32x4_t neon_lut[4];
         if constexpr (DATA_TYPE == 1) {
-            neon_fp4_lut(lut);
+            neon_fp4_lut(neon_lut);
         } else {
-            neon_nf4_lut(lut);
+            neon_nf4_lut(neon_lut);
         }
         constexpr long long k_step = 8; // 8 packed bytes = 16 output values
         BNB_OMP_PARALLEL_FOR
         for (long long block_idx = 0; block_idx < dim_0; ++block_idx) {
-            long long k = 0;
-            for (; k + k_step <= input_dim_1; k += k_step) {
+            for (long long k = 0; k < input_dim_1; k += k_step) {
                 long long scale_idx = k * 2 / blocksize;
                 float scale = absmax[block_idx * absmax_dim_1 + scale_idx];
                 const uint8_t* p = &A[block_idx * input_dim_1 + k];
                 float tmp_f32[16];
-                neon_dequant_4bit_16values(p, scale, lut, tmp_f32);
+                neon_dequant_4bit_16values(p, scale, neon_lut, tmp_f32);
                 T* pout = &out[block_idx * dim_1 + k * 2];
                 if constexpr (std::is_same<T, float>()) {
                     std::memcpy(pout, tmp_f32, 16 * sizeof(float));
@@ -348,28 +347,6 @@ void dequantizeBlockwise4bitCpu(
                     neon_f32_to_fp16x4(vld1q_f32(tmp_f32 + 12), pout + 12);
                 }
             }
-            // Scalar remainder for dim_1 not divisible by 16, and last nibble when dim_1 is odd
-            for (; k < input_dim_1; ++k) {
-                long long out_base = block_idx * dim_1 + k * 2;
-                long long scale_idx = k * 2 / blocksize;
-                float scale = absmax[block_idx * absmax_dim_1 + scale_idx];
-                unsigned char byte = A[block_idx * input_dim_1 + k];
-                float v0 = lut[byte >> 4] * scale;
-                float v1 = lut[byte & 0x0F] * scale;
-                if constexpr (std::is_same<T, float>()) {
-                    out[out_base] = v0;
-                    if (k * 2 + 1 < dim_1)
-                        out[out_base + 1] = v1;
-                } else if constexpr (std::is_same<T, bf16_t>()) {
-                    out[out_base] = float_to_bf16(v0);
-                    if (k * 2 + 1 < dim_1)
-                        out[out_base + 1] = float_to_bf16(v1);
-                } else {
-                    out[out_base] = float_to_fp16(v0);
-                    if (k * 2 + 1 < dim_1)
-                        out[out_base + 1] = float_to_fp16(v1);
-                }
-            }
         }
         return;
     }

From 8cc47f0383317130563d39756209ed34ab03e2ed Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Wed, 3 Jun 2026 21:04:41 -0400
Subject: [PATCH 16/25] cpu: update tests

---
 tests/test_functional.py |  4 ----
 tests/test_linear4bit.py |  3 ---
 tests/test_ops.py        | 10 ++--------
 3 files changed, 2 insertions(+), 15 deletions(-)

diff --git a/tests/test_functional.py b/tests/test_functional.py
index 95d8727f7..895c0b436 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -275,10 +275,6 @@ def test_few_bit_quant(self, device, bits, method):
 
     @pytest.mark.parametrize("device", get_available_devices())
     def test_fp8_quant(self, device):
-        # TODO
-        if device == "cpu":
-            pytest.skip("CPU implementation segfaults")
-
         for e_bits in range(1, 7):
             p_bits = 7 - e_bits
             code = F.create_fp8_map(True, e_bits, p_bits).to(device)
diff --git a/tests/test_linear4bit.py b/tests/test_linear4bit.py
index 12ed0eb27..79ede45b2 100644
--- a/tests/test_linear4bit.py
+++ b/tests/test_linear4bit.py
@@ -221,9 +221,6 @@ def test_params4bit_torch_chunk_split(device, quant_type):
     if device == "hpu" and not is_supported_on_hpu(quant_type, torch.float16, torch.uint8):
         pytest.skip("This configuration is not supported on HPU.")
 
-    if device == "cpu":
-        pytest.skip("CPU quantization causes segfault, skipping CPU test")
-
     original_tensor = torch.randn(8, 4, dtype=torch.float16, device="cpu")
 
     params4bit = bnb.nn.Params4bit(data=original_tensor, quant_type=quant_type, requires_grad=False)
diff --git a/tests/test_ops.py b/tests/test_ops.py
index bd5217748..6cae6706c 100644
--- a/tests/test_ops.py
+++ b/tests/test_ops.py
@@ -98,12 +98,8 @@ class TestInt8BlockwiseQuantOps:
     @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype"))
     @pytest.mark.parametrize("blocksize", [64, 128, 256, 512])
     def test_quantize_blockwise(self, device, dtype, blocksize):
-        if device == "cpu":
-            if dtype != torch.float32:
-                pytest.skip("CPU implementation is only available for float32")
-
-            if blocksize != 256:
-                pytest.skip("CPU implementation is slow; only test blocksize=256")
+        if device == "cpu" and blocksize != 256:
+            pytest.skip("CPU implementation is slow; only test blocksize=256")
 
         code = bitsandbytes.functional.create_dynamic_map().to(device)
         A = torch.randn(1024, 1024, dtype=dtype, device=device)
@@ -122,8 +118,6 @@ def test_quantize_blockwise(self, device, dtype, blocksize):
     @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype"))
     @pytest.mark.parametrize("blocksize", [64, 128, 256, 512])
     def test_dequantize_blockwise(self, device, dtype, blocksize):
-        if device == "cpu" and dtype != torch.float32:
-            pytest.skip("CPU implementation is only available for float32")
 
         A = torch.randint(0, 255, (1024, 1024), dtype=torch.uint8, device=device)
         code = bitsandbytes.functional.create_dynamic_map().to(device, dtype=torch.float32)

From 365c6d829fedcdcb920361df6ff007c388214ad6 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Wed, 3 Jun 2026 22:22:44 -0400
Subject: [PATCH 17/25] x64 avx512 improvements, test improvements

---
 csrc/cpu_ops.cpp         | 22 ++++++++--------------
 tests/test_autograd.py   | 10 +++-------
 tests/test_functional.py | 33 +++++++++++++++++++++++++++++++--
 3 files changed, 42 insertions(+), 23 deletions(-)

diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp
index cbbb79a30..e5938a677 100644
--- a/csrc/cpu_ops.cpp
+++ b/csrc/cpu_ops.cpp
@@ -366,22 +366,16 @@ void dequantizeBlockwise4bitCpu(
             BNB_OMP_PARALLEL_FOR
             for (int block_idx = 0; block_idx < dim_0; ++block_idx) {
                 for (int k = 0; k < input_dim_1; k += k_step) {
-                    // Load 64 bits of nf4 data and a single scale data
-                    uint8_t* p = &A[block_idx * input_dim_1 + k];
-                    uint64_t packed;
-                    std::memcpy(&packed, p, sizeof(uint64_t));
+                    const uint8_t* p = &A[block_idx * input_dim_1 + k];
                     auto scale_idx = k * 2 / blocksize;
                     auto vscales = _mm512_set1_ps((float)absmax[block_idx * absmax_dim_1 + scale_idx]);
-                    // unpack nf4 data to 32-bit integers
-                    uint64_t high = 0;
-                    uint64_t low = 0;
-                    for (int i = 0; i < 4; ++i) {
-                        low |= ((packed >> (2 * i * 4)) & 0xf) << ((2 * i + 1) * 8);
-                        low |= ((packed >> ((2 * i + 1) * 4)) & 0xf) << (2 * i * 8);
-                        high |= ((packed >> (2 * i * 4 + 32)) & 0xf) << ((2 * i + 1) * 8);
-                        high |= ((packed >> ((2 * i + 1) * 4 + 32)) & 0xf) << (2 * i * 8);
-                    }
-                    __m128i packed_128 = _mm_set_epi64x(high, low);
+                    // Unpack 8 packed bytes into 16 nibble indices using SSE.
+                    // Each byte holds two 4-bit values; high nibble is the first output element.
+                    __m128i raw = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p));
+                    __m128i mask4 = _mm_set1_epi8(0x0f);
+                    __m128i hi = _mm_and_si128(_mm_srli_epi16(raw, 4), mask4);
+                    __m128i lo = _mm_and_si128(raw, mask4);
+                    __m128i packed_128 = _mm_unpacklo_epi8(hi, lo);
                     __m512i vint32 = _mm512_cvtepu8_epi32(packed_128);
                     // Table look-up
                     __m512 vout = _mm512_permutexvar_ps(vint32, lut);
diff --git a/tests/test_autograd.py b/tests/test_autograd.py
index d150f4735..7d273c853 100644
--- a/tests/test_autograd.py
+++ b/tests/test_autograd.py
@@ -134,7 +134,7 @@ def test_matmullt(
 @pytest.mark.parametrize("dim2", [64, 0], ids=id_formatter("dim2"))
 @pytest.mark.parametrize("dim3", [64], ids=id_formatter("dim3"))
 @pytest.mark.parametrize("dim4", [96], ids=id_formatter("dim4"))
-@pytest.mark.parametrize("req_grad", BOOLEAN_TRIPLES, ids=id_formatter("req_grad"))
+@pytest.mark.parametrize("req_grad", REQ_GRAD_NO_B_WEIGHT, ids=id_formatter("req_grad"))
 @pytest.mark.parametrize("transpose_B", TRUE_FALSE, ids=id_formatter("transpose_B"))
 @pytest.mark.parametrize("has_bias", TRUE_FALSE, ids=id_formatter("has_bias"))
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype)
@@ -169,8 +169,8 @@ def test_matmul_4bit(
 
     for i in range(3):
         A = torch.randn(size=dimA, device=device, requires_grad=req_grad[0], dtype=dtype)
-        B = torch.randn(size=dimB, device=device, requires_grad=req_grad[1], dtype=dtype)
-        target = torch.randn(size=(dim2, dim4), device=device, requires_grad=req_grad[1], dtype=dtype)
+        B = torch.randn(size=dimB, device=device, dtype=dtype)
+        target = torch.randn(size=(dim2, dim4), device=device, dtype=dtype)
         bias = None
         bias2 = None
         if has_bias:
@@ -212,9 +212,7 @@ def test_matmul_4bit(
             loss_bnb = torch.nn.functional.mse_loss(out_bnb, target).mean()
             loss_bnb.backward()
             gradA1 = A.grad
-            gradB1 = B.grad
             A.grad = None
-            B.grad = None
             if has_bias:
                 gradBias1 = bias.grad
                 bias.grad = None
@@ -222,9 +220,7 @@ def test_matmul_4bit(
             loss_torch = torch.nn.functional.mse_loss(out_torch, target).mean()
             loss_torch.backward()
             gradA2 = A.grad
-            gradB2 = B.grad
             A.grad = None
-            B.grad = None
             if has_bias:
                 gradBias2 = bias.grad
                 bias.grad = None
diff --git a/tests/test_functional.py b/tests/test_functional.py
index 95d8727f7..099024a58 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -896,6 +896,30 @@ def test_gemv_4bit(self, device, dim, dtype, storage_type, double_quant, kind):
 
         dim_key = "le512" if dim <= 512 else "gt512"
         thresholds = gemv_thresholds[dtype][dim_key]
+
+        # On CPU with AVX512BF16, fp16/fp32 inputs are downcast to bf16 for the fused
+        # kernel for performance. Thresholds calibrated from 100 iterations on CPU.
+        cpu_bf16_cast = device == "cpu" and F.has_avx512bf16() and dtype in (torch.float16, torch.float32)
+        if cpu_bf16_cast:
+            thresholds = {
+                "le512": {
+                    "err1": (2.72e-4, 9.96e-5),
+                    "relerr1": (
+                        1.88e-3 if dtype == torch.float16 else 1.64e-3,
+                        1.27e-2 if dtype == torch.float16 else 3.61e-3,
+                    ),
+                    "maxerr1": (1.22e-3, 3.80e-4),
+                },
+                "gt512": {
+                    "err1": (1.00e-4, 3.48e-5),
+                    "relerr1": (
+                        6.92e-4 if dtype == torch.float16 else 6.31e-4,
+                        9.21e-4 if dtype == torch.float16 else 4.71e-4,
+                    ),
+                    "maxerr1": (5.16e-4, 1.68e-4),
+                },
+            }[dim_key]
+
         for metric_name, metric_val in [("err1", err1), ("relerr1", relerr1), ("maxerr1", maxerr1)]:
             mean_val, std_val = thresholds[metric_name]
             limit = mean_val + N_SIGMA * std_val
@@ -906,11 +930,12 @@ def test_gemv_4bit(self, device, dim, dtype, storage_type, double_quant, kind):
 
         # Ratios check that gemv_4bit and matmul_4bit produce consistent results.
         # These are tight bounds on internal consistency, not absolute accuracy.
-        if dtype == torch.float16:
+        # On CPU with AVX512BF16, fp16/fp32 use bf16 arithmetic so get bf16-level bounds.
+        if dtype == torch.float16 and not cpu_bf16_cast:
             assert absratio < 1.005 and absratio > 0.995
             assert relratio < 1.005 and relratio > 0.992
             assert maxratio < 1.005 and maxratio > 0.992
-        elif dtype == torch.float32:
+        elif dtype == torch.float32 and not cpu_bf16_cast:
             assert absratio < 1.005 and absratio > 0.995
             assert relratio < 1.005 and relratio > 0.995
             assert maxratio < 1.005 and maxratio > 0.995
@@ -918,6 +943,10 @@ def test_gemv_4bit(self, device, dim, dtype, storage_type, double_quant, kind):
             assert absratio < 1.005 and absratio > 0.995
             assert relratio < 1.05 and relratio > 0.96
             assert maxratio < 1.05 and maxratio > 0.97
+        elif cpu_bf16_cast:
+            assert absratio < 1.02 and absratio > 0.98
+            assert relratio < 1.1 and relratio > 0.90
+            assert maxratio < 1.1 and maxratio > 0.90
 
     @pytest.mark.parametrize("device", get_available_devices())
     @pytest.mark.parametrize("storage_type", ["nf4", "fp4"], ids=["nf4", "fp4"])

From d8550100f8672397b7198be2b16d4159c8fc87db Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Fri, 5 Jun 2026 15:47:38 -0400
Subject: [PATCH 18/25] update build flags

---
 CMakeLists.txt | 58 +++++++++++++++++++++++++++++++-------------------
 1 file changed, 36 insertions(+), 22 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9714f9946..dd984a98e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -355,36 +355,50 @@ set_source_files_properties(${CPP_FILES} PROPERTIES LANGUAGE CXX)
 add_library(bitsandbytes SHARED ${SRC_FILES})
 target_compile_features(bitsandbytes PUBLIC cxx_std_17)
 target_include_directories(bitsandbytes PUBLIC csrc)
+set_target_properties(bitsandbytes PROPERTIES VISIBILITY_INLINES_HIDDEN ON)
 
 if (BUILD_CPU)
+    include(CheckIPOSupported)
+    check_ipo_supported(RESULT ipo_supported OUTPUT ipo_output)
+    if (ipo_supported)
+        set_property(TARGET bitsandbytes PROPERTY INTERPROCEDURAL_OPTIMIZATION TRUE)
+    endif()
+
     if (OpenMP_CXX_FOUND)
         target_link_libraries(bitsandbytes PRIVATE OpenMP::OpenMP_CXX)
         add_definitions(-DHAS_OPENMP)
     endif()
 
-    if ((HOST_ARCH MATCHES "x86_64|amd64") AND (NOT MSVC))
-        include(CheckCXXCompilerFlag)
-        check_cxx_compiler_flag(-mavx512f HAS_AVX512F_FLAG)
-        check_cxx_compiler_flag(-mavx512bf16 HAS_AVX512BF16_FLAG)
-        if (HAS_AVX512F_FLAG)
-            target_compile_options(bitsandbytes PRIVATE -mavx512f)
-            target_compile_options(bitsandbytes PRIVATE -mavx512dq)
-            target_compile_options(bitsandbytes PRIVATE -mavx512bw)
-            target_compile_options(bitsandbytes PRIVATE -mavx512vl)
-        endif()
-        if (HAS_AVX512BF16_FLAG)
-            target_compile_options(bitsandbytes PRIVATE -mavx512bf16)
+    if (NOT MSVC)
+        target_compile_options(bitsandbytes PRIVATE -fno-semantic-interposition)
+
+        if (HOST_ARCH MATCHES "x86_64|amd64")
+            include(CheckCXXCompilerFlag)
+            check_cxx_compiler_flag(-mavx512f HAS_AVX512F_FLAG)
+            check_cxx_compiler_flag(-mavx512bf16 HAS_AVX512BF16_FLAG)
+            if (HAS_AVX512F_FLAG)
+                target_compile_options(
+                    bitsandbytes PRIVATE
+                    -mavx512f
+                    -mavx512dw
+                    -mavx512bw
+                    -mavx512vl
+                )
+            endif()
+            if (HAS_AVX512BF16_FLAG)
+                target_compile_options(bitsandbytes PRIVATE -mavx512bf16)
+            endif()
+            target_compile_options(
+                bitsandbytes PRIVATE
+                -mprefer-vector-width=256
+                -mfma
+                -mavx2
+                -mf16c
+                -mlzcnt
+                -mbmi
+                -mbmi2
+            )
         endif()
-        target_compile_options(
-            bitsandbytes PRIVATE
-            -mprefer-vector-width=256
-            -mfma
-            -mavx2
-            -mf16c
-            -mlzcnt
-            -mbmi
-            -mbmi2
-        )
     endif()
 endif()
 

From b8b328cba72ba02a8c94e5c287bfe80079010170 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Fri, 5 Jun 2026 15:51:13 -0400
Subject: [PATCH 19/25] update build flags

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index dd984a98e..ede840b7c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -380,8 +380,8 @@ if (BUILD_CPU)
                 target_compile_options(
                     bitsandbytes PRIVATE
                     -mavx512f
-                    -mavx512dw
                     -mavx512bw
+                    -mavx512dq
                     -mavx512vl
                 )
             endif()

From 2ebf82158b5a71109baf042f0e57a670186af8c1 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Fri, 5 Jun 2026 16:13:22 -0400
Subject: [PATCH 20/25] fix windows

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ede840b7c..326f7d46f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -360,7 +360,7 @@ set_target_properties(bitsandbytes PROPERTIES VISIBILITY_INLINES_HIDDEN ON)
 if (BUILD_CPU)
     include(CheckIPOSupported)
     check_ipo_supported(RESULT ipo_supported OUTPUT ipo_output)
-    if (ipo_supported)
+    if (ipo_supported AND NOT MSVC)
         set_property(TARGET bitsandbytes PROPERTY INTERPROCEDURAL_OPTIMIZATION TRUE)
     endif()
 

From 7dad0e6c4d8ae3e87b117f5d3d3a7327c07f649f Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Mon, 8 Jun 2026 16:00:53 -0400
Subject: [PATCH 21/25] Update build flag

---
 CMakeLists.txt | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 326f7d46f..3d420edb1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -370,7 +370,9 @@ if (BUILD_CPU)
     endif()
 
     if (NOT MSVC)
-        target_compile_options(bitsandbytes PRIVATE -fno-semantic-interposition)
+        if (CMAKE_SYSTEM_NAME STREQUAL "Linux")
+            target_compile_options(bitsandbytes PRIVATE -fno-semantic-interposition)
+        endif()
 
         if (HOST_ARCH MATCHES "x86_64|amd64")
             include(CheckCXXCompilerFlag)

From 77ae5facae153b3c7e5219a23fdd9e8bf69bdf94 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Mon, 8 Jun 2026 16:53:54 -0400
Subject: [PATCH 22/25] Update omp simd hints

---
 csrc/cpu_ops.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp
index e5938a677..327fca531 100644
--- a/csrc/cpu_ops.cpp
+++ b/csrc/cpu_ops.cpp
@@ -473,6 +473,7 @@ void dequantizeBlockwise8bitCpu(
 #ifdef _MSC_VER
 #pragma loop(ivdep)
 #endif
+#pragma omp simd
         for (long long i = block_idx; i < block_end; ++i) {
             float v = code[A[i]] * scale;
             if constexpr (std::is_same<T, bf16_t>::value) {
@@ -595,6 +596,7 @@ void quantize_cpu_impl(float* code, const T* A, float* absmax, unsigned char* ou
 #if defined(_M_ARM64) || defined(__aarch64__)
         absmax_block = neon_absmax<T>(A + block_start, block_len);
 #else
+#pragma omp simd reduction(max:absmax_block)
         for (long long i = block_start; i < block_end; ++i) {
             float val;
             if constexpr (std::is_same<T, float>::value)

From aebfb02ddd70ed2bdfa8252b58c349edd501f538 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Mon, 8 Jun 2026 17:02:49 -0400
Subject: [PATCH 23/25] fix msvc

---
 csrc/cpu_ops.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp
index 327fca531..91cd01168 100644
--- a/csrc/cpu_ops.cpp
+++ b/csrc/cpu_ops.cpp
@@ -470,9 +470,6 @@ void dequantizeBlockwise8bitCpu(
             }
         }
 #else
-#ifdef _MSC_VER
-#pragma loop(ivdep)
-#endif
 #pragma omp simd
         for (long long i = block_idx; i < block_end; ++i) {
             float v = code[A[i]] * scale;

From efc6f4bf632b088a8c74d515529b1454bf33d304 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Tue, 9 Jun 2026 14:28:03 -0400
Subject: [PATCH 24/25] fix lint

---
 csrc/cpu_ops.cpp    | 2 +-
 tests/test_ops.py   | 1 -
 tests/test_optim.py | 1 -
 3 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp
index 91cd01168..dfb9046ac 100644
--- a/csrc/cpu_ops.cpp
+++ b/csrc/cpu_ops.cpp
@@ -593,7 +593,7 @@ void quantize_cpu_impl(float* code, const T* A, float* absmax, unsigned char* ou
 #if defined(_M_ARM64) || defined(__aarch64__)
         absmax_block = neon_absmax<T>(A + block_start, block_len);
 #else
-#pragma omp simd reduction(max:absmax_block)
+#pragma omp simd reduction(max : absmax_block)
         for (long long i = block_start; i < block_end; ++i) {
             float val;
             if constexpr (std::is_same<T, float>::value)
diff --git a/tests/test_ops.py b/tests/test_ops.py
index 6cae6706c..3550c0b6f 100644
--- a/tests/test_ops.py
+++ b/tests/test_ops.py
@@ -118,7 +118,6 @@ def test_quantize_blockwise(self, device, dtype, blocksize):
     @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype"))
     @pytest.mark.parametrize("blocksize", [64, 128, 256, 512])
     def test_dequantize_blockwise(self, device, dtype, blocksize):
-
         A = torch.randint(0, 255, (1024, 1024), dtype=torch.uint8, device=device)
         code = bitsandbytes.functional.create_dynamic_map().to(device, dtype=torch.float32)
 
diff --git a/tests/test_optim.py b/tests/test_optim.py
index d96bdc169..0a4b3d6af 100644
--- a/tests/test_optim.py
+++ b/tests/test_optim.py
@@ -24,7 +24,6 @@ def assert_most_approx_close(a, b, rtol=1e-3, atol=1e-3, max_error_count=0):
         torch.testing.assert_close(a, b, rtol=rtol, atol=atol)
 
 
-
 str2optimizers = {}
 
 ## TODO: maybe remove these three.

From fbbf23e1e80052a941570c1180249e87fd6ed80e Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Tue, 9 Jun 2026 14:32:07 -0400
Subject: [PATCH 25/25] update build script

---
 .github/scripts/build-cpu.sh | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/.github/scripts/build-cpu.sh b/.github/scripts/build-cpu.sh
index f75325505..5db76ecce 100644
--- a/.github/scripts/build-cpu.sh
+++ b/.github/scripts/build-cpu.sh
@@ -10,19 +10,10 @@ else
     pip install cmake==3.28.3
 fi
 
-# Temporary: vectorization reporting
-if [[ "${build_os}" == windows* ]]; then
-    EXTRA_CXX_FLAGS="/Qvec-report:2 /Qpar-report:1"
-elif [[ "${build_os:0:5}" == macos ]]; then
-    EXTRA_CXX_FLAGS="-Rpass=loop-vectorize -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize"
-else
-    EXTRA_CXX_FLAGS="-fopt-info-vec-missed -fopt-info-vec -fopt-info-loop-optimized"
-fi
-
 if [ "${build_os:0:5}" == macos ] && [ "${build_arch}" == aarch64 ]; then
-	cmake -DCMAKE_OSX_ARCHITECTURES=arm64 -DCOMPUTE_BACKEND=cpu -DCMAKE_CXX_FLAGS="${EXTRA_CXX_FLAGS}" .
+	cmake -DCMAKE_OSX_ARCHITECTURES=arm64 -DCOMPUTE_BACKEND=cpu .
 else
-	cmake -DCOMPUTE_BACKEND=cpu -DCMAKE_CXX_FLAGS="${EXTRA_CXX_FLAGS}" .
+	cmake -DCOMPUTE_BACKEND=cpu .
 fi
 cmake --build . --config Release