Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
2cd8562
perf: improve int8 on arm64 CPU
matthewdouglas Jun 1, 2026
62427bf
build: temporary add CPU build verbosity
matthewdouglas Jun 1, 2026
f7f1cdb
fix
matthewdouglas Jun 1, 2026
f1f6650
Merge remote-tracking branch 'origin/main' into cpu-perf-improvements
matthewdouglas Jun 1, 2026
ec1e76d
cpu: skip _int_mm when not on avx512.
matthewdouglas Jun 2, 2026
45831bd
MSVC optimization for CPU ops
matthewdouglas Jun 2, 2026
4e2b611
msvc improvement
matthewdouglas Jun 2, 2026
6755f03
cpu: enable openmp:experimental on windows; add back avx2/fma for lin…
matthewdouglas Jun 2, 2026
44e6da6
improve optim test perf
matthewdouglas Jun 2, 2026
c5df7fa
cpu perf: improvements for arm64 8bit blockwise quant/dequant (neon)
matthewdouglas Jun 3, 2026
6223b78
cpu perf: ARM64 NEON improvements for blockwise quantization
matthewdouglas Jun 3, 2026
2adea99
fix msvc arm64 build
matthewdouglas Jun 3, 2026
144edc7
fix
matthewdouglas Jun 3, 2026
4021345
remove dead code
matthewdouglas Jun 3, 2026
ed1db52
x86-64 cpu perf improvement
matthewdouglas Jun 3, 2026
e54ccf8
fix
matthewdouglas Jun 3, 2026
8cc47f0
cpu: update tests
matthewdouglas Jun 4, 2026
365c6d8
x64 avx512 improvements, test improvements
matthewdouglas Jun 4, 2026
49f4293
Merge remote-tracking branch 'refs/remotes/origin/cpu-perf-improvemen…
matthewdouglas Jun 4, 2026
d855010
update build flags
matthewdouglas Jun 5, 2026
b8b328c
update build flags
matthewdouglas Jun 5, 2026
2ebf821
fix windows
matthewdouglas Jun 5, 2026
7dad0e6
Update build flag
matthewdouglas Jun 8, 2026
77ae5fa
Update omp simd hints
matthewdouglas Jun 8, 2026
aebfb02
fix msvc
matthewdouglas Jun 8, 2026
efc6f4b
fix lint
matthewdouglas Jun 9, 2026
fbbf23e
update build script
matthewdouglas Jun 9, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .github/scripts/build-cpu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,11 @@ declare build_os

set -xeuo pipefail

pip install cmake==3.28.3
if [[ "${build_os}" == windows* ]]; then
pip install cmake==3.30.9
else
pip install cmake==3.28.3
fi

if [ "${build_os:0:5}" == macos ] && [ "${build_arch}" == aarch64 ]; then
cmake -DCMAKE_OSX_ARCHITECTURES=arm64 -DCOMPUTE_BACKEND=cpu .
Expand Down
62 changes: 42 additions & 20 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,11 @@ if (BUILD_CPU)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" HOST_ARCH)
if(MSVC)
# Use the experimental OpenMP runtime for persistent thread pool support.
# Requires CMake 3.30+; silently ignored on older CMake versions.
set(OpenMP_RUNTIME_MSVC "experimental")
endif()
find_package(OpenMP)
endif()

Expand Down Expand Up @@ -350,35 +355,52 @@ set_source_files_properties(${CPP_FILES} PROPERTIES LANGUAGE CXX)
add_library(bitsandbytes SHARED ${SRC_FILES})
target_compile_features(bitsandbytes PUBLIC cxx_std_17)
target_include_directories(bitsandbytes PUBLIC csrc)
set_target_properties(bitsandbytes PROPERTIES VISIBILITY_INLINES_HIDDEN ON)

if (BUILD_CPU)
include(CheckIPOSupported)
check_ipo_supported(RESULT ipo_supported OUTPUT ipo_output)
if (ipo_supported AND NOT MSVC)
set_property(TARGET bitsandbytes PROPERTY INTERPROCEDURAL_OPTIMIZATION TRUE)
endif()

if (OpenMP_CXX_FOUND)
target_link_libraries(bitsandbytes PRIVATE OpenMP::OpenMP_CXX)
add_definitions(-DHAS_OPENMP)
endif()

if ((HOST_ARCH MATCHES "x86_64|amd64") AND (NOT MSVC))
include(CheckCXXCompilerFlag)
check_cxx_compiler_flag(-mavx512f HAS_AVX512F_FLAG)
check_cxx_compiler_flag(-mavx512bf16 HAS_AVX512BF16_FLAG)
if (HAS_AVX512F_FLAG)
target_compile_options(bitsandbytes PRIVATE -mavx512f)
target_compile_options(bitsandbytes PRIVATE -mavx512dq)
target_compile_options(bitsandbytes PRIVATE -mavx512bw)
target_compile_options(bitsandbytes PRIVATE -mavx512vl)
if (NOT MSVC)
if (CMAKE_SYSTEM_NAME STREQUAL "Linux")
target_compile_options(bitsandbytes PRIVATE -fno-semantic-interposition)
endif()
if (HAS_AVX512BF16_FLAG)
target_compile_options(bitsandbytes PRIVATE -mavx512bf16)

if (HOST_ARCH MATCHES "x86_64|amd64")
include(CheckCXXCompilerFlag)
check_cxx_compiler_flag(-mavx512f HAS_AVX512F_FLAG)
check_cxx_compiler_flag(-mavx512bf16 HAS_AVX512BF16_FLAG)
if (HAS_AVX512F_FLAG)
target_compile_options(
bitsandbytes PRIVATE
-mavx512f
-mavx512bw
-mavx512dq
-mavx512vl
)
endif()
if (HAS_AVX512BF16_FLAG)
target_compile_options(bitsandbytes PRIVATE -mavx512bf16)
endif()
target_compile_options(
bitsandbytes PRIVATE
-mprefer-vector-width=256
-mfma
-mavx2
-mf16c
-mlzcnt
-mbmi
-mbmi2
)
endif()
target_compile_options(
bitsandbytes PRIVATE
-mprefer-vector-width=256
-mfma
-mavx2
-mlzcnt
-mbmi
-mbmi2
)
endif()
endif()

Expand Down
6 changes: 4 additions & 2 deletions bitsandbytes/backends/cpu/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,10 @@
# However, we can overflow if we use this without AVX512_VNNI support.
# This is fixed in torch 2.6+, so we set this as the minimum to be safe.
# For more information: https://github.com/pytorch/pytorch/pull/136942
# TODO(matthewdouglas): aarch64?
if torch.__version__ >= (2, 6):
#
# Without AVX-512 (including aarch64), torch._int_mm uses a scalar fallback
# that is much slower than fp32 matmul. Only use it when AVX-512 is available.
if torch.__version__ >= (2, 6) and _has_avx512:

@register_kernel("bitsandbytes::int8_linear_matmul", "cpu")
def _(A: torch.Tensor, B: torch.Tensor):
Expand Down
Loading
Loading