From 0a4493145386382e7238e6b807fd1ea5ae46a3fc Mon Sep 17 00:00:00 2001
From: kathsucurry <sucipto.kathleen@gmail.com>
Date: Wed, 10 Jun 2026 20:41:36 -0400
Subject: [PATCH 1/2] Fix: ensure absmax_offset is of type float32 before
 passing to gemm_4bit kernel

---
 bitsandbytes/backends/cuda/ops.py |  5 +++-
 tests/test_ops.py                 | 44 +++++++++++++++++++++++++++++++
 2 files changed, 48 insertions(+), 1 deletion(-)

diff --git a/bitsandbytes/backends/cuda/ops.py b/bitsandbytes/backends/cuda/ops.py
index 7825d6585..09d2e2244 100644
--- a/bitsandbytes/backends/cuda/ops.py
+++ b/bitsandbytes/backends/cuda/ops.py
@@ -867,6 +867,9 @@ def _(
         else:
             raise RuntimeError(f"unsupported dtype {A.dtype}")
 
+        # Offset is expected to be a float32 tensor.
+        absmax_offset_f32 = absmax_offset.to(dtype=torch.float32) if absmax_offset is not None else None
+
         with _cuda_device_of(A):
             fn(
                 A.data_ptr(),
@@ -874,7 +877,7 @@ def _(
                 absmax.data_ptr(),
                 absmax_8bit.data_ptr() if absmax_8bit is not None else None,
                 absmax_code.data_ptr() if absmax_code is not None else None,
-                absmax_offset.data_ptr() if absmax_offset is not None else None,
+                absmax_offset_f32.data_ptr() if absmax_offset_f32 is not None else None,
                 out.data_ptr(),
                 bias.data_ptr() if bias is not None else None,
                 M,
diff --git a/tests/test_ops.py b/tests/test_ops.py
index 3550c0b6f..7d30cd351 100644
--- a/tests/test_ops.py
+++ b/tests/test_ops.py
@@ -333,6 +333,50 @@ def test_gemm_4bit(self, device, dtype, quant_type, compress_statistics, has_bia
                 kwargs={"bias": bias},
             )
 
+    @pytest.mark.parametrize("device", get_available_devices())
+    @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16], ids=describe_dtype)
+    @pytest.mark.parametrize("offset_dtype", [torch.float16, torch.bfloat16], ids=describe_dtype)
+    def test_gemm_4bit_non_float32_offset(self, device, dtype, offset_dtype):
+        """Regression test: offset tensors not in float32 must still produce correct results.
+
+        Pre-quantized models (e.g. Unsloth bnb-4bit) may store qs.offset in non-float32 dtype.
+        """
+        N, K, blocksize = 64, 64, 64
+        A = torch.randn(4, K, dtype=dtype, device=device)
+        B = torch.randn(N, K, dtype=dtype, device=device)
+        B_q, qs = bitsandbytes.functional.quantize_4bit(
+            B, blocksize=blocksize, quant_type="nf4", compress_statistics=True
+        )
+
+        # Simulate a pre-quantized model where offset may not be float32.
+        offset_non_f32 = qs.offset.to(dtype=offset_dtype)
+
+        # Reference: explicitly use the rounded float32 value.
+        offset_as_f32 = offset_non_f32.to(dtype=torch.float32)
+        ref = torch.ops.bitsandbytes.gemm_4bit.default(
+            A,
+            B_q,
+            list(B.shape),
+            qs.state2.absmax,
+            blocksize,
+            "nf4",
+            absmax_8bit=qs.absmax,
+            absmax_code=qs.state2.code,
+            absmax_offset=offset_as_f32,
+        )
+        out = torch.ops.bitsandbytes.gemm_4bit.default(
+            A,
+            B_q,
+            list(B.shape),
+            qs.state2.absmax,
+            blocksize,
+            "nf4",
+            absmax_8bit=qs.absmax,
+            absmax_code=qs.state2.code,
+            absmax_offset=offset_non_f32,
+        )
+        torch.testing.assert_close(out, ref)
+
 
 class TestNonContiguousInputs:
     """Regression tests for #1342 and #1690: quantization must handle non-contiguous tensors correctly."""

From 74a2010595e97627a2cc8e21abf7c6ccccd53b25 Mon Sep 17 00:00:00 2001
From: Kathleen <sucipto.kathleen@gmail.com>
Date: Thu, 11 Jun 2026 12:27:08 -0400
Subject: [PATCH 2/2] Fix test docstring

Co-authored-by: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
---
 tests/test_ops.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/tests/test_ops.py b/tests/test_ops.py
index 7d30cd351..69589dcc0 100644
--- a/tests/test_ops.py
+++ b/tests/test_ops.py
@@ -337,10 +337,7 @@ def test_gemm_4bit(self, device, dtype, quant_type, compress_statistics, has_bia
     @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16], ids=describe_dtype)
     @pytest.mark.parametrize("offset_dtype", [torch.float16, torch.bfloat16], ids=describe_dtype)
     def test_gemm_4bit_non_float32_offset(self, device, dtype, offset_dtype):
-        """Regression test: offset tensors not in float32 must still produce correct results.
-
-        Pre-quantized models (e.g. Unsloth bnb-4bit) may store qs.offset in non-float32 dtype.
-        """
+        """Regression test: offset tensors not in float32 must still produce correct results."""
         N, K, blocksize = 64, 64, 64
         A = torch.randn(4, K, dtype=dtype, device=device)
         B = torch.randn(N, K, dtype=dtype, device=device)