
Commit 0cd87aa

jiqing-feng authored
enable quant storage (#1563)
* enable quant storage

Signed-off-by: jiqing-feng <[email protected]>

* fix to numpy

Signed-off-by: jiqing-feng <[email protected]>

---------

Signed-off-by: jiqing-feng <[email protected]>
1 parent 83c147d commit 0cd87aa

4 files changed (+16 −5 lines changed)

bitsandbytes/backends/cpu.py (+1 −2)

@@ -137,8 +137,7 @@ def quantize_4bit(
         if blocksize is None:
             blocksize = 64
         assert_on_cpu([A, absmax, out])
-        assert quant_storage == torch.uint8, "CPU backend only supports uint8 quant_storage"
-        return quantize_4bit_impl(A, absmax, out, blocksize, compress_statistics, quant_type)
+        return quantize_4bit_impl(A, absmax, out, blocksize, compress_statistics, quant_type, quant_storage)

     def dequantize_4bit(
         self,
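With the assertion removed, the CPU backend simply forwards quant_storage to quantize_4bit_impl. A minimal usage sketch, not from the commit: it assumes the multi-backend build dispatches CPU tensors to backends/cpu.py and that bitsandbytes.functional.quantize_4bit exposes the quant_storage keyword; torch.float16 is an illustrative storage choice.

```python
# Hedged sketch: quantize on CPU with a non-uint8 quant_storage, which this
# change enables (the backend previously asserted quant_storage == torch.uint8).
import torch
import bitsandbytes.functional as F

A = torch.randn(1024, 1024)  # CPU tensor

packed, state = F.quantize_4bit(
    A, blocksize=64, quant_type="nf4", quant_storage=torch.float16
)
print(packed.dtype)    # torch.float16 storage instead of torch.uint8

restored = F.dequantize_4bit(packed, state)
print(restored.shape)  # torch.Size([1024, 1024])
```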

bitsandbytes/backends/cpu_xpu_common.py (+13 −1)

@@ -296,6 +296,7 @@ def quantize_4bit_impl(
     blocksize=64,
     compress_statistics=False,
     quant_type="nf4",
+    quant_storage=torch.uint8,
 ) -> Tensor:
     """
     Quantize tensor A in blocks of 4-bit values.
@@ -314,6 +315,8 @@ def quantize_4bit_impl(
         The blocksize used in quantization.
     quant_type : str
         The 4-bit quantization data type {fp4, nf4}, only nf4 is supported now
+    quant_storage: torch.dtype
+        We can use bytes to convert storage type.

     Returns
     -------
@@ -401,6 +404,10 @@ def quantize_4bit_impl(
         quant_type=quant_type,
     )

+    if quant_storage != torch.uint8:
+        bytes_value = out.cpu().numpy().tobytes()
+        out = torch.frombuffer(bytes_value, dtype=quant_storage).to(A.device)
+
     return out.reshape(-1, 1), state

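The conversion reinterprets the packed uint8 buffer as the requested storage dtype without changing the underlying bytes. A standalone sketch of the same round trip (the variable names are illustrative; float16 is used because numpy can represent it):

```python
# Hedged sketch: reinterpret a packed uint8 buffer as a wider storage dtype by
# round-tripping through raw bytes, mirroring the approach in quantize_4bit_impl.
import torch

packed_u8 = torch.randint(0, 256, (1024,), dtype=torch.uint8)  # stand-in for packed nf4 data

# Same bytes, half as many elements (2 bytes per float16 element).
bytes_value = packed_u8.numpy().tobytes()
packed_f16 = torch.frombuffer(bytes_value, dtype=torch.float16)
assert packed_f16.numel() * packed_f16.element_size() == packed_u8.numel()

# Converting back to uint8 recovers the original byte stream exactly,
# which is what the dequantize path relies on before unpacking.
restored_u8 = torch.frombuffer(packed_f16.numpy().tobytes(), dtype=torch.uint8)
assert torch.equal(restored_u8, packed_u8)
```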

@@ -418,7 +425,8 @@ def dequant_8bit(A, offset, quant_state):
     return absmax


-@_maybe_torch_compile
+# Compile will fail in torch.frombuffer
+# @_maybe_torch_compile
 def dequantize_4bit_impl(
     A: Tensor,
     quant_state=None,
@@ -453,6 +461,10 @@ def dequantize_4bit_impl(
453461
"""
454462
transpose = True if A.shape[0] == 1 else False
455463
A = A.reshape(-1)
464+
device = A.device
465+
if A.dtype != torch.uint8:
466+
bytes_value = A.cpu().numpy().tobytes()
467+
A = torch.frombuffer(bytes_value, dtype=torch.uint8).to(device)
456468

457469
if quant_state is None:
458470
assert absmax is not None and out is not None
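Because torch.frombuffer always produces a CPU tensor, the dequantize path records the incoming device first and moves the reinterpreted tensor back afterwards. A minimal helper capturing that pattern; the function name is hypothetical, not from the commit:

```python
# Hedged sketch: normalize packed storage back to uint8 before unpacking,
# preserving the original device. "as_uint8_storage" is an illustrative name.
import torch

def as_uint8_storage(A: torch.Tensor) -> torch.Tensor:
    device = A.device
    if A.dtype == torch.uint8:
        return A
    # torch.frombuffer reads host memory, so stage the bytes through the CPU;
    # this assumes the storage dtype has a numpy equivalent (e.g. float16).
    bytes_value = A.cpu().numpy().tobytes()
    return torch.frombuffer(bytes_value, dtype=torch.uint8).to(device)

# Example: a float16-storage tensor is reinterpreted back to its raw bytes.
packed_f16 = torch.zeros(8, dtype=torch.float16)
print(as_uint8_storage(packed_f16).shape)  # torch.Size([16])
```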

bitsandbytes/backends/xpu.py (+1 −2)

@@ -138,8 +138,7 @@ def quantize_4bit(
         if blocksize is None:
             blocksize = 64
         assert_on_xpu([A, absmax, out])
-        assert quant_storage == torch.uint8, "XPU backend only supports uint8 quant_storage"
-        output = quantize_4bit_impl(A, absmax, out, blocksize, compress_statistics, quant_type)
+        output = quantize_4bit_impl(A, absmax, out, blocksize, compress_statistics, quant_type, quant_storage)
         return output

     def dequantize_4bit(

bitsandbytes/nn/modules.py (+1)

@@ -498,6 +498,7 @@ def set_ipex_linear(self, x: torch.Tensor):
         if (
             (x.device.type in ("cpu", "xpu"))
             and not getattr(self.weight.quant_state, "ipex", False)
+            and self.weight.data.dtype == torch.uint8
             and self.weight.quant_state.shape[1] % self.weight.quant_state.blocksize == 0
             and self.weight.quant_state.quant_type == "nf4"
             and not self.training
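The extra condition keeps the IPEX fast path limited to weights whose packed storage is still uint8; weights quantized with a wider quant_storage fall back to the default path. A hedged sketch of the guard in isolation; the helper name and SimpleNamespace stand-ins are illustrative, not from the module:

```python
# Hedged sketch: the IPEX linear path is only taken when the packed weight
# is stored as uint8 (in addition to the other conditions in set_ipex_linear).
import torch
from types import SimpleNamespace

def should_use_ipex(weight, x: torch.Tensor, training: bool) -> bool:
    qs = weight.quant_state
    return (
        x.device.type in ("cpu", "xpu")
        and not getattr(qs, "ipex", False)
        and weight.data.dtype == torch.uint8          # new check from this commit
        and qs.shape[1] % qs.blocksize == 0
        and qs.quant_type == "nf4"
        and not training
    )

# Example: a weight reinterpreted as float16 storage is excluded from the IPEX path.
qs = SimpleNamespace(ipex=False, shape=(4096, 4096), blocksize=64, quant_type="nf4")
w_u8 = SimpleNamespace(data=torch.empty(8, dtype=torch.uint8), quant_state=qs)
w_f16 = SimpleNamespace(data=torch.empty(4, dtype=torch.float16), quant_state=qs)
x = torch.randn(2, 4096)
print(should_use_ipex(w_u8, x, training=False))   # True
print(should_use_ipex(w_f16, x, training=False))  # False
```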

0 commit comments
