@@ -50,8 +50,8 @@ def __init__(
quant_axes (dict[str, int], optional):
op:axis, which axis to quantize for an op. Default {MatMul: 0, Gather: 1}
customized_weight_config:
- customized weight config for nodes if needed.
- If both customized_weight_config and nodes_to_exclude are set, nodes_to_exclude overwrites customized_weight_config.
+ customized weight config for nodes if needed. It is a dictionary with node name as key,
+ and the value is a dict of customized config.
"""
self.algorithm = algorithm
self.quant_format = quant_format
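The new docstring describes customized_weight_config as a mapping from node name to a per-node config dict. A minimal sketch of such a mapping; the node names and per-node keys below are hypothetical illustrations, not values taken from this diff:

```python
# Hypothetical example of a customized_weight_config mapping: the node names and
# per-node keys here are illustrative assumptions, not taken from this diff.
customized_weight_config = {
    "/layers.0/attn/qkv_proj/MatMul": {"bits": 8, "block_size": 32},
    "/embed_tokens/Gather": {"bits": 4},
}
```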
@@ -81,6 +81,9 @@ def __init__(
Defaults to QuantFormat.QOperator.
op_types_to_quantize (optional):
set of operator types to quantize.
+ customized_weight_config:
+ customized weight config for nodes if needed. It is a dictionary with node name as key,
+ and the value is a dict of customized config.
"""
assert quant_format == QuantFormat.QOperator, "RTN only supports QOperator format"
@@ -220,6 +223,8 @@ def __init__(
set of operator types to quantize.
quant_axes (dict[str, int], optional):
op:axis, which axis to quantize for an op. Default {MatMul: 0, Gather: 1}
+ bits (int, optional):
+ number of bits per element after quantization. Default 4.
"""
super().__init__(
algorithm="DEFAULT",
@@ -654,32 +659,36 @@ def quantize(self, node: NodeProto, graph_stack: list[GraphProto]) -> list[NodeP
b_array_torch = torch.from_numpy(b_array)
if torch.cuda.is_available():
b_array_torch = b_array_torch.cuda()
+
+ bits = self.config.bits
quant_weight_torch, scales_torch, zero_points_torch = self.quantize_internal(
- b_array_torch.T, bits=self.config.bits, group_size=self.config.block_size
+ b_array_torch.T, bits=bits, group_size=self.config.block_size
)
quant_weight_torch = quant_weight_torch.contiguous()
scales_torch = scales_torch.contiguous()
zero_points_torch = zero_points_torch.contiguous()

+ packed_size = 8 // bits  # number of elements packed into one byte
+
packed_torch = torch.zeros(
- (quant_weight_torch.shape[0], quant_weight_torch.shape[1] // 2),
+ (quant_weight_torch.shape[0], quant_weight_torch.shape[1] // packed_size),
dtype=torch.uint8,
device=quant_weight_torch.device,
)
- self.pack_on_row_fast_248bit(packed_torch, quant_weight_torch, self.config.bits)
+ self.pack_on_row_fast_248bit(packed_torch, quant_weight_torch, bits)
scales = scales_torch.cpu().numpy()
zero_points = zero_points_torch.cpu().numpy()
# reshape to the predefined shape in MatmulNbits
scales = scales.reshape(-1)
zero_points = zero_points.reshape(-1)
rows, cols = b_array_torch.shape
block_size = self.config.block_size
- blob_size = block_size // 2
+ blob_size = block_size // packed_size
k_blocks = (rows + block_size - 1) // block_size
packed_torch = packed_torch.reshape(cols, k_blocks, blob_size)

b_quant = onnx.numpy_helper.from_array(packed_torch.cpu().numpy())
- b_quant.name = b_pb.name + "_Q4"
+ b_quant.name = b_pb.name + "_Q" + str(bits)
for input in bs_graph.input:
if input.name == input_b:
bs_graph.input.remove(input)
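The `packed_size = 8 // bits` change generalizes the old hard-coded halving (two 4-bit values per byte) to any bit width that divides 8. Below is a standalone sketch of row-wise packing under that assumption; it is an illustration only, not the actual `pack_on_row_fast_248bit`, and the bit order within a byte is an assumption:

```python
import numpy as np

def pack_rows(q: np.ndarray, bits: int) -> np.ndarray:
    """Pack unsigned quantized values (each < 2**bits) into bytes, row by row.

    Assumes bits divides 8 and each row length is a multiple of 8 // bits.
    """
    per_byte = 8 // bits  # 2 values per byte for 4 bits, 1 for 8 bits
    rows, cols = q.shape
    grouped = q.astype(np.uint8).reshape(rows, cols // per_byte, per_byte)
    packed = np.zeros((rows, cols // per_byte), dtype=np.uint8)
    for i in range(per_byte):
        # value i of each group occupies bit positions [i*bits, (i+1)*bits)
        packed |= (grouped[:, :, i] << (i * bits)).astype(np.uint8)
    return packed

# Under this bit order, pack_rows(np.array([[1, 2, 3, 4]], dtype=np.uint8), bits=4)
# yields [[0x21, 0x43]]; with bits=8 the packed array equals the input.
```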
@@ -699,21 +708,21 @@ def quantize(self, node: NodeProto, graph_stack: list[GraphProto]) -> list[NodeP
rows, cols = b_array.shape
kwargs["K"] = rows
kwargs["N"] = cols
- kwargs["bits"] = self.config.bits
+ kwargs["bits"] = bits
kwargs["block_size"] = self.config.block_size

- matmul_q4_node = onnx.helper.make_node(
+ matmul_q_node = onnx.helper.make_node(
"MatMulNBits",
inputs=input_names,
outputs=[node.output[0]],
- name=node.name + "_Q4" if node.name else "",
+ name=node.name + "_Q" + str(bits) if node.name else "",
domain="com.microsoft",
**kwargs,
)

logger.info(f"complete quantization of {node.name} ...")

- return [matmul_q4_node]
+ return [matmul_q_node]


def get_initializer(name, graph_path: list[GraphProto]) -> tuple[TensorProto, GraphProto]:
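The attributes set here (K, N, bits, block_size) determine the packed buffer geometry computed in the previous hunk. A small worked sketch using the same formulas as the diff; the concrete K, N, and block_size values are arbitrary illustrations:

```python
# Worked example of the buffer geometry used above (same formulas as in the diff).
def matmul_nbits_shapes(K: int, N: int, block_size: int, bits: int):
    packed_size = 8 // bits                        # elements per byte
    blob_size = block_size // packed_size          # bytes per quantized block
    k_blocks = (K + block_size - 1) // block_size  # blocks along the K dimension
    return k_blocks, blob_size, (N, k_blocks, blob_size)

print(matmul_nbits_shapes(K=4096, N=4096, block_size=32, bits=4))  # (128, 16, (4096, 128, 16))
print(matmul_nbits_shapes(K=4096, N=4096, block_size=32, bits=8))  # (128, 32, (4096, 128, 32))
```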
@@ -761,7 +770,7 @@ def qbits_block_quant(self, fp32weight: npt.ArrayLike) -> tuple[np.ndarray, np.n
packed, fp32weight, scales, zero_point, block_size, cols, rows, self.config.is_symmetric
)
else:
- # QDQ format only support 4 bits quantization
+ assert qbits == 4, "QDQ format only support 4 bits quantization"
packed = np.zeros((rows * cols + 1) // 2, dtype="uint8")
zero_point = np.zeros((cols * k_blocks + 1) // 2, dtype="uint8")
scales = np.zeros((k_blocks, cols), dtype=fp32weight.dtype)
@@ -1095,14 +1104,13 @@ def quantize_awq(self, model: ModelProto | str) -> ModelProto:
return quantized_model


- # TODO(fajin): change class name
- class MatMul4BitsQuantizer:
+ class MatMulNBitsQuantizer:
"""
Target node:    QOperator node:         QDQ nodes:
MatMul          MatMulNBits             DeQuantizeLinear -> MatMul
Gather          GatherBlockQuantized    Gather, Gather, Gather (optional) -> DequantizeLinear

- Perform 4b quantization of constant weights for target nodes.
+ Perform 4/8 bits quantization of constant weights for target nodes.
If algo_config.quant_format is QOperator:
- nodes are replaced by the corresponding QOperator nodes.
- quantized weights are stored in the contrib ops.
@@ -1114,6 +1122,7 @@ class MatMul4BitsQuantizer:
Note:
- for quantized gather, the memory usage of "DequantizeLinear + Gather" is the same as the original Gather
during runtime. Therefore it is not recommended.
+ - when a node is in nodes_to_exclude, the node configuration in algo_config.customized_weight_config will be ignored.
"""

def __init__(
@@ -1148,8 +1157,13 @@ def __init__(
quant_format=quant_format,
op_types_to_quantize=op_types_to_quantize,
quant_axes=quant_axes,
+ bits=4,  # default to 4 bits
)
+
self.algo_config = algo_config
+ if hasattr(self.algo_config, "bits"):
+ assert self.algo_config.bits in [4, 8], "Only support 4 or 8 bits quantization"
+
if algo_config.algorithm == "HQQ":
self.node_quantizer = HQQWeightOnlyQuantizer(self.algo_config)
elif algo_config.algorithm == "DEFAULT":
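With the rename and the new bits field, end-to-end use of the quantizer might look like the sketch below. The import path, the DefaultWeightOnlyQuantConfig name, the process() call, and the model attribute are assumptions based on the surrounding tool rather than lines shown in this diff:

```python
# Usage sketch under stated assumptions: the module path, DefaultWeightOnlyQuantConfig,
# process(), and quant.model are not shown in this diff and are assumed here.
import onnx

from onnxruntime.quantization.matmul_4bits_quantizer import (  # assumed module path
    DefaultWeightOnlyQuantConfig,
    MatMulNBitsQuantizer,
)

model = onnx.load("model.onnx")  # hypothetical input model
algo_config = DefaultWeightOnlyQuantConfig(block_size=32, is_symmetric=True, bits=8)
quant = MatMulNBitsQuantizer(model, algo_config=algo_config)
quant.process()  # replaces eligible MatMul/Gather weights with quantized equivalents
onnx.save(quant.model.model, "model_q8.onnx")  # assumed: quant.model wraps the ModelProto
```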
@@ -1511,7 +1525,7 @@ def parse_args():
else:
raise ValueError(f"Unsupported quantization method: {args.quant_method}")

- quant = MatMul4BitsQuantizer(
+ quant = MatMulNBitsQuantizer(
model=model,
accuracy_level=args.accuracy_level,
nodes_to_exclude=args.nodes_to_exclude,