@@ -89,26 +89,22 @@ def _linear_bf16_act_uint4_weight_float_zero_impl(input_tensor, weight_tensor, b
    return y.to(orig_dtype)


-def _linear_bf16_act_uint4_weight_int8_zero_check(input_tensor, weight_tensor, bias):
+def _linear_fp_act_uint4_weight_int8_zero_check(input_tensor, weight_tensor, bias):
    return (
-        # input is native bfloat16 tensor
        not is_traceable_wrapper_subclass(input_tensor)
-        and input_tensor.dtype == torch.bfloat16
        and
        # weight is uint4, group quantized tensor_core_tiled tensor impl affine quantized tensor
        isinstance(weight_tensor, AffineQuantizedTensor)
        and _aqt_is_xpu_layout_uint4(weight_tensor)
-        and weight_tensor.dtype == torch.bfloat16
        and len(weight_tensor.shape) == 2
        and weight_tensor.zero_point_domain == ZeroPointDomain.INT
        and weight_tensor.tensor_impl.scale_and_zero is None
-        and weight_tensor.tensor_impl.scale.dtype == torch.bfloat16
        and weight_tensor.tensor_impl.zero.dtype == torch.int8
        and isinstance(weight_tensor._layout, Int4XPULayout)
    )


-def _linear_bf16_act_uint4_weight_int8_zero_impl(input_tensor, weight_tensor, bias):
+def _linear_fp_act_uint4_weight_int8_zero_impl(input_tensor, weight_tensor, bias):
    assert weight_tensor.block_size[0] == 1, (
        f"Requires groupwise quantization, got block_size: {weight_tensor.block_size}"
    )
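The hunk above renames the dispatch check from `_linear_bf16_act_uint4_weight_int8_zero_check` to `_linear_fp_act_uint4_weight_int8_zero_check` and drops the three `torch.bfloat16` dtype conditions, so the XPU uint4-weight/int8-zero path is no longer restricted to bfloat16 activations, weights, and scales. A minimal sketch of the effective change in the dtype part of the predicate, using plain tensors and hypothetical names rather than the torchao types:

```python
# Illustration only (assumed names, not the torchao code): before the change the
# check required a native bfloat16 activation; after it, any floating-point
# activation dtype (bf16, fp16, fp32) passes the dtype part of the check.
import torch

def old_dtype_check(x: torch.Tensor) -> bool:
    return x.dtype == torch.bfloat16   # bf16-only

def new_dtype_check(x: torch.Tensor) -> bool:
    return x.is_floating_point()       # any float dtype

x_fp16 = torch.randn(2, 4, dtype=torch.float16)
print(old_dtype_check(x_fp16), new_dtype_check(x_fp16))  # False True
```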
@@ -129,7 +125,7 @@ def _linear_bf16_act_uint4_weight_int8_zero_impl(input_tensor, weight_tensor, bi
    orig_act_size = act_mat.size()
    orig_dtype = act_mat.dtype

-    act_mat = act_mat.reshape(-1, act_mat.shape[-1]).to(torch.bfloat16)
+    act_mat = act_mat.reshape(-1, act_mat.shape[-1])

    # groupwise int4 quantization
    groupsize = weight_tensor.block_size[1]
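The second hunk makes the matching change in the impl: the flattened activation keeps its incoming dtype instead of being cast to bfloat16, and the existing `return y.to(orig_dtype)` at the top of this diff still restores the caller's dtype on the way out. A small sketch of the reshape behaviour, with an assumed activation shape:

```python
# Sketch with assumed shapes (not the torchao code): flatten the activation to
# 2-D without forcing bfloat16, as the second hunk does.
import torch

act_mat = torch.randn(2, 3, 8, dtype=torch.float16)
orig_dtype = act_mat.dtype

# old: act_mat = act_mat.reshape(-1, act_mat.shape[-1]).to(torch.bfloat16)
act_mat = act_mat.reshape(-1, act_mat.shape[-1])   # new: dtype preserved

assert act_mat.shape == (6, 8) and act_mat.dtype == orig_dtype
```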