
Commit 4f11061

Enable AWQ on Intel GPU.
1 parent 45e545e commit 4f11061

File tree

test/quantization/test_quant_primitives.py
torchao/dtypes/uintx/int4_xpu_layout.py
torchao/prototype/awq/api.py
torchao/prototype/awq/example.py
torchao/quantization/utils.py

5 files changed: +42 -38 lines

test/quantization/test_quant_primitives.py

Lines changed: 2 additions & 2 deletions

@@ -135,7 +135,7 @@ def _groupwise_affine_quantize_tensor_from_qparams(
    if TORCH_VERSION_AT_LEAST_2_5:
        if (not (check_cpu_version(w.device))) and (not (check_xpu_version(w.device))):
            w_int4x8 = (w_int4x8[::, ::2] << 4 | w_int4x8[::, 1::2]).to(torch.uint8)
-       if (check_xpu_version(w.device)):
+       if check_xpu_version(w.device):
            w_int4x8 = (w_int4x8[::, 1::2] << 4 | w_int4x8[::, ::2]).to(torch.uint8)

    return w_int4x8

@@ -732,7 +732,7 @@ def test_groupwise_affine_dequantize_tensor_from_qparams(self):
                not (check_xpu_version(input.device))
            ):
                input_tmp = (input[::, ::2] << 4 | input[::, 1::2]).to(torch.uint8)
-           if (check_xpu_version(input.device)):
+           if check_xpu_version(input.device):
                input_tmp = (input[::, 1::2] << 4 | input[::, ::2]).to(torch.uint8)
            w_bf16 = groupwise_affine_dequantize_tensor_from_qparams(
                input_tmp, scales, zeros, n_bit, groupsize, zero_point_domain
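The functional change in this test helper mirrors the layout difference the commit introduces: on CPU/CUDA two int4 values are packed with the even column in the high nibble, while the XPU path puts the odd column in the high nibble. A minimal self-contained sketch of the two packing orders (plain tensors only, no torchao helpers; the round-trip check is illustrative, not from the commit):

import torch

# Toy int4 values (0..15) stored in an int32 tensor with an even number of columns.
w = torch.randint(0, 16, (2, 8), dtype=torch.int32)

# Non-XPU order: even columns land in the high nibble of each byte.
packed_default = (w[::, ::2] << 4 | w[::, 1::2]).to(torch.uint8)

# XPU order (this commit): odd columns land in the high nibble instead.
packed_xpu = (w[::, 1::2] << 4 | w[::, ::2]).to(torch.uint8)

# Unpacking must use the matching order; here we invert the XPU packing.
recovered = torch.empty_like(w)
recovered[::, 1::2] = (packed_xpu >> 4).to(torch.int32)   # high nibble -> odd columns
recovered[::, ::2] = (packed_xpu & 0x0F).to(torch.int32)  # low nibble  -> even columns
assert torch.equal(recovered, w)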

torchao/dtypes/uintx/int4_xpu_layout.py

Lines changed: 9 additions & 9 deletions

@@ -50,9 +50,9 @@ def _linear_bf16_act_uint4_weight_float_zero_check(input_tensor, weight_tensor,


 def _linear_bf16_act_uint4_weight_float_zero_impl(input_tensor, weight_tensor, bias):
-    assert weight_tensor.block_size[0] == 1, (
-        f"Requires groupwise quantization, got block_size: {weight_tensor.block_size}"
-    )
+    assert (
+        weight_tensor.block_size[0] == 1
+    ), f"Requires groupwise quantization, got block_size: {weight_tensor.block_size}"
     assert input_tensor.shape[-1] == weight_tensor.shape[1], (
         f"need input_tensor shape: {input_tensor.shape} final"
         f"dim to match weight_tensor shape: {weight_tensor.shape} second dim "

@@ -105,9 +105,9 @@ def _linear_fp_act_uint4_weight_int8_zero_check(input_tensor, weight_tensor, bia


 def _linear_fp_act_uint4_weight_int8_zero_impl(input_tensor, weight_tensor, bias):
-    assert weight_tensor.block_size[0] == 1, (
-        f"Requires groupwise quantization, got block_size: {weight_tensor.block_size}"
-    )
+    assert (
+        weight_tensor.block_size[0] == 1
+    ), f"Requires groupwise quantization, got block_size: {weight_tensor.block_size}"
     assert input_tensor.shape[-1] == weight_tensor.shape[1], (
         f"need input_tensor shape: {input_tensor.shape} final"
         f"dim to match weight_tensor shape: {weight_tensor.shape} second dim "

@@ -243,9 +243,9 @@ def from_plain(
        assert isinstance(_layout, Int4XPULayout)

        if TORCH_VERSION_AT_LEAST_2_8:
-           assert int_data.dtype == torch.int32, (
-               "torch.ops.aten._convert_weight_to_int4pack_for_cpu expects `int32` dtype"
-           )
+           assert (
+               int_data.dtype == torch.int32
+           ), "torch.ops.aten._convert_weight_to_int4pack_for_cpu expects `int32` dtype"
            packed_weight = (int_data[::, 1::2] << 4 | int_data[::, ::2]).to(
                torch.uint8
            )
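For context on the reformatted asserts: `block_size[0] == 1` is what makes the quantization groupwise, i.e. block_size = (1, group_size), so each row of the weight is split into groups of group_size columns that share one scale and one zero point. A hedged sketch of that dequantization pattern (toy shapes and a float zero point; not the torchao kernel path):

import torch

# Toy groupwise setup: block_size = (1, group_size) over an (out, in) weight.
out_features, in_features, group_size = 4, 16, 8
w_q = torch.randint(0, 16, (out_features, in_features), dtype=torch.int32)
scales = torch.rand(out_features, in_features // group_size, dtype=torch.bfloat16)
zeros = torch.full_like(scales, 8.0)  # float zero point, as in the bf16-act path above

# Each group of `group_size` columns shares one (scale, zero) pair.
w_grouped = w_q.reshape(out_features, -1, group_size)
w_deq = (w_grouped - zeros.unsqueeze(-1)) * scales.unsqueeze(-1)
w_deq = w_deq.reshape(out_features, in_features).to(torch.bfloat16)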

torchao/prototype/awq/api.py

Lines changed: 10 additions & 10 deletions

@@ -38,9 +38,9 @@
     AWQObserver,
 )

-assert len(_DTYPE_TO_BIT_WIDTH) > 0, (
-    "Error importing low bit torch.uint dtypes. Please upgrade to torch 2.3+"
-)
+assert (
+    len(_DTYPE_TO_BIT_WIDTH) > 0
+), "Error importing low bit torch.uint dtypes. Please upgrade to torch 2.3+"


 def insert_awq_observer_(

@@ -63,9 +63,9 @@ def insert_awq_observer_(
        group_size: Quantization granularity. Use -1 for channel wise quantization
    """
    _is_linear = lambda m, fqn: isinstance(m, torch.nn.Linear)
-   assert quant_dtype in _DTYPE_TO_BIT_WIDTH or quant_dtype == torch.uint8, (
-       "Invalid quant_dtype. Please use torch.uint1 .. torch.uint8"
-   )
+   assert (
+       quant_dtype in _DTYPE_TO_BIT_WIDTH or quant_dtype == torch.uint8
+   ), "Invalid quant_dtype. Please use torch.uint1 .. torch.uint8"
    # AQT config
    mapping_type = MappingType.ASYMMETRIC
    quantization_granularity = PerGroup(group_size)

@@ -137,10 +137,10 @@ def _awq_uintx_transform(
    torchao.quantization.utils.recommended_inductor_config_setter()
    observed_linear = module

-   assert quant_dtype in _DTYPE_TO_BIT_WIDTH or quant_dtype == torch.uint8, (
-       "Invalid quant_dtype. Please use torch.uint1 .. torch.uint8"
-   )
-
+   assert (
+       quant_dtype in _DTYPE_TO_BIT_WIDTH or quant_dtype == torch.uint8
+   ), "Invalid quant_dtype. Please use torch.uint1 .. torch.uint8"
+
    equalization_scale = observed_linear.act_obs.calculate_qparams()
    # AQT config
    if quant_dtype == torch.uint4:
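These asserts only changed formatting, but the guard itself is worth spelling out: `_DTYPE_TO_BIT_WIDTH` maps the sub-byte `torch.uint1` .. `torch.uint7` dtypes (available in torch 2.3+) to their bit widths, and `torch.uint8` is accepted separately. An illustrative stand-in mapping (not imported from torchao):

import torch

# Stand-in for torchao's _DTYPE_TO_BIT_WIDTH; the real mapping is built from
# the torch.uintX shell dtypes introduced in torch 2.3.
_dtype_to_bit_width = {
    torch.uint1: 1, torch.uint2: 2, torch.uint3: 3, torch.uint4: 4,
    torch.uint5: 5, torch.uint6: 6, torch.uint7: 7,
}

quant_dtype = torch.uint4
assert quant_dtype in _dtype_to_bit_width or quant_dtype == torch.uint8, (
    "Invalid quant_dtype. Please use torch.uint1 .. torch.uint8"
)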

torchao/prototype/awq/example.py

Lines changed: 6 additions & 2 deletions

@@ -232,7 +232,9 @@ def wikitext2_ppl(
        use_hqq = "hqq" in quant
        print(f"running {quant_dtype} quantization")
        t0 = time.time()
-       awq_uintx_config = awq_uintx(quant_dtype=quant_dtype, group_size=group_size, use_hqq=use_hqq)
+       awq_uintx_config = awq_uintx(
+           quant_dtype=quant_dtype, group_size=group_size, use_hqq=use_hqq
+       )
        if "xpu" in device:
            awq_uintx_config.layout = Int4XPULayout()
        quantize_(

@@ -248,7 +250,9 @@ def wikitext2_ppl(
        group_size = int(quant.split("-")[1])
        use_hqq = "hqq" in quant
        print(f"running {quant} quantization with group size {group_size}")
-       int4_weight_only_config = int4_weight_only(group_size=group_size, use_hqq=use_hqq)
+       int4_weight_only_config = int4_weight_only(
+           group_size=group_size, use_hqq=use_hqq
+       )
        if "xpu" in device:
            int4_weight_only_config.layout = Int4XPULayout()
        quantize_(model, int4_weight_only_config)
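Putting the example's XPU branch together, here is a hedged end-to-end sketch of the int4 weight-only path (the toy model, sizes, and group size are placeholders; the AWQ path shown in the first hunk additionally needs calibration via insert_awq_observer_ before quantize_):

import torch
from torchao.dtypes import Int4XPULayout
from torchao.quantization import int4_weight_only, quantize_

device = "xpu"
# Placeholder model; int4 weight-only expects bfloat16 linear weights.
model = torch.nn.Sequential(torch.nn.Linear(128, 128)).to(device, torch.bfloat16)

int4_weight_only_config = int4_weight_only(group_size=64, use_hqq=False)
if "xpu" in device:
    # Route the packed int4 weights through the Intel GPU layout this commit enables.
    int4_weight_only_config.layout = Int4XPULayout()

quantize_(model, int4_weight_only_config)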

torchao/quantization/utils.py

Lines changed: 15 additions & 15 deletions

@@ -231,19 +231,20 @@ def quant_int8_per_token_matmul(
      Y_i_j_fp32 = sx * sw dot(X_i, W_j)
    """

-   assert x_vals_int8.dtype == torch.int8, (
-       f"x dtype {x_vals_int8.dtype} not yet supported"
-   )
-   assert w_vals_int8_t.dtype == torch.int8, (
-       f"w dtype {w_vals_int8_t.dtype} not yet supported"
-   )
-
-   assert x_scales.dtype in [
-       torch.float,
-       torch.bfloat16,
-   ], (
-       f"x_scales needs to be a torch.float32 or torch.bfloat16 but got {x_scales.dtype}"
-   )
+   assert (
+       x_vals_int8.dtype == torch.int8
+   ), f"x dtype {x_vals_int8.dtype} not yet supported"
+   assert (
+       w_vals_int8_t.dtype == torch.int8
+   ), f"w dtype {w_vals_int8_t.dtype} not yet supported"
+
+   assert (
+       x_scales.dtype
+       in [
+           torch.float,
+           torch.bfloat16,
+       ]
+   ), f"x_scales needs to be a torch.float32 or torch.bfloat16 but got {x_scales.dtype}"

    #
    # 1. do the matrix form of dot(X_i, W_j)
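The asserts above only changed formatting; the docstring formula they guard, Y_i_j_fp32 = sx * sw dot(X_i, W_j), is easy to check numerically. A hedged toy version with per-token activation scales and per-channel weight scales (plain int32 matmul, not the fused kernel torchao dispatches to):

import torch

# Toy shapes: 4 tokens, hidden size 16, 8 output channels.
x_vals_int8 = torch.randint(-128, 128, (4, 16), dtype=torch.int8)
w_vals_int8_t = torch.randint(-128, 128, (16, 8), dtype=torch.int8)
x_scales = torch.rand(4, 1)   # one scale per token (row of X)
w_scales = torch.rand(1, 8)   # one scale per output channel (column of W^T)

# dot(X_i, W_j) accumulated in int32, then rescaled per (token, channel).
acc = x_vals_int8.to(torch.int32) @ w_vals_int8_t.to(torch.int32)
y_fp32 = acc.to(torch.float32) * x_scales * w_scales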
@@ -488,8 +489,7 @@ def groupwise_affine_dequantize_tensor_from_qparams(
        dtype=torch.int32,
        device=w_int4x8.device,
    )
-   if (not (check_xpu_version(w_int4x8.device))
-   ):
+   if not (check_xpu_version(w_int4x8.device)):
        w_int32[::, ::2] = high_bits
        w_int32[::, 1::2] = low_bits
    else:
