Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 24 additions & 1 deletion auto_round/data_type/nvfp.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
from auto_round.data_type.fp8 import float8_e4m3fn_ste
from auto_round.data_type.register import register_dtype
from auto_round.data_type.utils import reshape_pad_tensor_by_group_size, revert_tensor_by_pad, round_ste
from auto_round.logger import logger


# taken from
Expand Down Expand Up @@ -206,6 +205,20 @@ def ref_fp4_quant(x, global_scale, block_size=16, v=0, max_scale=1.0):
return (cast_to_fp4(clipped_x) * get_reciprocal(output_scale)).reshape(m, n), scale


def ref_fp4_quant_v3(x, global_scale, block_size=16, v=0, max_scale=1.0):
    """Reference FP4 (E2M1) quant-dequant of a 2-D tensor with per-row scales.

    Unlike the sibling ``ref_fp4_quant``, the scale here is derived from a
    bfloat16 cast of the per-row absmax (no UE5M3 clipping/casting step).

    Args:
        x: 2-D input tensor (one group per row; rows are the quantization groups).
        global_scale: scalar (or float32 tensor) second-level scale. NOTE: it
            cancels out of ``output_scale`` below, so it only affects the
            returned ``scale``, not the quant-dequant values.
        block_size: unused in this implementation; kept for signature parity
            with ``ref_fp4_quant`` (grouping is assumed to be done by the caller).
        v: additive perturbation applied to the scaled values before rounding
            (used by the rounding-optimization path).
        max_scale: scalar or per-row tensor multiplier applied to the absmax.

    Returns:
        Tuple of (quant-dequantized tensor reshaped to ``(m, n)``, scale tensor).
    """
    # global_scale must be fp32 if it is a tensor; plain Python floats are fine.
    assert (not isinstance(global_scale, torch.Tensor)) or global_scale.dtype == torch.float32
    assert x.ndim == 2
    m, n = x.shape
    if isinstance(max_scale, torch.Tensor):
        # Broadcast a per-row max_scale across each row's elements.
        max_scale = max_scale.unsqueeze(dim=-1).to(x.device)
    # Per-row absolute maximum, optionally shrunk/grown by max_scale.
    vec_max = torch.max(torch.abs(x), dim=-1, keepdim=True)[0] * max_scale
    # Scale maps the row absmax onto the FP4 E2M1 max representable value (6.0).
    # The bf16 cast here is deliberate: the stored scale precision is bfloat16.
    scale = global_scale * (vec_max.to(torch.bfloat16) * get_reciprocal(FLOAT4_E2M1_MAX))
    # output_scale = 1 / (scale / global_scale); the global_scale factor cancels,
    # leaving the pure per-row quantization step.
    output_scale = get_reciprocal(scale * get_reciprocal(global_scale))
    scaled_x = x.to(torch.float32) * output_scale + v
    # Clamp to the FP4 E2M1 representable range [-6, 6] before rounding.
    clipped_x = torch.clamp(scaled_x, -6.0, 6.0)
    # Round to FP4 grid, then undo the scaling to return dequantized values.
    return (cast_to_fp4(clipped_x) * get_reciprocal(output_scale)).reshape(m, n), scale


@register_dtype("fp4_v2_with_global_scale")
def fp4_v2_with_global_scale(tensor, bits=4, group_size=16, v=0, tensor_max=None, max_scale=1.0, **kwargs):
assert group_size == 32 or group_size == 16
Expand Down Expand Up @@ -235,6 +248,16 @@ def fp4_v2(tensor, bits=4, group_size=32, v=0, max_scale=1.0, **kwargs):
return qdq_res.to(orig_dtype), scale, None


@register_dtype("fp4_v3")
def fp4_v3(tensor, bits=4, group_size=32, v=0, max_scale=1.0, **kwargs):
    """Quant-dequant *tensor* to FP4 (E2M1) with bf16 per-group scales.

    The tensor is padded/reshaped into groups of ``group_size`` elements,
    quantized via ``ref_fp4_quant_v3`` (bf16 scale path, no UE5M3 step),
    then restored to its original shape and dtype.

    Args:
        tensor: input tensor of any shape.
        bits: nominal bit width; present for registry-signature parity.
        group_size: quantization group length; must be 16 or 32.
        v: additive rounding perturbation forwarded to the reference kernel.
        max_scale: scalar or per-group multiplier on the group absmax.

    Returns:
        Tuple of (quant-dequantized tensor in the original dtype, scale
        tensor, None) matching the registry's (qdq, scale, zp) convention.
    """
    # Match the validation in fp4_v2/fp4_v2_with_global_scale: other group
    # sizes would silently produce unexpected scaling.
    assert group_size == 32 or group_size == 16
    orig_dtype = tensor.dtype
    tensor, orig_shape, pad_len = reshape_pad_tensor_by_group_size(tensor, group_size)
    # No second-level (global) scale in the v3 path; scales stay pure bf16.
    global_scale = 1.0
    qdq_res, scale = ref_fp4_quant_v3(tensor, global_scale, group_size, v, max_scale)
    qdq_res = revert_tensor_by_pad(qdq_res, orig_shape=orig_shape, pad_len=pad_len)
    return qdq_res.to(orig_dtype), scale, None


if __name__ == "__main__":
data = torch.tensor([0.0, 0.25, 0.4, 0.75, 1.25, 1.4, 1.75, 2.5, 2.9, 3.5, 5.0, 5.1, 6.0, 6.2, 8.9])
data1 = cast_to_fp4(data)
Expand Down