
Commit e226af7

Propagate quantization mode in quantized layers (#3133)

1 parent 43f4a74 commit e226af7

File tree

2 files changed (+46, -7 lines)
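The user-visible effect of the change: a layer quantized with a non-default mode keeps that mode when it is converted to its sharded counterparts. A minimal sketch of that behavior, under the assumptions that shard_linear is importable from mlx.nn.layers.distributed and that the mxfp8 mode is available in the installed MLX version:

# Hedged usage sketch; the import path and parameter values are assumptions
# mirroring the test added in this commit.
import mlx.core as mx
import mlx.nn as nn
from mlx.nn.layers.distributed import shard_linear  # assumed import path

lin = nn.Linear(512, 512)
qlin = lin.to_quantized(group_size=32, bits=8, mode="mxfp8")

slin = shard_linear(qlin, "all-to-sharded")
print(slin.mode)           # "mxfp8": the quantization mode travels with the layer
print(slin.get("biases"))  # None: mxfp8 produces no per-group biases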

python/mlx/nn/layers/distributed.py

Lines changed: 24 additions & 6 deletions
@@ -371,6 +371,8 @@ class QuantizedAllToShardedLinear(Module):
             weight. See :func:`~mlx.core.quantize`. Default: ``64``.
         bits (int, optional): The bit width to use for the quantized weight.
             See :func:`~mlx.core.quantize`. Default: ``4``.
+        mode (str, optional): The quantization method to use (see
+            :func:`~mlx.core.quantize`). Default: ``"affine"``.
         group (mx.distributed.Group, optional): The sharding will happen across
             this group. If not set then the global group is used. Default is
             ``None``.
@@ -383,13 +385,15 @@ def __init__(
         bias: bool = True,
         group_size: int = 64,
         bits: int = 4,
+        mode: str = "affine",
         group: Optional[mx.distributed.Group] = None,
     ):
         super().__init__()
 
         # Quantization config
         self.group_size = group_size
         self.bits = bits
+        self.mode = mode
 
         # Initialize the quantized weight
         scale = math.sqrt(1.0 / input_dims)
@@ -406,7 +410,10 @@ def __init__(
             high=scale,
             shape=(output_dims // N, input_dims),
         )
-        self.weight, self.scales, self.biases = mx.quantize(weight, group_size, bits)
+        self.weight, self.scales, *biases = mx.quantize(
+            weight, group_size, bits, mode=mode
+        )
+        self.biases = biases[0] if biases else None
 
         # And bias if needed
         if bias:
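The star-unpacking above exists because mx.quantize no longer always returns three arrays: affine quantization yields a quantized weight, scales, and biases, while modes such as mxfp8 yield only a weight and scales. A minimal sketch of the difference; the parameter values mirror the test added in this commit, and the exact accepted combinations are version-dependent:

# Sketch of the return-arity difference that the *biases capture handles.
import mlx.core as mx

w = mx.random.normal((64, 64))

affine_out = mx.quantize(w, group_size=32, bits=8, mode="affine")
mxfp8_out = mx.quantize(w, group_size=32, bits=8, mode="mxfp8")

print(len(affine_out))  # 3: quantized weight, scales, biases
print(len(mxfp8_out))   # 2: quantized weight, scales only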
@@ -427,7 +434,7 @@ def _extra_repr(self) -> str:
         out_dims *= self.group.size()
         return (
             f"input_dims={in_dims}, output_dims={out_dims}, bias={'bias' in self}, "
-            f"group_size={self.group_size}, bits={self.bits}"
+            f"group_size={self.group_size}, bits={self.bits}, mode={self.mode}"
         )
 
     def __call__(self, x: mx.array) -> mx.array:
@@ -438,10 +445,11 @@ def __call__(self, x: mx.array) -> mx.array:
             x,
             self["weight"],
             scales=self["scales"],
-            biases=self["biases"],
+            biases=self.get("biases"),
             transpose=True,
             group_size=self.group_size,
             bits=self.bits,
+            mode=self.mode,
         )
         if "bias" in self:
             x = x + self["bias"]
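In the forward pass, the dictionary-style self["biases"] lookup would fail for modes that produce no biases, so self.get("biases") passes None instead and the mode is forwarded to the kernel. A sketch of the corresponding mx.quantized_matmul call, assuming (as this __call__ does) that biases may be None for such modes:

# Sketch: quantized matmul with a mode that has no per-group biases.
import mlx.core as mx

w = mx.random.normal((64, 32))  # (output_dims, input_dims)
w_q, scales, *rest = mx.quantize(w, group_size=32, bits=8, mode="mxfp8")

x = mx.random.normal((4, 32))
y = mx.quantized_matmul(
    x,
    w_q,
    scales=scales,
    biases=rest[0] if rest else None,  # None here: mxfp8 has no biases
    transpose=True,
    group_size=32,
    bits=8,
    mode="mxfp8",
)
print(y.shape)  # (4, 64)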
@@ -465,6 +473,7 @@ def from_quantized_linear(
             hasattr(quantized_linear_layer, "bias"),
             group_size=quantized_linear_layer.group_size,
             bits=quantized_linear_layer.bits,
+            mode=getattr(quantized_linear_layer, "mode", "affine"),
             group=group,
         )
         sl.update(
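The getattr fallback keeps from_quantized_linear working with quantized layers created before a mode attribute existed; anything without the attribute is treated as affine, the only mode such layers could have used. A trivial sketch with a hypothetical stand-in object:

# LegacyQuantizedLinear is a hypothetical stand-in, not an MLX class.
class LegacyQuantizedLinear:
    group_size = 64
    bits = 4

legacy = LegacyQuantizedLinear()
print(getattr(legacy, "mode", "affine"))  # "affine"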
@@ -497,6 +506,8 @@ class QuantizedShardedToAllLinear(Module):
             weight. See :func:`~mlx.core.quantize`. Default: ``64``.
         bits (int, optional): The bit width to use for the quantized weight.
             See :func:`~mlx.core.quantize`. Default: ``4``.
+        mode (str, optional): The quantization method to use (see
+            :func:`~mlx.core.quantize`). Default: ``"affine"``.
         group (mx.distributed.Group, optional): The sharding will happen across
             this group. If not set then the global group is used. Default is
             ``None``.
@@ -509,13 +520,15 @@ def __init__(
         bias: bool = True,
         group_size: int = 64,
         bits: int = 4,
+        mode: str = "affine",
         group: Optional[mx.distributed.Group] = None,
     ):
         super().__init__()
 
         # Quantization config
         self.group_size = group_size
         self.bits = bits
+        self.mode = mode
 
         # Initialize the quantized weight
         scale = math.sqrt(1.0 / input_dims)
@@ -532,7 +545,10 @@ def __init__(
             high=scale,
             shape=(output_dims, input_dims // N),
         )
-        self.weight, self.scales, self.biases = mx.quantize(weight, group_size, bits)
+        self.weight, self.scales, *biases = mx.quantize(
+            weight, group_size, bits, mode=mode
+        )
+        self.biases = biases[0] if biases else None
 
         # And bias if needed
         if bias:
@@ -552,18 +568,19 @@ def _extra_repr(self) -> str:
         in_dims = (in_dims * 32) // self.bits * self.group.size()
         return (
             f"input_dims={in_dims}, output_dims={out_dims}, bias={'bias' in self}, "
-            f"group_size={self.group_size}, bits={self.bits}"
+            f"group_size={self.group_size}, bits={self.bits}, mode={self.mode}"
         )
 
     def __call__(self, x: mx.array) -> mx.array:
         x = mx.quantized_matmul(
             x,
             self["weight"],
             scales=self["scales"],
-            biases=self["biases"],
+            biases=self.get("biases"),
             transpose=True,
             group_size=self.group_size,
             bits=self.bits,
+            mode=self.mode,
         )
         x = mx.distributed.all_sum(x, group=self.group)
         if "bias" in self:
@@ -588,6 +605,7 @@ def from_quantized_linear(
             hasattr(quantized_linear_layer, "bias"),
             group_size=quantized_linear_layer.group_size,
             bits=quantized_linear_layer.bits,
+            mode=getattr(quantized_linear_layer, "mode", "affine"),
             group=group,
         )
         sl.update(

python/tests/mlx_distributed_tests.py

Lines changed: 22 additions & 1 deletion
@@ -146,7 +146,7 @@ def test_shard_linear(self):
         self.assertTrue(mx.allclose(y, y2, atol=self.atol, rtol=self.rtol))
         self.assertTrue(mx.allclose(y[part], y1, atol=self.atol, rtol=self.rtol))
 
-        # And their quant versions (QuintizedMatmul is not supported on CUDA)
+        # And their quant versions (QuantizedMatmul is not supported on CUDA)
         if not mx.cuda.is_available():
             qlin = lin.to_quantized()
             slin1 = shard_linear(qlin, "all-to-sharded")
@@ -157,6 +157,27 @@ def test_shard_linear(self):
             self.assertTrue(mx.allclose(y, y2, atol=self.atol, rtol=self.rtol))
             self.assertTrue(mx.allclose(y[part], y1))
 
+            # Test non-affine quantization modes (mxfp8)
+            qlin_mxfp8 = lin.to_quantized(group_size=32, bits=8, mode="mxfp8")
+            self.assertEqual(qlin_mxfp8.mode, "mxfp8")
+
+            slin1_mxfp8 = shard_linear(qlin_mxfp8, "all-to-sharded")
+            slin2_mxfp8 = shard_linear(qlin_mxfp8, "sharded-to-all")
+
+            # Verify mode is propagated
+            self.assertEqual(slin1_mxfp8.mode, "mxfp8")
+            self.assertEqual(slin2_mxfp8.mode, "mxfp8")
+
+            # Verify biases parameter is not set for mxfp8
+            self.assertIsNone(slin1_mxfp8.get("biases"))
+            self.assertIsNone(slin2_mxfp8.get("biases"))
+
+            y = qlin_mxfp8(x)
+            y1 = slin1_mxfp8(x)
+            y2 = slin2_mxfp8(x[part])
+            self.assertTrue(mx.allclose(y, y2, atol=self.atol, rtol=self.rtol))
+            self.assertTrue(mx.allclose(y[part], y1))
+
         # Check the backward works as expected
         def dummy_loss(model, x, y):
             return (model(x) * y).sum()
