Commit 7392aec

committed
fix: route per-channel FP8 MoE to CompressedTensorsFp8MoEMethod
Per-channel (per_Token) FP8 quantization needs per-channel weight scale allocation [E, N, 1], which CompressedTensorsFp8MoEMethod provides; Fp8MoEMethod only allocates scalar-per-expert scales [E, 2]/[E].

- Add a dispatch case for quant_dtype == fp8 + quant_type == per_Token that uses CompressedTensorsFp8MoEMethod
- Fix _load_per_channel_weight_scale to unsqueeze 1D checkpoint scales to match the 2D [N, 1] buffer shape
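The shape mismatch the second bullet fixes can be reproduced in isolation. This is a minimal sketch using numpy arrays as stand-ins for the torch tensors in moe.py; the buffer/variable names here are hypothetical, not the real ones:

```python
import numpy as np

N = 4  # output channels of one expert's weight (illustrative size)
expert_scale_buf = np.zeros((N, 1))            # per-channel scale buffer, shape [N, 1]
checkpoint_scale = np.arange(N, dtype=float)   # checkpoint stores scales flat, shape [N]

# Assigning a [N] array into a [N, 1] buffer broadcasts [N] as [1, N] and
# fails for N > 1; the fix mirrors the diff: add the trailing dim first.
if checkpoint_scale.ndim == 1 and expert_scale_buf.ndim == 2:
    checkpoint_scale = checkpoint_scale[:, None]  # [N] -> [N, 1], like unsqueeze(-1)

expert_scale_buf[:] = checkpoint_scale
```

After the reshape, the copy is an exact element-for-element match instead of a broadcasting error.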
1 parent 097b7a8 commit 7392aec

File tree

1 file changed

+11
-0
lines changed

atom/model_ops/moe.py

Lines changed: 11 additions & 0 deletions
@@ -1984,6 +1984,13 @@ def __init__(
         ):
             # Use CompressedTensorsFp8MoEMethod for compressed-tensors format
             self.quant_method = CompressedTensorsFp8MoEMethod(quant_config, moe)
+        elif (
+            quant_config["quant_dtype"] == dtypes.fp8
+            and quant_config["quant_type"] == QuantType.per_Token
+        ):
+            # Per-channel FP8 (e.g., Quark per_Token override for MTP layers)
+            # needs CompressedTensors-style weight scale handling
+            self.quant_method = CompressedTensorsFp8MoEMethod(quant_config, moe)
         elif quant_config["quant_dtype"] == dtypes.fp8:
             self.quant_method = Fp8MoEMethod(quant_config, moe)
         elif quant_config["quant_dtype"] == dtypes.fp4x2:

@@ -2100,6 +2107,10 @@ def _load_per_channel_weight_scale(
         tp_rank: int,
     ):
         # for per channel weight quantization
+        # When scales are stored as [N, 1] (CompressedTensors per-channel)
+        # but loaded from checkpoint as [N], reshape to match.
+        if loaded_weight.dim() == 1 and expert_data.dim() == 2:
+            loaded_weight = loaded_weight.unsqueeze(-1)
         if shard_id == "w2":
             expert_data.copy_(loaded_weight)
         elif shard_id in ("w1", "w3"):
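The first hunk's ordering is the essential part: the new per_Token branch must precede the generic fp8 branch, or per-channel configs would fall through to Fp8MoEMethod. A minimal self-contained sketch of that dispatch order, with QuantType, dtypes, and the method names mocked as plain stand-ins for the real classes in moe.py:

```python
from enum import Enum, auto

class QuantType(Enum):
    per_Token = auto()
    per_Tensor = auto()

class dtypes:
    fp8 = "fp8"
    fp4x2 = "fp4x2"

def select_moe_method(quant_config: dict) -> str:
    # Order matters: the per_Token check must come before the generic fp8
    # branch, otherwise per-channel FP8 would be routed to Fp8MoEMethod,
    # whose scalar-per-expert buffers [E, 2]/[E] cannot hold [E, N, 1] scales.
    if (quant_config["quant_dtype"] == dtypes.fp8
            and quant_config["quant_type"] == QuantType.per_Token):
        return "CompressedTensorsFp8MoEMethod"
    elif quant_config["quant_dtype"] == dtypes.fp8:
        return "Fp8MoEMethod"
    elif quant_config["quant_dtype"] == dtypes.fp4x2:
        return "Fp4MoEMethod"  # hypothetical name for the fp4x2 branch
    raise ValueError("unsupported quant config")
```

With the branches in this order, an fp8 + per_Token config selects the compressed-tensors method while fp8 + per_Tensor still reaches Fp8MoEMethod.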
