Add mixed MoE block quant config and TPU coverage

cjx0709 · cjx0709 · commit 98d6c8fe4d9b · 2026-03-09T17:10:45.000+08:00
diff --git a/python/sgl_jax/srt/kernels/quantized_matmul/kernel.py b/python/sgl_jax/srt/kernels/quantized_matmul/kernel.py
@@ -219,6 +219,21 @@ def _get_effective_block_sizes(
     return block_size_out, block_size_in
 
 
+def _should_use_3rd_party_blockwise_kernel(
+    *,
+    out_dim: int,
+    block_size_out: int,
+) -> bool:
+    """Return False for known-bad narrow-N TPU blockwise cases, True otherwise.
+
+    When a tensor-parallel column shard collapses to a single output block
+    (out_dim <= block_size_out, e.g. local N=128 with block_size_out=128), the
+    third-party TPU blockwise kernel can produce NaNs on Qwen3-MoE k/v
+    projections; the local dequantized fallback stays stable on those inputs.
+    """
+    return out_dim > block_size_out
+
+
 def _expand_block_scales_to_weight_shape(
     w_scale: jax.Array,
     out_dim: int,
@@ -307,7 +322,14 @@ def xla_quantized_matmul_local(
         # path as fallback for non-TPU / unavailable environments.
         out = None
         blockwise_3rd_kernel = _get_blockwise_3rd_kernel()
-        if jax.default_backend() == "tpu" and blockwise_3rd_kernel is not None:
+        if (
+            jax.default_backend() == "tpu"
+            and blockwise_3rd_kernel is not None
+            and _should_use_3rd_party_blockwise_kernel(
+                out_dim=int(out_dim),
+                block_size_out=int(block_size_out),
+            )
+        ):
             try:
                 w_scale_3rd = _convert_block_scale_to_3rd_layout(
                     w_scale=w_scale,
diff --git a/python/sgl_jax/srt/utils/quantization/configs/int8_moe_block_128_linear_channel_dynamic.yaml b/python/sgl_jax/srt/utils/quantization/configs/int8_moe_block_128_linear_channel_dynamic.yaml
@@ -0,0 +1,18 @@
+# INT8 dynamic mixed quantization config
+# - Linear layers use per-channel quantization (rule-level weight_block_size: null overrides the global value).
+# - MoE weights use 128x128 block quantization.
+
+quantization:
+  is_static_checkpoint: false
+  weight_block_size: [128, 128]
+
+  linear:
+    rules:
+      - module_path: '.*'
+        weight_dtype: 'int8'
+        activation_dtype: null
+        weight_block_size: null
+
+  moe:
+    weight_dtype: 'int8'
+    activation_dtype: null
diff --git a/python/sgl_jax/srt/utils/quantization/quantization_utils.py b/python/sgl_jax/srt/utils/quantization/quantization_utils.py
@@ -35,6 +35,28 @@ def _get_block_reshape_sharding(
     return NamedSharding(input_sharding.mesh, P(*blocked_spec))
 
 
+def _get_safe_block_quant_input_sharding(
+    tensor: jax.Array,
+    quantized_axes: list[int],
+) -> NamedSharding | None:
+    """Return the input sharding with quantized axes unsharded, or None if unchanged."""
+    input_sharding = getattr(tensor, "sharding", None)
+    if not isinstance(input_sharding, NamedSharding):
+        return None  # no NamedSharding to adjust
+
+    adjusted_spec = list(input_sharding.spec)
+    changed = False
+    for axis_idx in quantized_axes:  # bounds check below: spec may have fewer entries
+        if axis_idx < len(adjusted_spec) and adjusted_spec[axis_idx] is not None:
+            adjusted_spec[axis_idx] = None  # axis will be split into (num_blocks, block)
+            changed = True
+
+    if not changed:
+        return None  # no quantized axis was sharded
+
+    return NamedSharding(input_sharding.mesh, P(*adjusted_spec))
+
+
 def apply_linear_quantization(
     model_config: ModelConfig, model: nnx.Module, is_static_input: bool = False
 ) -> nnx.Module:
@@ -76,6 +98,11 @@ def apply_linear_quantization(
         # Accept both sglang-jax style and Qwix-style field names.
         weight_dtype_str = rule.get("weight_dtype", rule.get("weight_qtype"))
         activation_dtype_str = rule.get("activation_dtype", rule.get("act_qtype"))
+        weight_block_size = (
+            rule["weight_block_size"]
+            if "weight_block_size" in rule
+            else getattr(quant_config, "weight_block_size", None)
+        )
 
         # Convert string dtypes to jnp dtypes
         weight_dtype = DTYPE_MAP.get(weight_dtype_str)
@@ -89,6 +116,7 @@ def apply_linear_quantization(
                 "pattern": pattern,
                 "weight_dtype": weight_dtype,
                 "activation_dtype": activation_dtype,
+                "weight_block_size": weight_block_size,
             }
         )
 
@@ -140,7 +168,7 @@ def _replace_linear_recursive(obj, path: str = "", visited: set | None = None):
                             weight_dtype=rule["weight_dtype"],
                             activation_dtype=rule["activation_dtype"],
                             is_static_input=is_static_input,
-                            weight_block_size=getattr(quant_config, "weight_block_size", None),
+                            weight_block_size=rule["weight_block_size"],
                         )
                         # Replace the attribute and free old weights
                         setattr(obj, attr_name, quantized_linear)
@@ -323,6 +351,7 @@ def quantize_tensor(
         axis = [axis]
 
     orig_shape = tensor.shape
+    original_input_sharding = getattr(tensor, "sharding", None)
     mask = None
 
     if block_size is not None:
@@ -356,14 +385,19 @@ def quantize_tensor(
 
         orig_shape = tensor.shape
         # Convert all axis into positive values.
-        axis = sorted([i % tensor.ndim for i in axis])
+        quantized_axes = sorted([i % tensor.ndim for i in axis])
+        safe_input_sharding = _get_safe_block_quant_input_sharding(tensor, quantized_axes)
+        if safe_input_sharding is not None:
+            tensor = jax.sharding.reshard(tensor, safe_input_sharding)
+            if mask is not None:
+                mask = jax.sharding.reshard(mask, safe_input_sharding)
+
         # Shift axis by 1 since its original position is now occupied by
         # num_blocks dim. Also, if n axes before an axis was also quantized,
         # shift its position by n.
-        axis = [1 + n + i for n, i in enumerate(axis)]
+        axis = [1 + n + i for n, i in enumerate(quantized_axes)]
 
-        input_sharding = getattr(tensor, "sharding", None)
-        blocked_out_sharding = _get_block_reshape_sharding(tensor, axis)
+        blocked_out_sharding = _get_block_reshape_sharding(tensor, quantized_axes)
 
         # Flatten list of lists that contains (num_blocks, block).
         blocked_shape = list(itertools.chain(*blocked_shape))
@@ -383,8 +417,8 @@ def quantize_tensor(
     # Guard all-zero blocks/tensors: scale==0 would produce 0/0 -> NaN.
     scale_safe = scale + (scale == 0).astype(scale.dtype)
     tensor_q = jnp.clip(tensor / scale_safe, dtype_min, dtype_max)
-    if block_size is not None and isinstance(input_sharding, NamedSharding):
-        tensor_q = jax.lax.reshape(tensor_q, orig_shape, out_sharding=input_sharding)
+    if block_size is not None and isinstance(original_input_sharding, NamedSharding):
+        tensor_q = jax.lax.reshape(tensor_q, orig_shape, out_sharding=original_input_sharding)
     else:
         tensor_q = tensor_q.reshape(orig_shape)
     tensor_q = tensor_q.astype(dtype)
diff --git a/python/sgl_jax/test/kernels/quantized_linear_test.py b/python/sgl_jax/test/kernels/quantized_linear_test.py
@@ -9,7 +9,8 @@
 
 import sgl_jax.srt.kernels.quantized_matmul.kernel as quant_kernel
 from sgl_jax.srt.kernels.quantized_matmul.kernel import xla_quantized_matmul_local
-from sgl_jax.srt.layers.linear import QuantizedLinear
+from sgl_jax.srt.layers.linear import LinearBase, QuantizedLinear
+from sgl_jax.srt.utils.quantization.quantization_utils import apply_linear_quantization
 from sgl_jax.srt.utils.quantization.quantization_utils import quantize_tensor
 
 
@@ -183,8 +184,55 @@ def test_blockwise_tuning_fallback_uses_compatible_seed(monkeypatch):
     _assert_blockwise_tuning_fallback_uses_compatible_seed()
 
 
+def test_linear_rule_weight_block_size_override():  # per-rule value must beat the global one
+    class DummyModel(nnx.Module):
+        def __init__(self, mesh):
+            self.proj = LinearBase(
+                input_size=256,
+                output_size=512,
+                use_bias=False,
+                mesh=mesh,
+                kernel_axes=(None, None),
+                params_dtype=jnp.bfloat16,
+                scope_name="proj",
+            )
+
+    mesh = _create_single_device_mesh()
+    model = DummyModel(mesh)
+
+    class FakeModelConfig:
+        pass
+
+    model_config = FakeModelConfig()
+    model_config.quantization_config = type(
+        "FakeQuantConfig",
+        (),
+        {
+            "get_linear_rules": staticmethod(
+                lambda: [
+                    {
+                        "module_path": ".*",
+                        "weight_dtype": "int8",
+                        "activation_dtype": None,
+                        "weight_block_size": None,  # explicit rule-level None
+                    }
+                ]
+            ),
+            "ignored_layers": None,
+            "weight_block_size": [128, 128],  # global default the rule overrides
+        },
+    )()
+
+    apply_linear_quantization(model_config, model, is_static_input=False)
+
+    assert isinstance(model.proj, QuantizedLinear)
+    assert model.proj.weight_block_size is None  # the rule's None won, not [128, 128]
+    assert model.proj.weight_scale.value.ndim == 1  # presumably per-channel scale; confirm
+
+
 if __name__ == "__main__":
     for fmt in ("per_channel", "block_channel", "block_quant"):
         test_quantized_linear_offline_scale_formats(fmt)
     test_xla_quantized_matmul_block_quant_all()
     _assert_blockwise_tuning_fallback_uses_compatible_seed()
+    test_linear_rule_weight_block_size_override()
diff --git a/test/srt/quantization/test_w8_moe_block_linear_channel_quantization.py b/test/srt/quantization/test_w8_moe_block_linear_channel_quantization.py
@@ -0,0 +1,91 @@
+import os
+import re
+import sys
+import time
+import unittest
+
+import requests
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from sgl_jax.srt.utils import kill_process_tree
+from sgl_jax.test.test_utils import DEFAULT_URL_FOR_TEST, CustomTestCase, popen_launch_server
+
+
+class TestW8Int8MoeBlockLinearChannelQuant(CustomTestCase):
+    model = "Qwen/Qwen3-30B-A3B"  # model launched with the quantization config below
+    quantization_config_path = "int8_moe_block_128_linear_channel_dynamic.yaml"
+    other_args = [
+        "--tp-size=4",
+        "--ep-size=4",
+        "--download-dir=/dev/shm",
+        "--max-running-requests=64",
+        "--page-size=64",
+        "--disable-precompile",
+    ]
+
+    @classmethod
+    def setUpClass(cls):
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        other_args = [  # quantization config flag first, then the class-level extras
+            "--quantization-config-path",
+            cls.quantization_config_path,
+            *cls.other_args,
+        ]
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=1800,  # presumably seconds allowed for server startup; confirm
+            other_args=other_args,
+            check_cache_miss=False,
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+        try:
+            cls.process.wait(timeout=30)
+        except Exception:
+            pass  # best-effort: a failed or timed-out wait must not fail teardown
+        time.sleep(5)  # presumably lets the killed server release resources; confirm
+
+    def _generate(self, prompt, max_new_tokens=16):
+        response = requests.post(
+            self.base_url + "/generate",
+            json={
+                "text": prompt,
+                "sampling_params": {
+                    "temperature": 0,  # presumably greedy decoding for stable output; confirm
+                    "max_new_tokens": max_new_tokens,
+                },
+            },
+            timeout=600,  # HTTP request timeout, in seconds
+        )
+        response.raise_for_status()  # raise on an HTTP error status
+        return response.json()
+
+    def test_basic_generation(self):
+        prompts = [
+            (
+                "Answer with one word only. What is the capital of France?",
+                re.compile(r"\bparis\b"),  # lowercase: matched against text.lower()
+            ),
+            (
+                "Answer with one number only. What is 12 + 7?",
+                re.compile(r"\b19\b"),
+            ),
+        ]
+
+        for prompt, expected_pattern in prompts:
+            data = self._generate(prompt, max_new_tokens=8)
+            text = data.get("text", "")
+            self.assertTrue(text.strip(), f"Empty generation response: {data}")
+            self.assertRegex(
+                text.lower(),
+                expected_pattern,
+                msg=f"Unexpected generation text for prompt {prompt!r}: {text!r}",
+            )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
@@ -486,6 +486,10 @@ def run_one_file(filename):
         TestFile("test/srt/test_model_loader.py", 5),
         TestFile("test/srt/quantization/test_w8_quantization.py", 10),
         TestFile("test/srt/quantization/test_w8_block_dynamic_quantization.py", 8, runner="pytest"),
+        TestFile(
+            "test/srt/quantization/test_w8_moe_block_linear_channel_quantization.py",
+            15,
+        ),
         TestFile("test/srt/test_engine_determine_generation.py", 5),
         TestFile("test/srt/test_engine_flush_cache.py", 5),
         TestFile("test/srt/test_engine_pause_continue.py", 6),