Fix PR errors: linear layers always return (output, bias); drop implicit o_proj skip

cjx0709 · cjx0709 · commit 05dea535b60f · 2026-03-09T18:11:36.000+08:00
diff --git a/python/sgl_jax/srt/kernels/fused_moe/v1/kernel.py b/python/sgl_jax/srt/kernels/fused_moe/v1/kernel.py
@@ -3086,4 +3086,4 @@ def kernel(
         w1_shared_scale,
         w3_shared_scale,
         w2_shared_scale,
-    )
+    )
diff --git a/python/sgl_jax/srt/layers/linear.py b/python/sgl_jax/srt/layers/linear.py
@@ -59,7 +59,7 @@ def __init__(
         else:
             self.bias = None
 
-    def __call__(self, x: jax.Array) -> jax.Array | tuple[jax.Array, jax.Array]:
+    def __call__(self, x: jax.Array) -> tuple[jax.Array, jax.Array | None]:
         """Forward pass."""
         x_2d = x.reshape(-1, x.shape[-1]) if x.ndim > 2 else x
 
@@ -88,7 +88,7 @@ def _sharded_dot(lhs: jax.Array, rhs: jax.Array) -> jax.Array:
         if self.skip_bias_add:
             return out, (self.bias.value if self.bias is not None else None)
         if self.bias is not None:
-            return out + self.bias.value
+            out = out + self.bias.value
         return out, None
 
 
@@ -176,7 +176,7 @@ def from_linear(
                 bias = linear.bias.value if linear.bias is not None else None
         else:
             weight = linear.weight.value
-            weight_t = weight.T 
+            weight_t = weight.T
 
             if effective_weight_block_size is not None and len(effective_weight_block_size) == 2:
                 weight_q, weight_scale = quantize_tensor(
@@ -197,7 +197,7 @@ def from_linear(
             scope_name=f"quantized_{linear.name}",
         )
 
-    def __call__(self, x: jax.Array) -> jax.Array | tuple[jax.Array, jax.Array]:
+    def __call__(self, x: jax.Array) -> tuple[jax.Array, jax.Array | None]:
         """Forward pass with quantization."""
         quantize_activation = self.activation_dtype is not None
         x_2d = x.reshape(-1, x.shape[-1]) if x.ndim > 2 else x
@@ -208,7 +208,7 @@ def __call__(self, x: jax.Array) -> jax.Array | tuple[jax.Array, jax.Array]:
 
         input_axis, output_axis = self.kernel_axes[0], self.kernel_axes[1]
         w_scale_spec = P(output_axis) if scale_val.ndim == 1 else P(output_axis, input_axis)
-        
+
         in_specs = (P(None, input_axis), P(output_axis, input_axis), w_scale_spec)
         out_specs = P(None, output_axis)
 
@@ -239,5 +239,5 @@ def __call__(self, x: jax.Array) -> jax.Array | tuple[jax.Array, jax.Array]:
         if self.skip_bias_add:
             return output, (self.bias.value if self.bias is not None else None)
         if self.bias is not None:
-            return output + self.bias.value
-        return output
+            output = output + self.bias.value
+        return output, None
diff --git a/python/sgl_jax/srt/utils/quantization/quantization_utils.py b/python/sgl_jax/srt/utils/quantization/quantization_utils.py
@@ -150,9 +150,6 @@ def _replace_linear_recursive(obj, path: str = "", visited: set | None = None):
                     if any(dot_path.endswith(ignored) or ignored in dot_path for ignored in ignored_layers):
                         logger.info("Skipping %s - in ignored_layers", child_path)
                         continue
-                    if "self_attn.o_proj" in dot_path and ignored_layers:
-                        logger.info("Skipping %s - explicit o_proj ignore", child_path)
-                        continue
 
                     rule = _find_matching_rule(child_path)
                     if rule is not None:
@@ -247,63 +244,6 @@ def _quantize_moe_recursive(obj, path: str = "", visited=None):
     return model
 
 
-def adapt_fused_moe_static_block_quant_for_kernel(
-    model: nnx.Module,
-    *,
-    target_subc_quant_wsz: int = 256,
-) -> nnx.Module:
-    """Adapt static fused-MoE block quant weights/scales before fused kernel execution.
-
-    This is a front-end compatibility step for static checkpoints whose fused MoE
-    subchannel block size is smaller than the fused kernel's supported size.
-    """
-    # Import here to avoid circular imports
-    from sgl_jax.srt.layers.moe import FusedEPMoE
-
-    adapted_count = 0
-
-    def _adapt_recursive(obj, path: str = "", visited=None):
-        nonlocal adapted_count
-        if visited is None:
-            visited = set()
-
-        obj_id = id(obj)
-        if obj_id in visited:
-            return
-        visited.add(obj_id)
-
-        if isinstance(obj, FusedEPMoE):
-            if obj.prepare_static_block_quant_for_fused_kernel(
-                target_subc_quant_wsz=target_subc_quant_wsz
-            ):
-                adapted_count += 1
-                logger.info(
-                    "Adapted static fused MoE at %s to subc=%s for fused kernel",
-                    path or getattr(obj, "name", type(obj).__name__),
-                    target_subc_quant_wsz,
-                )
-            return
-
-        if hasattr(obj, "__dict__"):
-            for attr_name, attr_value in obj.__dict__.items():
-                child_path = f"{path}/{attr_name}" if path else attr_name
-                if isinstance(attr_value, nnx.Module):
-                    _adapt_recursive(attr_value, child_path, visited)
-                elif isinstance(attr_value, list):
-                    for idx, item in enumerate(attr_value):
-                        if isinstance(item, nnx.Module):
-                            item_path = f"{child_path}[{idx}]"
-                            _adapt_recursive(item, item_path, visited)
-
-    _adapt_recursive(model)
-    if adapted_count:
-        logger.info(
-            "Completed static fused MoE block-quant kernel adaptation on %d layer(s)",
-            adapted_count,
-        )
-    return model
-
-
 def quantize_tensor_simple(
     x: jax.Array, dtype: jnp.dtype, dim: int = -1, out_dtype: jnp.dtype = jnp.float32
 ):
diff --git a/python/sgl_jax/test/kernels/moe_block_quant_test.py b/python/sgl_jax/test/kernels/moe_block_quant_test.py
@@ -22,40 +22,39 @@ def test_epmoe_block_quant_logic(weight_block_size, expected_k_blocks_wi, expect
     Focuses on weight/scale shapes and placeholder generation.
     """
     print("\n>>> Testing EP MoE Block Quantization Logic (CPU) <<<")
-    
+
     # 1. Setup a minimal mesh
     devices = jax.devices()
     # Use names that are safe for CPU/Standard JAX
     mesh = Mesh(np.array(devices[:1]).reshape(1, 1), axis_names=("data", "tensor"))
-    
+
     # 2. Configuration
     hidden_size = 512
     intermediate_dim = 1024
     num_experts = 4
     num_experts_per_tok = 1
-    block_size = 128
-    
+
     # Mock QuantizationConfig
     class MockQuantConfig:
-        def get_moe_weight_dtype(self): return jnp.int8
-        def get_moe_activation_dtype(self): return None
+        def get_moe_weight_dtype(self):
+            return jnp.int8
+
+        def get_moe_activation_dtype(self):
+            return None
+
         @property
-        def weight_block_size(self): return weight_block_size
-        
+        def weight_block_size(self):
+            return weight_block_size
+
     quant_config = MockQuantConfig()
-    
+
     # 3. Initialize EPMoE with Mocked Mesh to bypass sharding checks on CPU
-    from unittest.mock import MagicMock
-    mock_mesh = MagicMock(spec=Mesh)
-    mock_mesh.shape = {"expert": 1, "tensor": 1}
-    mock_mesh.axis_names = ("expert", "tensor")
-    mock_mesh.devices = np.array(jax.devices()[:1]).reshape(1, 1)
-    
     # We monkeypatch the sharding in EPMoE to be CPU-friendly for this test
     original_p = P
     import sgl_jax.srt.layers.moe as moe_module
-    moe_module.P = lambda *args: None # Disable sharding for CPU UT
-    
+
+    moe_module.P = lambda *args: None  # Disable sharding for CPU UT
+
     try:
         moe = EPMoE(
             hidden_size=hidden_size,
@@ -67,36 +66,40 @@ def weight_block_size(self): return weight_block_size
             quantization_config=quant_config,
         )
     finally:
-        moe_module.P = original_p # Restore
+        moe_module.P = original_p  # Restore
 
-    
     # 4. Run Quantization Prep
     moe.quantize_weights(is_static=True)
-    
+
     # 5. Assert Scale Shapes
     # EPMoE logic: k_blocks = hidden_size // block_size
     k_blocks_wi = expected_k_blocks_wi
     k_blocks_wo = expected_k_blocks_wo
-    
+
     print(f"  Expert Count: {num_experts}")
     print(f"  K Blocks (WI): {k_blocks_wi}")
     print(f"  K Blocks (WO): {k_blocks_wo}")
-    
+
     expected_wi_shape = (num_experts, k_blocks_wi, 1, intermediate_dim)
     expected_wo_shape = (num_experts, k_blocks_wo, 1, hidden_size)
-    
+
     print(f"  WI_0 Scale Shape: {moe.wi_0_scale.value.shape}")
     print(f"  WO Scale Shape:   {moe.wo_scale.value.shape}")
-    
-    assert moe.wi_0_scale.value.shape == expected_wi_shape, f"WI shape mismatch: {moe.wi_0_scale.value.shape} vs {expected_wi_shape}"
-    assert moe.wo_scale.value.shape == expected_wo_shape, f"WO shape mismatch: {moe.wo_scale.value.shape} vs {expected_wo_shape}"
-    
+
+    assert (
+        moe.wi_0_scale.value.shape == expected_wi_shape
+    ), f"WI shape mismatch: {moe.wi_0_scale.value.shape} vs {expected_wi_shape}"
+    assert (
+        moe.wo_scale.value.shape == expected_wo_shape
+    ), f"WO shape mismatch: {moe.wo_scale.value.shape} vs {expected_wo_shape}"
+
     print("  Shape Verification: PASSED")
-    
+
     # 6. Verify Content (Should be zeros as initialized in is_static=True)
     assert jnp.all(moe.wi_0_scale.value == 0)
     print("  Content Verification: PASSED")
 
+
 if __name__ == "__main__":
     try:
         test_epmoe_block_quant_logic(None, 1, 1)
diff --git a/python/sgl_jax/test/kernels/quantized_linear_test.py b/python/sgl_jax/test/kernels/quantized_linear_test.py
@@ -5,6 +5,7 @@
 import jax.numpy as jnp
 import numpy as np
 import pytest
+from flax import nnx
 from jax.sharding import Mesh
 
 import sgl_jax.srt.kernels.quantized_matmul.kernel as quant_kernel
@@ -101,7 +102,8 @@ def test_quantized_linear_offline_scale_formats(scale_format):
     )
 
     ref_out = jnp.dot(x, w_fp.T)
-    out = quant_linear(x)
+    out, bias = quant_linear(x)
+    assert bias is None
 
     _assert_close(f"Offline QuantizedLinear ({scale_format})", out, ref_out)
 
@@ -198,7 +200,8 @@ def __init__(self, mesh):
             )
 
     mesh = _create_single_device_mesh()
-    model = DummyModel(mesh)
+    with jax.set_mesh(mesh):
+        model = DummyModel(mesh)
 
     class FakeModelConfig:
         pass
@@ -230,9 +233,108 @@ class FakeModelConfig:
     assert model.proj.weight_scale.value.ndim == 1
 
 
+def test_linear_return_contract_with_bias():  # call contract: always (output, bias-or-None)
+    mesh = _create_single_device_mesh()
+    x = jnp.ones((2, 64), dtype=jnp.bfloat16)
+
+    with jax.set_mesh(mesh):
+        linear = LinearBase(
+            input_size=64,
+            output_size=32,
+            use_bias=True,  # layer owns a bias; skip_bias_add is not passed
+            mesh=mesh,
+            kernel_axes=(None, None),
+            params_dtype=jnp.bfloat16,
+            scope_name="biased_proj",
+        )
+        out, bias = linear(x)  # forward must yield a 2-tuple
+
+    assert out.shape == (2, 32)
+    assert bias is None  # bias comes back separately only on the skip_bias_add path
+
+    with jax.set_mesh(mesh):
+        quant_linear = QuantizedLinear.from_linear(
+            linear,
+            weight_dtype=jnp.int8,
+            activation_dtype=None,  # no activation quantization requested
+            is_static_input=False,
+        )
+        q_out, q_bias = quant_linear(x)  # quantized forward must yield a 2-tuple as well
+
+    assert q_out.shape == (2, 32)
+    assert q_bias is None  # same contract as the unquantized layer
+
+
+def test_ignored_layers_only_skips_requested_paths():  # o_proj is skipped only when listed
+    class SelfAttn(nnx.Module):
+        def __init__(self, mesh):
+            self.q_proj = LinearBase(
+                input_size=64,
+                output_size=32,
+                use_bias=False,
+                mesh=mesh,
+                kernel_axes=(None, None),
+                params_dtype=jnp.bfloat16,
+                scope_name="q_proj",
+            )
+            self.o_proj = LinearBase(
+                input_size=64,
+                output_size=32,
+                use_bias=False,
+                mesh=mesh,
+                kernel_axes=(None, None),
+                params_dtype=jnp.bfloat16,
+                scope_name="o_proj",
+            )
+
+    class DummyBlock(nnx.Module):
+        def __init__(self, mesh):
+            self.self_attn = SelfAttn(mesh)  # o_proj lives under the "self_attn" attribute
+
+    class FakeModelConfig:
+        pass
+
+    def _make_config(ignored_layers):  # stub model config carrying the given ignored_layers
+        model_config = FakeModelConfig()
+        model_config.quantization_config = type(  # ad-hoc class, instantiated by the trailing ()
+            "FakeQuantConfig",
+            (),
+            {
+                "get_linear_rules": staticmethod(
+                    lambda: [
+                        {
+                            "module_path": ".*",  # presumably matches every layer path; confirm
+                            "weight_dtype": "int8",
+                            "activation_dtype": None,
+                            "weight_block_size": None,
+                        }
+                    ]
+                ),
+                "ignored_layers": ignored_layers,
+                "weight_block_size": [128, 128],
+            },
+        )()
+        return model_config
+
+    mesh = _create_single_device_mesh()
+    with jax.set_mesh(mesh):
+        model = DummyBlock(mesh)  # scenario 1: ignore list names an unrelated layer
+    apply_linear_quantization(_make_config(["some_other_layer"]), model, is_static_input=False)
+    assert isinstance(model.self_attn.q_proj, QuantizedLinear)
+    assert isinstance(model.self_attn.o_proj, QuantizedLinear)  # not skipped by unrelated entry
+
+    with jax.set_mesh(mesh):
+        model = DummyBlock(mesh)  # scenario 2: fresh model, o_proj explicitly ignored
+    apply_linear_quantization(_make_config(["self_attn.o_proj"]), model, is_static_input=False)
+    assert isinstance(model.self_attn.q_proj, QuantizedLinear)
+    assert isinstance(model.self_attn.o_proj, LinearBase)  # explicitly ignored: stays LinearBase
+
+
 if __name__ == "__main__":
     for fmt in ("per_channel", "block_channel", "block_quant"):
         test_quantized_linear_offline_scale_formats(fmt)
     test_xla_quantized_matmul_block_quant_all()
     _assert_blockwise_tuning_fallback_uses_compatible_seed()
     test_linear_rule_weight_block_size_override()
+    test_linear_return_contract_with_bias()
+    test_ignored_layers_only_skips_requested_paths()