Commit 4bffd3a

[GPT-OSS] support fp8 online quantization for gpt-oss bf16 (#18988)
Merging as all required CI checks passed.
1 parent 96bae23 commit 4bffd3a
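
The commit enables fp8 online quantization for the bf16 GPT-OSS checkpoints, i.e. weights are quantized to fp8 at load time rather than requiring an fp8-serialized checkpoint. As a rough illustration of the idea only (not the code in this diff, which is block/per-channel aware), a minimal per-tensor sketch, assuming a PyTorch build that provides torch.float8_e4m3fn:

import torch

# Minimal sketch: quantize a bf16 weight to fp8 with a single scale at load time.
def quantize_fp8_per_tensor(w_bf16: torch.Tensor):
    finfo = torch.finfo(torch.float8_e4m3fn)
    amax = w_bf16.abs().max().clamp(min=1e-12).float()
    scale = amax / finfo.max
    w_fp8 = (w_bf16.float() / scale).clamp(finfo.min, finfo.max).to(torch.float8_e4m3fn)
    return w_fp8, scale  # dequantize as w_fp8.float() * scale

w = torch.randn(128, 256, dtype=torch.bfloat16)
w_fp8, scale = quantize_fp8_per_tensor(w)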

2 files changed: 31 additions, 1 deletion

python/sglang/srt/layers/quantization/fp8.py

Lines changed: 26 additions & 0 deletions
@@ -677,6 +677,7 @@ def __init__(self, quant_config: Fp8Config):
         self.block_quant = (
             self.use_mxfp8 or self.quant_config.weight_block_size is not None
         )
+        self.with_bias = False
         if get_moe_runner_backend().is_cutlass():
             assert (
                 cutlass_fp8_supported()
@@ -706,8 +707,10 @@ def create_weights(
         hidden_size: int,
         intermediate_size_per_partition: int,
         params_dtype: torch.dtype,
+        with_bias: bool = False,
         **extra_weight_attrs,
     ):
+        self.with_bias = with_bias
         from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported

         if self.quant_config.is_checkpoint_fp8_serialized:
@@ -782,6 +785,27 @@ def create_weights(
         layer.register_parameter("w2_weight", w2_weight)
         set_weight_attrs(w2_weight, extra_weight_attrs)

+        # BIAS (optional, e.g. GPT-OSS)
+        if self.with_bias:
+            w13_up_dim = (
+                2 * intermediate_size_per_partition
+                if layer.moe_runner_config.is_gated
+                else intermediate_size_per_partition
+            )
+            w13_weight_bias = torch.nn.Parameter(
+                torch.empty(num_experts, w13_up_dim, dtype=torch.float32),
+                requires_grad=False,
+            )
+            layer.register_parameter("w13_weight_bias", w13_weight_bias)
+            set_weight_attrs(w13_weight_bias, extra_weight_attrs)
+
+            w2_weight_bias = torch.nn.Parameter(
+                torch.empty(num_experts, hidden_size, dtype=torch.float32),
+                requires_grad=False,
+            )
+            layer.register_parameter("w2_weight_bias", w2_weight_bias)
+            set_weight_attrs(w2_weight_bias, extra_weight_attrs)
+
         # WEIGHT_SCALES
         if self.block_quant:
             scale_dtype = torch.uint8 if self.use_mxfp8 else torch.float32
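
A shape sketch of the two bias parameters created by this hunk, with made-up placeholder sizes (not GPT-OSS config values): the w13 bias doubles when the MLP is gated because gate and up projections are packed into one tensor, and both biases stay in float32.

import torch

num_experts = 8
hidden_size = 1024
intermediate_size_per_partition = 2048
is_gated = True  # a gated MLP packs gate and up projections into w13

w13_up_dim = (
    2 * intermediate_size_per_partition if is_gated else intermediate_size_per_partition
)
w13_weight_bias = torch.zeros(num_experts, w13_up_dim, dtype=torch.float32)  # (8, 4096)
w2_weight_bias = torch.zeros(num_experts, hidden_size, dtype=torch.float32)  # (8, 1024)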
@@ -1507,6 +1531,8 @@ def apply(
         quant_info = TritonMoeQuantInfo(
             w13_weight=layer.w13_weight,
             w2_weight=layer.w2_weight,
+            b13=getattr(layer, "w13_weight_bias", None),
+            b2=getattr(layer, "w2_weight_bias", None),
             use_fp8_w8a8=True,
             w13_scale=(
                 layer.w13_weight_scale_inv
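
The getattr calls keep bias-free models unaffected: b13/b2 are simply None when the parameters were never registered. As reference math only (not the Triton kernel, whose internals are not shown in this diff), a sketch of how a per-expert bias folds into an fp8 w8a8 GEMM once it reaches the kernel:

import torch

def expert_gemm_w8a8(x_fp8, w_fp8, x_scale, w_scale, bias=None):
    # The fp8 product is rescaled by the activation and weight scales,
    # then the optional float32 per-expert bias (e.g. GPT-OSS) is added.
    y = (x_fp8.float() @ w_fp8.float().t()) * (x_scale * w_scale)
    if bias is not None:
        y = y + bias
    return y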

python/sglang/srt/server_args.py

Lines changed: 5 additions & 1 deletion
@@ -1390,7 +1390,11 @@ def _handle_model_specific_adjustments(self):
             logger.warning(
                 "Detected ROCm with SGLANG_USE_AITER for GPT-OSS bf16 model, using triton MOE kernel."
            )
-        elif self.ep_size == 1 and is_triton_kernels_available():
+        elif (
+            self.ep_size == 1
+            and is_triton_kernels_available()
+            and self.quantization is None
+        ):
            self.moe_runner_backend = "triton_kernel"
            logger.warning(
                "Detected GPT-OSS model, enabling triton_kernels MOE kernel."
