EPv2: address review (graph gating, fp8 fail-fast, helper scope, robustness)

menyu · menyu · commit 905803106e6e · 2026-06-27T17:49:41.000Z
Two rounds of code-review fixes:

- server_args: disable the prefill CUDA graph under EPv2 -- only the direct-mode
  decode masked-GEMM path is capture-safe; the direct-mode prefill (extend) path
  uses a non-masked layout with a host readback and is not capture-validated.
- fp8: DeepGEMM UE8M0 weight requant now fails fast for the EPv2 FusedMoE layer
  with a clear message, instead of asserting isinstance(layer, DeepEPMoE).
- deepseek_v2: revert the 3 newly introduced a2a helpers (uses_a2a_moe_forward,
  uses_a2a_expert_parallel_metadata, requires_shared_expert_tp1) back to the
  original inline backend checks plus `or is_epv2()`, so EPv2 integration is
  purely additive. Fix two over-broad sites where a wide helper had replaced
  narrower checks: the AMD gfx95 allocator-size path (restore
  is_deepep_class_backend + epv2) and the model-level enable_a2a_moe (restore
  is_deepep/is_mooncake + epv2) -- unrelated backends
  (nixl / ascend / flashinfer / megamoe) keep their original behavior.
- epv2: dispatch_b/combine_b raise a clear RuntimeError when called without a
  preceding dispatch_a/combine_a, aligned with the combine_a stage check.
- kernels: document that the masked-slab overflow fail-fast check is skipped
  during CUDA graph capture, and that safety then relies on the static
  max_m = cap * ep_group_size upper bound.
- utils: drop the now-unused a2a helpers; clarify the capability-resolver comment
  (it reads runner flags to build the contract, like the DeepEP dispatcher does;
  the dispatcher itself only consumes the resolved contract).

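No correctness or perf regression observed for EPv2 or DeepEP on the tested
paths: re-verified chat-completions 3-question correctness (direct + hybrid),
unit tests (7 passed), and 4 throughput points (decode/prefill, EPv2 vs DeepEP)
-- all within run-to-run noise of the pre-fix numbers. The behavior changes
listed above (e.g. the EPv2 prefill CUDA graph being disabled and the new
fail-fast errors) are intentional.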
diff --git a/python/sglang/srt/layers/moe/ep_moe/kernels.py b/python/sglang/srt/layers/moe/ep_moe/kernels.py
@@ -1731,7 +1731,7 @@ def fp8_per_token_to_per_tensor_quant_triton(
 
 
 # ---------------------------------------------------------------------------
-# EPv2 decode masked-GEMM bridge (Claude): repack the expanded expert-packed
+# EPv2 decode masked-GEMM bridge: repack the expanded expert-packed
 # dispatch buffer into a regular [E_local, max_m, hidden] slab so DeepGEMM's
 # *masked* grouped GEMM can bound compute by per-expert real counts (masked_m)
 # instead of the dispatch capacity. All-GPU, static shapes -> cuda-graph safe.
@@ -1875,6 +1875,11 @@ def expand_to_masked_slab(
     # Outside cuda graph capture, fail fast on slab overflow rather than return a
     # silently truncated result. During capture we skip the host read to keep the
     # path graph-safe; the eager warmup forward validates representative shapes.
+    # Safety under graph replay therefore relies on the static upper bound
+    # max_m = cap * ep_group_size holding: each rank sends at most `cap` tokens
+    # (enforced by the dispatch-entry assert) and a token contributes at most once
+    # per local expert, so no expert can exceed max_m. If those invariants change,
+    # graph replay would NOT fail-fast on overflow — re-validate before relying on it.
     if not torch.cuda.is_current_stream_capturing() and int(overflow.item()) != 0:
         raise RuntimeError(
             f"EPv2 masked slab overflow: an expert received more than max_m="
diff --git a/python/sglang/srt/layers/moe/token_dispatcher/epv2.py b/python/sglang/srt/layers/moe/token_dispatcher/epv2.py
@@ -527,6 +527,10 @@ def dispatch_a(self, hidden_states: torch.Tensor, topk_output: TopKOutput) -> No
         self._dispatch_state = self._impl.dispatch_a(hidden_states, topk_output)
 
     def dispatch_b(self) -> DispatchOutput:
+        if self._dispatch_state is None:
+            raise RuntimeError(
+                "DeepEP v2 dispatch_b() called without a preceding dispatch_a()"
+            )
         out = self._impl.dispatch_b(*self._dispatch_state)
         self._dispatch_state = None
         self._stage = _Stage.AFTER_DISPATCH
@@ -553,6 +557,10 @@ def combine_a(self, combine_input: CombineInput) -> None:
         self._combine_state = self._impl.combine_a(combine_input)
 
     def combine_b(self) -> torch.Tensor:
+        if self._combine_state is None:
+            raise RuntimeError(
+                "DeepEP v2 combine_b() called without a preceding combine_a()"
+            )
         try:
             return self._impl.combine_b(*self._combine_state)
         finally:
diff --git a/python/sglang/srt/layers/moe/utils.py b/python/sglang/srt/layers/moe/utils.py
@@ -164,8 +164,10 @@ class EpV2RunnerCapability(NamedTuple):
     """
     Describes the EPv2 dispatcher contract required by the active MoE runner.
 
-    The dispatcher should depend on this explicit contract instead of peeking at
-    runner implementation details such as DeepGEMM JIT flags.
+    This capability is resolved once (in get_epv2_runner_capability, which reads
+    runner-side flags such as DeepGEMM JIT TMA/UE8M0 settings) and then consumed
+    by the dispatcher. The dispatcher depends only on this resolved contract and
+    does not peek at runner implementation details itself.
     """
 
     output_dtype: EpV2OutputDtype
@@ -468,48 +470,6 @@ def is_deepep_class_backend() -> bool:
     return b.is_deepep() or b.is_mooncake() or b.is_mori()
 
 
-def uses_a2a_moe_forward() -> bool:
-    """Return whether the active backend uses the A2A MoE forward path."""
-    b = get_moe_a2a_backend()
-    return (
-        b.is_deepep()
-        or b.is_mooncake()
-        or b.is_nixl()
-        or b.is_mori()
-        or b.is_ascend_fuseep()
-        or b.is_flashinfer()
-        or b.is_epv2()
-    )
-
-
-def uses_a2a_expert_parallel_metadata() -> bool:
-    """Return whether the backend needs EP metadata on DeepSeek MoE layers."""
-    b = get_moe_a2a_backend()
-    return (
-        b.is_deepep()
-        or b.is_mooncake()
-        or b.is_nixl()
-        or b.is_mori()
-        or b.is_ascend_fuseep()
-        or b.is_epv2()
-    )
-
-
-def requires_shared_expert_tp1() -> bool:
-    """Return whether shared experts should be materialized with TP=1."""
-    b = get_moe_a2a_backend()
-    return (
-        b.is_deepep()
-        or b.is_mooncake()
-        or b.is_nixl()
-        or b.is_mori()
-        or b.is_ascend_fuseep()
-        or b.is_flashinfer()
-        or b.is_megamoe()
-        or b.is_epv2()
-    )
-
-
 def is_flashinfer_cutedsl_v1_path() -> bool:
     """CuteDSL v1 + DeepEP low-latency path (no MoeRunner, no autotune)."""
     return (
diff --git a/python/sglang/srt/layers/quantization/fp8.py b/python/sglang/srt/layers/quantization/fp8.py
@@ -1389,9 +1389,20 @@ def process_weights_after_loading_block_quant(self, layer: Module) -> None:
                 and will_use_deepgemm
                 and not layer.w13_weight_scale_inv.format_ue8m0
             ):
-                assert isinstance(
-                    layer, DeepEPMoE
-                ), "DeepGemm MoE is only supported with DeepEPMoE"
+                if not isinstance(layer, DeepEPMoE):
+                    # UE8M0 in-place weight requant is only wired for the
+                    # DeepEPMoE layer (legacy deepep backend). The EPv2 backend
+                    # uses FusedMoE, so fail fast with a clear message instead of
+                    # asserting; use a pre-requantized FP8 checkpoint or
+                    # --moe-a2a-backend deepep for checkpoints that need UE8M0
+                    # requant at load time.
+                    raise NotImplementedError(
+                        "DeepGEMM UE8M0 weight requant requires the DeepEPMoE "
+                        f"layer, got {type(layer).__name__}. The EPv2 backend "
+                        "does not support FP8 checkpoints that need load-time "
+                        "UE8M0 requant yet; use a pre-requantized checkpoint or "
+                        "--moe-a2a-backend deepep."
+                    )
                 weight_block_size = self.quant_config.weight_block_size
                 requant_weight_ue8m0_inplace(
                     layer.w13_weight, layer.w13_weight_scale_inv, weight_block_size
diff --git a/python/sglang/srt/models/deepseek_v2.py b/python/sglang/srt/models/deepseek_v2.py
@@ -101,9 +101,6 @@
     is_deepep_class_backend,
     is_sbo_enabled,
     is_tbo_enabled,
-    requires_shared_expert_tp1,
-    uses_a2a_expert_parallel_metadata,
-    uses_a2a_moe_forward,
 )
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.quantization.fp8 import Fp8Config
@@ -712,7 +709,14 @@ def __init__(
             # explicitly requested for DSV4 checkpoints whose shared scales are
             # not divisible by the global TP size.
             _shared_expert_use_tp1 = (
-                requires_shared_expert_tp1()
+                get_moe_a2a_backend().is_deepep()
+                or get_moe_a2a_backend().is_mooncake()
+                or get_moe_a2a_backend().is_nixl()
+                or get_moe_a2a_backend().is_mori()
+                or get_moe_a2a_backend().is_ascend_fuseep()
+                or get_moe_a2a_backend().is_flashinfer()
+                or get_moe_a2a_backend().is_megamoe()
+                or get_moe_a2a_backend().is_epv2()
                 or should_use_flashinfer_cutlass_moe_fp4_allgather()
                 or envs.SGLANG_SHARED_EXPERT_TP1.get()
             )
@@ -787,7 +791,14 @@ def __init__(
 
         self.top_k = config.num_experts_per_tok
 
-        if uses_a2a_expert_parallel_metadata():
+        if (
+            get_moe_a2a_backend().is_deepep()
+            or get_moe_a2a_backend().is_mooncake()
+            or get_moe_a2a_backend().is_nixl()
+            or get_moe_a2a_backend().is_mori()
+            or get_moe_a2a_backend().is_ascend_fuseep()
+            or get_moe_a2a_backend().is_epv2()
+        ):
             # TODO: we will support tp < ep in the future
             self.ep_size = get_parallel().moe_ep_size
             self.num_experts = (
@@ -803,7 +814,15 @@ def __init__(
                 else None
             )
 
-        self._enable_a2a_moe = uses_a2a_moe_forward()
+        self._enable_a2a_moe = (
+            get_moe_a2a_backend().is_deepep()
+            or get_moe_a2a_backend().is_mooncake()
+            or get_moe_a2a_backend().is_nixl()
+            or get_moe_a2a_backend().is_mori()
+            or get_moe_a2a_backend().is_ascend_fuseep()
+            or get_moe_a2a_backend().is_flashinfer()
+            or get_moe_a2a_backend().is_epv2()
+        )
         self._fuse_shared_experts_inside_sbo = SboFlags.fuse_shared_experts_inside_sbo()
 
     def get_moe_weights(self):
@@ -2424,9 +2443,12 @@ def __init__(
             for i in range(len(self.layers)):
                 if isinstance(self.layers[i].mlp, DeepseekV2MoE):
                     # tp_size = get_parallel().tp_size
-                    # requires_shared_expert_tp1() is epv2-aware (is_deepep_class_backend
-                    # is not); keep it so EPv2 also materializes shared experts at TP=1.
-                    is_a2a_moe = requires_shared_expert_tp1()
+                    # Keep the original deepep-class scope here and only add EPv2,
+                    # so unrelated backends' allocator sizing is unchanged.
+                    is_a2a_moe = (
+                        is_deepep_class_backend()
+                        or get_moe_a2a_backend().is_epv2()
+                    )
                     tp_size = 1 if is_a2a_moe else get_parallel().tp_size
                     intermediate_size = (
                         config.moe_intermediate_size * config.n_shared_experts
@@ -2446,7 +2468,11 @@ def __init__(
                 )
             )
         self.layers_to_capture = []
-        self.enable_a2a_moe = uses_a2a_moe_forward()
+        self.enable_a2a_moe = (
+            get_moe_a2a_backend().is_deepep()
+            or get_moe_a2a_backend().is_mooncake()
+            or get_moe_a2a_backend().is_epv2()
+        )
 
         # llama_4_scaling: for supporting Mistral-Large-3 model
         self.llama_4_scaling_config = getattr(config, "llama_4_scaling", None)
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
@@ -5547,6 +5547,13 @@ def _handle_a2a_moe(self):
             if not epv2_graph_ok:
                 self.cuda_graph_config.decode.backend = Backend.DISABLED
                 self.cuda_graph_config.prefill.backend = Backend.DISABLED
+            else:
+                # Only the direct-mode decode masked-GEMM path is capture-safe
+                # (static shapes, no host readback). The direct-mode prefill
+                # (extend) path goes through the non-masked contiguous layout with
+                # a host readback and is not capture-validated, so keep the decode
+                # graph but always disable the prefill graph under EPv2.
+                self.cuda_graph_config.prefill.backend = Backend.DISABLED
             logger.warning(
                 f"DeepEP v2 MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
             )