9 | 9 |
10 | 10 | import grouped_gemm |
11 | 11 | import primus_turbo.pytorch as pt |
12 | | -import primus_turbo.pytorch.ops.activation as turbo_moe_activation |
13 | 12 | import torch |
14 | 13 | import torch.nn.functional as F |
15 | 14 | import transformer_engine as te |
37 | 36 | ScalingStrategy, |
38 | 37 | check_fp8_support, |
39 | 38 | ) |
40 | | -from primus_turbo.pytorch.ops.moe.tokens_per_expert_to_mask import ( |
41 | | - tokens_per_expert_to_mask as turbo_tokens_per_expert_to_mask, |
42 | | -) |
43 | 39 | from torch import Tensor |
44 | 40 | from transformer_engine.pytorch.fp8 import ( |
45 | 41 | DelayedScaling, |
@@ -756,17 +752,17 @@ def __init__( |
756 | 752 | assert self.config.gated_linear_unit, "turbo_fused_act_with_probs only support with GLU." |
757 | 753 |
758 | 754 | if self.config.activation_func == F.silu: |
759 | | - turbo_fused_act_with_probs = turbo_moe_activation.swiglu_with_probs |
| 755 | + turbo_fused_act_with_probs = pt.ops.swiglu_with_probs |
760 | 756 | elif self.config.activation_func == F.gelu: |
761 | | - turbo_fused_act_with_probs = turbo_moe_activation.geglu_with_probs |
| 757 | + turbo_fused_act_with_probs = pt.ops.geglu_with_probs |
762 | 758 | else: |
763 | 759 | raise ValueError("Activation function must be silu or gelu when using GroupedMLP.") |
764 | 760 |
765 | 761 | def _activation_func_with_probs(x, probs, tokens_per_experts): |
766 | 762 | assert x.ndim == 2 |
767 | 763 | assert probs.ndim == 1 |
768 | 764 | num_tokens = x.shape[0] |
769 | | - row_mask = turbo_tokens_per_expert_to_mask(tokens_per_experts, num_tokens) |
| 765 | + row_mask = pt.ops.tokens_per_expert_to_mask(tokens_per_experts, num_tokens) |
770 | 766 | return turbo_fused_act_with_probs(x, probs, row_mask) |
771 | 767 |
772 | 768 | self.activation_func_with_probs = _activation_func_with_probs |
|
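Reviewer note: the hunks above drop the direct imports of primus_turbo.pytorch.ops.activation and primus_turbo.pytorch.ops.moe.tokens_per_expert_to_mask and call the public pt.ops entry points instead. Below is a minimal sketch of the resulting call pattern, assuming the pt.ops functions keep the signatures used in this diff (swiglu_with_probs(x, probs, row_mask) and tokens_per_expert_to_mask(tokens_per_experts, num_tokens)); the shapes, dtypes, and device are illustrative assumptions, not taken from this PR.

import torch
import primus_turbo.pytorch as pt

# Illustrative setup (assumed, not from the PR): 8 routed tokens, two experts,
# and a gated-linear-unit input of width 2 * 16 so the fused op can split the
# gate/up halves.
x = torch.randn(8, 2 * 16, device="cuda", dtype=torch.bfloat16)
probs = torch.rand(8, device="cuda", dtype=torch.float32)  # per-token routing probabilities
tokens_per_experts = torch.tensor([5, 3], device="cuda")   # token counts per expert (sums to 8)

# Mirror _activation_func_with_probs: derive a per-row validity mask from the
# expert token counts, then apply the fused SwiGLU-with-probs activation.
assert x.ndim == 2 and probs.ndim == 1
row_mask = pt.ops.tokens_per_expert_to_mask(tokens_per_experts, x.shape[0])
out = pt.ops.swiglu_with_probs(x, probs, row_mask)  # assumed output shape (8, 16) if the GLU halves the last dim

The pt.ops.geglu_with_probs branch selected for F.gelu follows the same pattern.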