
Commit a99eff3

Authored by rebel-ykchoi, rebel-jangys, rebel-yhboo, and rebel-myeongbo
fix: change MoE combine (#438)
Co-authored-by: Jangys <jangys@rebellions.ai>
Co-authored-by: yhboo <yhboo@rebellions.ai>
Co-authored-by: Myeongbo Shim <myeongbo.shim@rebellions.ai>
Co-authored-by: Jang Yeongsang <122958878+rebel-jangys@users.noreply.github.com>
1 parent 853cf33 commit a99eff3

3 files changed, 31 additions & 11 deletions


vllm_rbln/model_executor/layers/fused_moe/layer.py

Lines changed: 20 additions & 8 deletions
@@ -431,15 +431,27 @@ def fused_moe_forward_rbln(

     if self.dp_size > 1:
         # output all_reduce == dp all_reduce + tp all_reduce
-        all_hidden_states = get_dp_group().all_reduce(final_hidden_states)
-        hidden_shape_dp = (-1, 1, org_hidden_shape[-1])
-        final_hidden_states = all_hidden_states.reshape(hidden_shape_dp)
+        if envs.VLLM_RBLN_MOE_REDUCE_SCATTER:
+            hidden_shape_dp = (-1, 1, org_hidden_shape[-1])
+            all_hidden_states = final_hidden_states.reshape(hidden_shape_dp)
+            assert all_hidden_states.shape[0] % self.dp_size == 0

-        max_pad = get_forward_context().dp_metadata.max_pads_across_dp.shape[0]
-        num_tokens = org_hidden_shape[:-1].numel()  # noqa: F841
-        start = self.dp_rank * max_pad
-        end = start + num_tokens
-        final_hidden_states = final_hidden_states[start:end]
+            hidden_states = get_dp_group().reduce_scatter(all_hidden_states, dim=0)
+            max_pad = get_forward_context().dp_metadata.max_pads_across_dp.shape[0]
+            assert hidden_states.shape[0] == max_pad
+
+            num_tokens = org_hidden_shape[:-1].numel()  # noqa: F841
+            final_hidden_states = hidden_states[:num_tokens]
+        else:
+            all_hidden_states = get_dp_group().all_reduce(final_hidden_states)
+            hidden_shape_dp = (-1, 1, org_hidden_shape[-1])
+            final_hidden_states = all_hidden_states.reshape(hidden_shape_dp)
+
+            max_pad = get_forward_context().dp_metadata.max_pads_across_dp.shape[0]
+            num_tokens = org_hidden_shape[:-1].numel()  # noqa: F841
+            start = self.dp_rank * max_pad
+            end = start + num_tokens
+            final_hidden_states = final_hidden_states[start:end]

     final_hidden_states = final_hidden_states.reshape(org_hidden_shape)
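
For reference, a minimal sketch of the reduce_scatter combine idea using plain torch.distributed. The function name, the [dp_size * max_pad, 1, hidden] shape, and the direct dist call are illustrative assumptions; the actual path above goes through get_dp_group().reduce_scatter.

# Illustrative sketch only (not the vllm_rbln implementation): a reduce_scatter
# over the DP group replaces all_reduce followed by slicing out this rank's
# chunk. Assumes torch.distributed is initialized and every rank holds a
# padded tensor of shape [dp_size * max_pad, 1, hidden].
import torch
import torch.distributed as dist

def combine_with_reduce_scatter(all_hidden_states: torch.Tensor,
                                max_pad: int,
                                num_tokens: int,
                                group=None) -> torch.Tensor:
    # Each rank receives the summed values for its own max_pad-long slice,
    # instead of materializing the full all-reduced tensor on every rank.
    out = torch.empty((max_pad, *all_hidden_states.shape[1:]),
                      dtype=all_hidden_states.dtype,
                      device=all_hidden_states.device)
    dist.reduce_scatter_tensor(out, all_hidden_states, group=group)
    # Drop the padding rows that were added to equalize per-rank token counts.
    return out[:num_tokens]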

vllm_rbln/model_executor/layers/quantization/mxfp4.py

Lines changed: 3 additions & 3 deletions
@@ -410,11 +410,10 @@ def apply(
             expert_map_list = layer.expert_map.tolist()
             expert_map_const = torch.tensor(expert_map_list, dtype=torch.int32)

-            use_moe_tokens_mask = envs.VLLM_RBLN_USE_MOE_TOKENS_MASK
             tokens_mask = None
+            use_moe_tokens_mask = envs.VLLM_RBLN_USE_MOE_TOKENS_MASK
             if use_moe_tokens_mask:
-                tokens_mask = get_tokens_mask(num_tokens, 0.0, float("-inf"))
-                router_logits = router_logits + tokens_mask
+                tokens_mask = get_tokens_mask(num_tokens)

             final_hidden_states = torch.ops.rbln_custom_ops.custom_moe_glu_mxfp4(
                 hidden_states,
@@ -433,6 +432,7 @@ def apply(
                 layer.top_k,
                 layer.renormalize,
                 expert_map_const,
+                tokens_mask,
             )
         else:
             raise NotImplementedError(layer.activation)
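
The behavioral change here: the tokens mask is no longer folded into router_logits as a 0 / -inf bias; it is built once (now from num_tokens alone) and passed to the fused custom op as an extra argument. A rough illustration of the two styles follows; the mask shape and semantics are assumptions, and the real get_tokens_mask in vllm_rbln may differ.

# Rough illustration only; not the vllm_rbln get_tokens_mask implementation.
import torch

num_tokens, padded_len, num_experts = 3, 8, 4
router_logits = torch.randn(padded_len, num_experts)

# Old style: additive mask folded into the router logits (0.0 for valid
# tokens, -inf for padding) so padded tokens never win top-k routing.
additive_mask = torch.full((padded_len, 1), float("-inf"))
additive_mask[:num_tokens] = 0.0
masked_logits = router_logits + additive_mask

# New style: a standalone validity mask handed to the fused MoE op,
# leaving router_logits untouched.
tokens_mask = torch.zeros(padded_len)
tokens_mask[:num_tokens] = 1.0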

vllm_rbln/rbln_envs.py

Lines changed: 8 additions & 0 deletions
@@ -46,6 +46,7 @@
 VLLM_RBLN_DECODE_BATCH_BUCKET_MANUAL_BUCKETS: list[int] = []
 VLLM_RBLN_USE_CUSTOM_KERNEL: bool = False
 VLLM_RBLN_AUTO_PORT: bool = True
+VLLM_RBLN_MOE_REDUCE_SCATTER: bool = False


 def get_dp_impl() -> str:
@@ -254,6 +255,13 @@ def get_decode_batch_bucket_manual_buckets() -> list[int]:
             os.environ.get("RBLN_USE_CUSTOM_KERNEL", "False").lower() in ("true", "1")
         )
     ),
+    # Use reduce_scatter instead of all_reduce in MoE combine phase
+    "VLLM_RBLN_MOE_REDUCE_SCATTER": (
+        lambda: (
+            os.environ.get("VLLM_RBLN_MOE_REDUCE_SCATTER", "False").lower()
+            in ("true", "1")
+        )
+    ),
     "VLLM_RBLN_PROFILER": (
         lambda: os.environ.get("RBLN_PROFILER", "False").lower() in ("true", "1")
     ),
