Skip to content

Commit a60592f

Browse files
fix moe data parallel for v1 engine (#252)
* fix dp for v1 - remove DP padding support in v1 worker - add validation for DP implementation constraints in v1 worker - apply token mask to custom MOE kernel router logits - update default environment variables: - VLLM_RBLN_DP_IMPL: "dummy_prefill" -> "padded_decode" - VLLM_RBLN_USE_MOE_TOKENS_MASK: False -> True - fix DP metadata handling in forward context - add is_prefills field to RBLNFlashAttentionMetadata * fix test_rbln_envs.py - VLLM_RBLN_DP_IMPL should be padded_decode by default * fix DPMetadata for tokens mask - remove is_prefills field and related logic from DP metadata - fix get_tokens_mask() for non-DP case --------- Co-authored-by: rebel-jonghewk <142865404+rebel-jonghewk@users.noreply.github.com>
1 parent cae35d7 commit a60592f

6 files changed

Lines changed: 39 additions & 22 deletions

File tree

tests/torch_compile/common/test_rbln_envs.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,8 @@ def test_rbln_envs():
4949
), f"Expected VLLM_RBLN_DISABLE_MM to be False, \
5050
got {rbln_envs.VLLM_RBLN_DISABLE_MM}"
5151

52-
assert (rbln_envs.VLLM_RBLN_DP_IMPL == "dummy_prefill"
53-
), f"Expected VLLM_RBLN_DP_IMPL to be dummy_prefill, \
52+
assert (rbln_envs.VLLM_RBLN_DP_IMPL == "padded_decode"
53+
), f"Expected VLLM_RBLN_DP_IMPL to be padded_decode, \
5454
got {rbln_envs.VLLM_RBLN_DP_IMPL}"
5555

5656
assert (not rbln_envs.VLLM_RBLN_ENFORCE_MODEL_FP32

vllm_rbln/forward_context.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -54,12 +54,6 @@ def make(
5454
# for v0 attention backends
5555
batchsize = attn_metadata.num_prefill_tokens + \
5656
attn_metadata.num_decode_tokens
57-
58-
disable_dp = dp_size == 1
59-
use_dummy_prefill = envs.VLLM_RBLN_DP_IMPL == "dummy_prefill"
60-
if (disable_dp or use_dummy_prefill) and \
61-
attn_metadata.num_decode_tokens > 0:
62-
max_pad = scheduler_config.max_num_seqs
6357
else:
6458
# for v1 attention backends or no attn_metadata
6559
batchsize = num_tokens

vllm_rbln/model_executor/layers/fused_moe/layer.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -238,13 +238,17 @@ def unquantized_fused_moe_method_rbln(
238238
return final_hidden_states.reshape(orig_shape)
239239

240240

241-
def _get_tokens_mask():
242-
num_tokens = \
241+
def get_tokens_mask(num_tokens: int, left=1.0, right=float('-inf')):
242+
num_tokens_across_dp = \
243243
get_forward_context().dp_metadata.num_tokens_across_dp_cpu
244-
num_tokens = num_tokens.unsqueeze(1)
245-
max_pad = get_forward_context().dp_metadata.max_pads_across_dp
244+
num_tokens_across_dp = num_tokens_across_dp.unsqueeze(1)
245+
if num_tokens_across_dp.size(0) == 1:
246+
max_pad = num_tokens
247+
else:
248+
max_pad = get_forward_context().dp_metadata.max_pads_across_dp
246249
pos = torch.arange(max_pad, dtype=torch.int32).unsqueeze(0) # [1, max_pad]
247-
tokens_mask = torch.where(pos < num_tokens, 1.0, 0.0) # [dp_size, max_pad]
250+
tokens_mask = torch.where(pos < num_tokens_across_dp, left,
251+
right) # [dp_size, max_pad]
248252
tokens_mask = tokens_mask.reshape(-1, 1) #[dp_size * max_pad, 1]
249253
return tokens_mask
250254

@@ -268,7 +272,7 @@ def get_masked_routing_weights(router_logits, top_k, renormalize, expert_map):
268272

269273
use_moe_tokens_mask = envs.VLLM_RBLN_USE_MOE_TOKENS_MASK
270274
if use_moe_tokens_mask:
271-
tokens_mask = _get_tokens_mask()
275+
tokens_mask = get_tokens_mask(router_logits.shape[0], 1.0, 0.0)
272276
selected_weights = selected_weights * tokens_mask
273277

274278
n_expert = router_logits.shape[1]
@@ -393,6 +397,11 @@ def unquantized_fused_optimize_moe_method_custom(
393397
expert_map_list = expert_map.tolist()
394398
expert_map_const = torch.tensor(expert_map_list, dtype=torch.int32)
395399

400+
use_moe_tokens_mask = envs.VLLM_RBLN_USE_MOE_TOKENS_MASK
401+
if use_moe_tokens_mask:
402+
tokens_mask = get_tokens_mask(num_tokens)
403+
router_logits = router_logits * tokens_mask
404+
396405
# optimum-rbln/src/optimum/rbln/transformers/models/qwen3_moe/
397406
# qwen3_moe_architecture.py
398407
final_hidden_states = torch.ops.rbln_custom_ops.custom_moe_glu(

vllm_rbln/rbln_envs.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,8 @@
2626
VLLM_RBLN_USE_VLLM_MODEL: bool = False
2727
VLLM_RBLN_FLASH_CAUSAL_ATTN: bool = True
2828
VLLM_RBLN_DISABLE_MM: bool = False
29-
VLLM_RBLN_DP_IMPL: str = "dummy_prefill"
30-
VLLM_RBLN_USE_MOE_TOKENS_MASK: bool = False
29+
VLLM_RBLN_DP_IMPL: str = "padded_decode"
30+
VLLM_RBLN_USE_MOE_TOKENS_MASK: bool = True
3131
VLLM_RBLN_ENFORCE_MODEL_FP32: bool = False
3232
VLLM_RBLN_MOE_CUSTOM_KERNEL: bool = True
3333
VLLM_RBLN_MOE_USE_OPT_KERNEL: bool = False
@@ -41,8 +41,9 @@
4141
def get_dp_impl():
4242
dp_impl = os.environ.get("VLLM_RBLN_DP_IMPL")
4343
if dp_impl is None:
44-
return "dummy_prefill"
45-
# default is dummy_prefill
44+
return "padded_decode"
45+
# default is padded_decode
46+
# dummy_prefill will be deprecated in the future
4647
choices = set(["padded_decode", "dummy_prefill"])
4748
current_impl = dp_impl.lower()
4849
if current_impl not in choices:
@@ -90,8 +91,9 @@ def get_dp_impl():
9091
"VLLM_RBLN_DP_IMPL":
9192
get_dp_impl,
9293
# If true, it uses the tokens mask applied to moe expert kernel
93-
"VLLM_RBLN_USE_MOE_TOKENS_MASK": (lambda: os.environ.get(
94-
"VLLM_RBLN_USE_MOE_TOKENS_MASK", "False").lower() in ("true", "1")),
94+
"VLLM_RBLN_USE_MOE_TOKENS_MASK":
95+
(lambda: os.environ.get("VLLM_RBLN_USE_MOE_TOKENS_MASK", "True").lower() in
96+
("true", "1")),
9597
# enforce model data type into fp32 not model_config.dtype
9698
"VLLM_RBLN_ENFORCE_MODEL_FP32":
9799
(lambda: os.environ.get("VLLM_RBLN_ENFORCE_MODEL_FP32", "False").lower() in

vllm_rbln/v1/worker/rbln_model_runner.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1255,8 +1255,8 @@ def _preprocess(
12551255
num_input_tokens = num_scheduled_tokens
12561256

12571257
# Padding for DP
1258-
num_pad, num_tokens_across_dp = self.get_dp_padding(num_input_tokens)
1259-
num_input_tokens += num_pad
1258+
# NOTE(RBLN): RBLN does not support DP padding
1259+
num_tokens_across_dp = None
12601260

12611261
# _prepare_inputs may reorder the batch, so we must gather multi
12621262
# modal outputs after that to ensure the correct order

vllm_rbln/v1/worker/rbln_worker.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,18 @@ def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None:
241241
self.model_runner.initialize_kv_cache(kv_cache_config)
242242

243243
def compile_or_warm_up_model(self) -> None:
244+
if self.parallel_config.data_parallel_size > 1:
245+
if envs.VLLM_RBLN_DP_IMPL == "padded_decode":
246+
max_num_batched_tokens = \
247+
self.scheduler_config.max_num_batched_tokens
248+
max_num_seqs = self.scheduler_config.max_num_seqs
249+
# TODO: consider relaxing this constraint
250+
assert max_num_batched_tokens % max_num_seqs == 0, \
251+
"max_num_batched_tokens must be divisible by max_num_seqs"
252+
elif envs.VLLM_RBLN_DP_IMPL == "dummy_prefill":
253+
raise ValueError("dummy_prefill is not supported in v1 worker " \
254+
"and will be deprecated in the future")
255+
244256
if (self.model_config.enforce_eager or not envs.VLLM_RBLN_COMPILE_MODEL
245257
or not envs.VLLM_RBLN_ENABLE_WARM_UP):
246258
logger.warning("skipping compile_or_warm_up_model")

0 commit comments

Comments (0)