Commit 7f903f7

Author: Tomer Natan (committed)
Add BF16 trtllm-gen MoE: activation_type + routing_replay_out + new BMM cubins

Cherry-pick of flashinfer-ai#2864 (squashed), plus:

- activation_type param for trtllm_bf16_moe/trtllm_bf16_routed_moe (Swiglu=3, Relu2=6)
- routing_replay_out param for the BF16 kernel (same pattern as FP8)
- Updated batched GEMM artifacts and checksums
- validateAndCastActivationType for safety
- Bf16MoeLauncher::init accepts ActivationType + routing_replay_out
1 parent f8eb66b commit 7f903f7
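In Python terms, the commit adds two keyword arguments to the BF16 MoE entry points. A minimal sketch of preparing them, assuming the shapes and dtypes enforced by the validation code added below (routing_replay_out: int16, 2D [num_tokens, top_k], on the same CUDA device as hidden_states); the import path is assumed from flashinfer/fused_moe/core.py:

```python
# Sketch: the two new kwargs added by this commit. Shapes/dtypes follow the
# TVM_FFI_ICHECK validation in trtllm_bf16_moe below; the import path is an
# assumption based on flashinfer/fused_moe/core.py.
import torch
from flashinfer.fused_moe.core import ActivationType

num_tokens, top_k = 16, 2
routing_replay_out = torch.empty(          # filled by the kernel
    (num_tokens, top_k), dtype=torch.int16, device="cuda"
)

extra_kwargs = dict(
    activation_type=ActivationType.Relu2.value,  # 6; default is Swiglu (3)
    routing_replay_out=routing_replay_out,       # or omit to skip replay capture
)
# Pass **extra_kwargs to trtllm_bf16_moe(...) / trtllm_bf16_routed_moe(...)
# alongside the usual routing and weight arguments.
```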

4 files changed (49 additions, 12 deletions)

csrc/trtllm_batched_gemm_runner.cu

Lines changed: 2 additions & 2 deletions
```diff
@@ -68,8 +68,8 @@ std::vector<int64_t> prioritizePredefinedConfigs(
   if (n /* out_dim */ == 0 && k /* in_dim */ == 0) {
     auto pred = [](BatchedGemmConfig const& config) {
       BatchedGemmOptions const& options = config.mOptions;
-      return options.mNumStages == 4 && options.mNumStagesMma == 2 && options.mTileK == 256 &&
-             options.mTileScheduler == TileScheduler::Persistent;
+      return options.mNumStagesA == 4 && options.mNumStagesB == 4 && options.mNumStagesMma == 2 &&
+             options.mTileK == 256 && options.mTileScheduler == TileScheduler::Persistent;
     };
     prioritizedIndices = bubbleUpConfig(sortedIndices, pred);
   }
```
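For readers unfamiliar with prioritizePredefinedConfigs, the pattern here is a stable partition: indices of configs matching a predicate move to the front of the candidate list without disturbing relative order. A minimal Python sketch of that pattern (Config and its field names are illustrative stand-ins mirroring the C++ BatchedGemmOptions above, not flashinfer's actual types):

```python
# Sketch of the bubbleUpConfig pattern: stable-partition candidate config
# indices so those matching the predicate are tried first. Config and its
# fields are illustrative, mirroring the C++ options in the diff above.
from dataclasses import dataclass
from typing import Callable, List

@dataclass
class Config:
    num_stages_a: int
    num_stages_b: int
    num_stages_mma: int
    tile_k: int
    persistent: bool

def bubble_up(indices: List[int], configs: List[Config],
              pred: Callable[[Config], bool]) -> List[int]:
    # Matching indices move to the front; relative order is preserved on both sides.
    return ([i for i in indices if pred(configs[i])]
            + [i for i in indices if not pred(configs[i])])

pred = lambda c: (c.num_stages_a == 4 and c.num_stages_b == 4
                  and c.num_stages_mma == 2 and c.tile_k == 256
                  and c.persistent)
```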

csrc/trtllm_fused_moe_kernel_launcher.cu

Lines changed: 20 additions & 7 deletions
```diff
@@ -476,10 +476,7 @@ class Bf16MoeLauncher : public FusedMoeLauncher {
 
   void init(std::unique_ptr<tensorrt_llm::kernels::trtllmgen_moe::MoE::MoERunnerArgs>&& args,
             int64_t tile_tokens_dim, int64_t routing_method_type, bool use_shuffled_weight,
-            int64_t weight_layout) {
-    constexpr ActivationType activation_type =
-        ActivationType::Swiglu;  // not exposed in api for now
-
+            int64_t weight_layout, ActivationType activation_type) {
     // Do base class init and perform common checks
     FusedMoeLauncher::init_common(std::move(args), tile_tokens_dim, routing_method_type,
                                   use_shuffled_weight, weight_layout, activation_type);
@@ -1670,7 +1667,8 @@ Array<Tensor> trtllm_bf16_moe(Optional<TensorView> const& routing_logits,
                               int64_t local_expert_offset, int64_t local_num_experts,
                               Optional<double> routed_scaling_factor, int64_t routing_method_type,
                               bool use_shuffled_weight, int64_t weight_layout, bool do_finalize,
-                              bool enable_pdl, Array<int64_t> moe_tactic) {
+                              bool enable_pdl, Array<int64_t> moe_tactic, int64_t activation_type,
+                              Optional<TensorView> routing_replay_out) {
   // Just some basic type validation first and leave more checks to the launcher
   if (routing_logits.has_value()) {
     TVM_FFI_ICHECK(routing_logits.value().dtype() == dl_float32 ||
@@ -1686,6 +1684,20 @@ Array<Tensor> trtllm_bf16_moe(Optional<TensorView> const& routing_logits,
 
   auto const num_tokens = hidden_states.size(0);
   auto const hidden_size = hidden_states.size(1);
+  auto const activation = validateAndCastActivationType(activation_type);
+
+  // Validate routing_replay_out if provided
+  if (routing_replay_out.has_value()) {
+    auto replay = routing_replay_out.value();
+    TVM_FFI_ICHECK(replay.device().device_type == kDLCUDA)
+        << "routing_replay_out must be a CUDA tensor";
+    TVM_FFI_ICHECK(replay.device().device_id == hidden_states.device().device_id)
+        << "routing_replay_out must be on the same device as hidden_states";
+    TVM_FFI_ICHECK(replay.ndim() == 2) << "routing_replay_out must be 2D [num_tokens, top_k]";
+    TVM_FFI_ICHECK(replay.size(1) == top_k) << "routing_replay_out dim1 must equal top_k";
+    TVM_FFI_ICHECK(encode_dlpack_dtype(replay.dtype()) == int16_code)
+        << "routing_replay_out must be int16 dtype";
+  }
 
   // Calculate supported tile sizes
   std::vector<int32_t> mSupportedTileN(Bf16MoeLauncher::mSupportedTileNums.begin(),
@@ -1719,7 +1731,8 @@ Array<Tensor> trtllm_bf16_moe(Optional<TensorView> const& routing_logits,
                                                     expert_weights, hidden_states, gemm1_weights,
                                                     gemm2_weights);
     launcher->init(std::move(args), curr_tile_N, routing_method_type, use_shuffled_weight,
-                   weight_layout);
+                   weight_layout, activation);
+    launcher->set_routing_replay_out(routing_replay_out);
 
     launchers_map[curr_tile_N] = std::move(launcher);
   }
@@ -1751,7 +1764,7 @@ Array<Tensor> trtllm_fp8_per_tensor_scale_moe(
     bool enable_pdl, Array<int64_t> config_index, int64_t activation_type) {
   // Basic type validation
   auto dtype = hidden_states.dtype();
-  auto activation = static_cast<ActivationType>(activation_type);
+  auto activation = validateAndCastActivationType(activation_type);
   if (use_routing_scales_on_input) {
     TVM_FFI_ICHECK_EQ(routing_logits.dtype(), dl_bfloat16) << "routing_logits must be bfloat16.";
   } else if (static_cast<RoutingMethodType>(routing_method_type) == RoutingMethodType::DeepSeekV3) {
```
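The body of validateAndCastActivationType is not part of this diff. A hedged Python rendering of the check it presumably performs, given the commit message's "for safety" note and the two codes this path supports (Swiglu=3, Relu2=6); the enum values come from the commit message, everything else is illustrative:

```python
# Hypothetical Python rendering of validateAndCastActivationType; the real
# helper is C++ and its body is not shown in this diff. Enum values per the
# commit message (Swiglu=3, Relu2=6); the real ActivationType may have more members.
from enum import IntEnum

class ActivationType(IntEnum):
    Swiglu = 3
    Relu2 = 6

def validate_and_cast_activation_type(activation_type: int) -> ActivationType:
    # Safer than a blind static_cast: reject unsupported codes up front.
    try:
        return ActivationType(activation_type)
    except ValueError:
        raise ValueError(
            f"Unsupported activation_type {activation_type}; "
            f"expected one of {[e.value for e in ActivationType]}"
        ) from None
```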

flashinfer/artifacts.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -137,7 +137,7 @@ class ArtifactPath:
 
     TRTLLM_GEN_FMHA: str = "f1ed60e5666a7620683a8c34a41c850a25029b35/fmha/trtllm-gen/"
     TRTLLM_GEN_BMM: str = (
-        "b55211623be7f5697c5262ffd8361fc06c147bc9/batched_gemm-b3c1646-c111d7c/"
+        "39a9d28268f43475a757d5700af135e1e58c9849/batched_gemm-5ee61af-2b9855b/"
     )
     TRTLLM_GEN_GEMM: str = (
         "1fddc48b7b48af33914d040051b3e2ee9ba4701e/gemm-145d1b1-9b113e3/"
@@ -158,7 +158,7 @@ class CheckSumHash:
         "10a54e8c3175099481aed2739ae30fa0f782368c40f9ad1b423ed8353315d65b"
     )
     TRTLLM_GEN_BMM: str = (
-        "0af823880730c4f0b3832d2208fab035946694b83444410b9309db5613d60195"
+        "db06db7f36a2a9395a2041ff6ac016fe664874074413a2ed90797f91ef17e0f6"
     )
     DEEPGEMM: str = "1a2a166839042dbd2a57f48051c82cd1ad032815927c753db269a4ed10d0ffbf"
     TRTLLM_GEN_GEMM: str = (
```
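The CheckSumHash entries pin the downloaded kernel artifacts to known digests. A minimal sketch of the SHA-256 verification such a pin enables (verify_artifact is hypothetical; flashinfer's actual download/verification code is not shown in this diff):

```python
# Sketch: verifying a downloaded artifact against a pinned SHA-256 digest.
# verify_artifact is a hypothetical helper, shown only to illustrate what
# the CheckSumHash constants above are for.
import hashlib
from pathlib import Path

TRTLLM_GEN_BMM_SHA256 = (
    "db06db7f36a2a9395a2041ff6ac016fe664874074413a2ed90797f91ef17e0f6"
)

def verify_artifact(path: Path, expected_sha256: str) -> None:
    digest = hashlib.sha256(path.read_bytes()).hexdigest()
    if digest != expected_sha256:
        raise RuntimeError(
            f"checksum mismatch for {path}: got {digest}, expected {expected_sha256}"
        )
```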

flashinfer/fused_moe/core.py

Lines changed: 25 additions & 1 deletion
```diff
@@ -1132,6 +1132,8 @@ def forward(
                 kwargs["do_finalize"],
                 kwargs["enable_pdl"],
                 [-1, -1] if tactic == -1 else tactic,
+                self.activation_type,
+                kwargs.get("routing_replay_out"),
             )
         elif (
             self.dtype_act == DtypeTrtllmGen.E4m3
@@ -1339,6 +1341,8 @@ def trtllm_bf16_moe_op(
     do_finalize: bool = True,
     enable_pdl: Optional[bool] = None,
     tune_max_num_tokens: int = 8192,
+    activation_type: int = ActivationType.Swiglu.value,
+    routing_replay_out: Optional[torch.Tensor] = None,
 ) -> List[torch.Tensor]:
     assert routing_logits is not None or topk_ids is not None, (
         "either routing_logits or topk_ids must be provided"
@@ -1387,7 +1391,7 @@ def trtllm_bf16_moe_op(
         intermediate_size=intermediate_size,
         weight_layout=weight_layout,
         use_shuffled_weight=use_shuffled_weight,
-        activation_type=ActivationType.Swiglu,  # Default for BF16
+        activation_type=activation_type,
     )
 
     inputs = [output, routing_logits, topk_ids, expert_weights, hidden_states]
@@ -1411,6 +1415,8 @@ def trtllm_bf16_moe_op(
         weight_layout=weight_layout,
         do_finalize=do_finalize,
         enable_pdl=enable_pdl,
+        activation_type=activation_type,
+        routing_replay_out=routing_replay_out,
     )
 
     # Call the C++ function with the selected tactic
@@ -1437,6 +1443,8 @@ def trtllm_bf16_moe_op(
             do_finalize,
             enable_pdl,
             [-1, -1] if tactic == -1 else tactic,
+            activation_type,
+            routing_replay_out,
         )
     if do_finalize:
         return [output]
@@ -1469,6 +1477,8 @@ def _fake_trtllm_bf16_moe(
     do_finalize: bool = True,
     enable_pdl: Optional[bool] = None,
     tune_max_num_tokens: int = 8192,
+    activation_type: int = ActivationType.Swiglu.value,
+    routing_replay_out: Optional[torch.Tensor] = None,
 ) -> List[torch.Tensor]:
     seq_len = hidden_states.shape[0]
     hidden_size = hidden_states.shape[1]
@@ -2265,6 +2275,8 @@ def trtllm_bf16_moe(
     do_finalize: bool = True,
     enable_pdl: bool = True,
     tune_max_num_tokens: int = 8192,
+    activation_type: int = ActivationType.Swiglu.value,
+    routing_replay_out: Optional[torch.Tensor] = None,
 ) -> Union[List[torch.Tensor], torch.Tensor]:
     """BF16 MoE operation with autotuning support.
 
@@ -2302,6 +2314,9 @@ def trtllm_bf16_moe(
         do_finalize: Whether to finalize the output (default: True).
         enable_pdl: Whether to enable Programmatic Dependent Launch. Auto-enabled for >= sm90.
         tune_max_num_tokens: Maximum number of tokens for autotuning (default: 8192).
+        activation_type (int): Type of activation function (default: 3 - Swiglu)
+            - 3: Swiglu
+            - 6: Relu2
 
     Returns:
         when do_finalize=True, returns the final MoE output.
```
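For reference, the two activation codes correspond to the following elementwise math inside the gated FFN; this is the commonly assumed semantics, not something stated in this diff (the authoritative implementation is the fused CUDA kernel):

```python
# Assumed semantics of the two supported activation codes (reference only;
# the kernel computes these fused in CUDA, not via these functions).
import torch
import torch.nn.functional as F

def swiglu(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor:
    return F.silu(gate) * up            # activation_type = 3 (Swiglu)

def relu2(x: torch.Tensor) -> torch.Tensor:
    return torch.square(F.relu(x))      # activation_type = 6 (Relu2)
```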
```diff
@@ -2329,6 +2344,8 @@ def trtllm_bf16_moe(
         do_finalize,
         enable_pdl,
         tune_max_num_tokens,
+        activation_type,
+        routing_replay_out,
     )
 
     if do_finalize:
@@ -2360,6 +2377,8 @@ def trtllm_bf16_routed_moe(
     do_finalize: bool = True,
     enable_pdl: bool = True,
     tune_max_num_tokens: int = 8192,
+    activation_type: int = ActivationType.Swiglu.value,
+    routing_replay_out: Optional[torch.Tensor] = None,
 ) -> List[torch.Tensor]:
     """BF16 MoE operation with autotuning support.
 
@@ -2396,6 +2415,9 @@ def trtllm_bf16_routed_moe(
         do_finalize: Whether to finalize the output (default: True).
         enable_pdl: Whether to enable Programmatic Dependent Launch. Auto-enabled for >= sm90.
         tune_max_num_tokens: Maximum number of tokens for autotuning (default: 8192).
+        activation_type (int): Type of activation function (default: 3 - Swiglu)
+            - 3: Swiglu
+            - 6: Relu2
 
     Returns:
         when do_finalize=True, returns the final MoE output.
@@ -2423,6 +2445,8 @@ def trtllm_bf16_routed_moe(
         do_finalize,
         enable_pdl,
         tune_max_num_tokens,
+        activation_type,
+        routing_replay_out,
     )
 
     if do_finalize:
```
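Putting the two features together, the apparent intent of routing_replay_out is to capture routing decisions from one call and replay them through the routed entry point. A hedged sketch of that flow; the int16-expert-id interpretation and the int32 topk_ids conversion are inferences from this diff, not documented behavior:

```python
# Sketch: reusing a captured routing decision. Assumptions: the kernel fills
# routing_replay_out with per-token expert ids (int16, [num_tokens, top_k]),
# and the routed entry point consumes int32 topk_ids; both are inferences
# from this diff rather than documented behavior.
import torch

num_tokens, top_k = 16, 2
routing_replay_out = torch.empty(
    (num_tokens, top_k), dtype=torch.int16, device="cuda"
)

# 1) First call: pass routing_replay_out=routing_replay_out to trtllm_bf16_moe
#    so the kernel records which experts each token was routed to.
# 2) Replay: hand the captured ids back as topk_ids.
topk_ids = routing_replay_out.to(torch.int32)
# trtllm_bf16_routed_moe(..., topk_ids=topk_ids, ...)
```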
