Skip to content
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
46205b4
Using ActivationType instead of GatedActType, added compiled kernels,…
amitz-nv Jan 28, 2026
b8eac34
Add actType and eltwiseActType to 'no kernel found' message, move is_…
amitz-nv Jan 28, 2026
f771e0c
Update remaining GatedActType uses to ActivationType, remove GatedAct…
amitz-nv Jan 28, 2026
440c062
Use ActivationType in benchmarks, add missing activation_type argument
amitz-nv Jan 28, 2026
5725739
Minor fixes
amitz-nv Jan 28, 2026
c2c8531
Fix activation_type default value to Swiglu on trtllm_fp4_block_scale…
amitz-nv Jan 28, 2026
bb4e821
Minor improvement
amitz-nv Jan 28, 2026
c6ac4af
Support non-gated activation in NVFP4 block scale MoE
amitz-nv Jan 28, 2026
3bf918e
Rename useShuffledMatrixA to useShuffledMatrix (remove the 'A' suffix)
amitz-nv Jan 28, 2026
1193b02
Add FP4_NVFP4_NVFP4 parameterization to test_llama4_routing, update t…
amitz-nv Jan 28, 2026
b0e6d59
Increase supported topK and num experts in deepseek routing for nemotron
amitz-nv Jan 28, 2026
d4182ae
Commit more files for increase supported topK and num experts in deep…
amitz-nv Jan 28, 2026
8ee2193
Fix formatting
amitz-nv Jan 28, 2026
c899d16
Change TODO to comment
amitz-nv Jan 28, 2026
0f6f15c
Change default activation_type to Swiglu
amitz-nv Jan 28, 2026
cf6f76b
Restore intermediate size factor of 2 for gated activation in getWork…
amitz-nv Jan 28, 2026
e63e17d
Formatting fixes
amitz-nv Jan 28, 2026
8398e20
Treat SwigluBias as gated activation
amitz-nv Jan 28, 2026
ea67cef
Fix use of ActivationType enum in CLI
amitz-nv Jan 28, 2026
abefe22
Fix activation-type command line argument handling in benchmarks
amitz-nv Jan 29, 2026
da35764
Fix choices of activation-type command line argument handling in benc…
amitz-nv Jan 29, 2026
205989f
GEMM (non batched) still has mUseShuffledMatrixA member (with 'A' suf…
amitz-nv Jan 29, 2026
e467f1d
Update bench_trtllm_gen_fused_moe_autotuner.py to support more activa…
amitz-nv Jan 29, 2026
80d1b53
Revert activation_Type check in bench_trtllm_gen_fused_moe_autotuner.…
amitz-nv Jan 29, 2026
21e0e08
Include activation type in results in benchmarks/routings/moe.py
amitz-nv Jan 29, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 17 additions & 2 deletions benchmarks/bench_trtllm_gen_fused_moe_autotuner.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import numpy as np
from flashinfer import (
RoutingMethodType,
GatedActType,
ActivationType,
fp4_quantize,
mxfp8_quantize,
)
Expand Down Expand Up @@ -39,6 +39,7 @@ def bench_trtllm_gen_fused_moe_autotuner_fp8(
top_k: int,
warmups: int,
iterations: int,
activation_type: ActivationType,
):
device = torch.device("cuda:0")
enable_pdl = device_support_pdl(device)
Expand Down Expand Up @@ -97,6 +98,10 @@ def bench_trtllm_gen_fused_moe_autotuner_fp8(
)

if is_block_scale:
if activation_type != ActivationType.Swiglu:
raise ValueError(
"Only Swiglu activation is supported for FP8 block scale MoE."
)
fn = lambda: trtllm_fp8_block_scale_moe(
routing_logits,
routing_bias,
Expand Down Expand Up @@ -144,6 +149,7 @@ def bench_trtllm_gen_fused_moe_autotuner_fp8(
RoutingMethodType.TopK.value,
enable_pdl,
num_tokens if tune_max_num_tokens is None else tune_max_num_tokens,
activation_type.value,
)

def bench(do_autotune):
Expand Down Expand Up @@ -288,7 +294,7 @@ def bench_trtllm_gen_fused_moe_autotuner_fp4(
RoutingMethodType.Renormalize.value,
True,
enable_pdl,
GatedActType.SwiGlu.value, # gated_act_type
ActivationType.Swiglu.value, # act_type
None,
num_tokens if tune_max_num_tokens is None else tune_max_num_tokens,
)
Expand Down Expand Up @@ -348,6 +354,14 @@ def bench(do_autotune):
parser.add_argument(
"--iterations", type=int, default=100, help="Number of benchmark iterations"
)
parser.add_argument(
"--activation-type",
type=ActivationType,
choices=list(ActivationType),
required=False,
default=ActivationType.Swiglu,
help=f"Type of gated activation function: {list(ActivationType)}",
)
Comment thread
coderabbitai[bot] marked this conversation as resolved.
args = parser.parse_args()
if args.quant_mode in ["Fp8-Per-Tensor", "Fp8-Block"]:
bench_trtllm_gen_fused_moe_autotuner_fp8(
Expand All @@ -360,6 +374,7 @@ def bench(do_autotune):
args.top_k,
args.warmups,
args.iterations,
args.activation_type,
)
else:
bench_trtllm_gen_fused_moe_autotuner_fp4(
Expand Down
25 changes: 11 additions & 14 deletions benchmarks/routines/moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import torch

import flashinfer
from flashinfer import ActivationType
from flashinfer.autotuner import autotune
from flashinfer.fused_moe import (
WeightLayout,
Expand Down Expand Up @@ -175,12 +176,12 @@ def parse_moe_args(line, parser):
help="Data type of the weights (before quantization).",
)
parser.add_argument(
"--gated_act",
type=str,
"--activation-type",
type=ActivationType,
choices=list(ActivationType),
required=False,
default="swiglu",
choices=["swiglu", "geglu"],
help="Type of gated activation function: swiglu | geglu.",
default=ActivationType.Swiglu,
help=f"Type of gated activation function: {list(ActivationType)}",
)
Comment thread
coderabbitai[bot] marked this conversation as resolved.
parser.add_argument(
"--autotune",
Expand Down Expand Up @@ -247,13 +248,6 @@ def parse_moe_args(line, parser):
}
args.routing_method_type = routing_method_name_to_type[args.routing_method]

# Normalize gated act type (map string to internal int expected by kernels)
gated_act_name_to_type = {
"swiglu": 0,
"geglu": 1,
}
args.gated_act_type = gated_act_name_to_type[args.gated_act]

if args.verbose >= 1:
print(f"[INFO] {args = }")
return args
Expand Down Expand Up @@ -630,7 +624,7 @@ def testTrtllmFp4BlockScaleMoe(args):
use_shuffled_weight = args.use_shuffled_weight
weight_layout = args.weight_layout
is_cuda_graph_compatible = not args.no_cuda_graph
gated_act_type = args.gated_act_type
activation_type = args.activation_type
Comment thread
coderabbitai[bot] marked this conversation as resolved.
res = []

backends = ["trtllm"]
Expand Down Expand Up @@ -795,7 +789,7 @@ def run_fp4_moe(
local_num_experts=local_num_experts,
routed_scaling_factor=routed_scaling_factor,
routing_method_type=routing_method_type,
gated_act_type=gated_act_type,
activation_type=activation_type.value,
do_finalize=True,
)

Expand Down Expand Up @@ -1671,6 +1665,7 @@ def run_fp8_per_tensor_moe(
output1_scales_gate_scalar,
gemm2_weights_fp8,
output2_scales_scalar,
activation_type,
):
# Note: FP8 per-tensor MOE expects int64_t for n_group/topk_group, not Optional[int64_t]
# So we convert None to 0 to indicate "no groups" mode
Expand All @@ -1693,6 +1688,7 @@ def run_fp8_per_tensor_moe(
routed_scaling_factor=routed_scaling_factor,
use_routing_scales_on_input=use_routing_scales_on_input,
routing_method_type=routing_method_type,
activation_type=activation_type.value,
)

# Benchmark timing
Expand All @@ -1713,6 +1709,7 @@ def run_fp8_per_tensor_moe(
output1_scales_gate_scalar,
gemm2_weights_fp8,
output2_scales_scalar,
args.activation_type,
),
)

Expand Down
10 changes: 8 additions & 2 deletions csrc/trtllm_batched_gemm_runner.cu
Original file line number Diff line number Diff line change
Expand Up @@ -101,14 +101,16 @@ TrtllmGenBatchedGemmRunner::TrtllmGenBatchedGemmRunner(
options.mTransposeMmaOutput == mOptions.transposeMmaOutput &&
(!doesRouteImplUseNoRoute(options.mRouteImpl)) == mOptions.routeAct &&
options.mFusedAct == mOptions.fusedAct && options.mIsStaticBatch == mOptions.staticBatch &&
tileSize == mOptions.tileSize &&
options.mUseShuffledMatrix == mOptions.useShuffledMatrixA &&
tileSize == mOptions.tileSize && options.mUseShuffledMatrix == mOptions.useShuffledMatrix &&
options.mLayoutA == mOptions.weightLayout) {
if (options.mFusedAct) {
if (options.mActType != static_cast<batchedGemm::gemmGatedAct::ActType>(mOptions.actType)) {
continue;
}
}
if ((int64_t)options.mEltwiseActType != (int64_t)mOptions.eltwiseActType) {
continue;
}

if (mOptions.transposeMmaOutput && options.mEpilogueTileM == mOptions.epilogueTileM) {
mPassingConfigIndices.push_back(i);
Expand All @@ -122,6 +124,8 @@ TrtllmGenBatchedGemmRunner::TrtllmGenBatchedGemmRunner(
<< ", mDtypeB: " << tg::dtypeToString(mOptions.dtypeB)
<< ", mDtypeC: " << tg::dtypeToString(mOptions.dtypeC)
<< ", mUseDeepSeekFp8: " << mOptions.deepSeekFp8
<< ", mActType: " << (int64_t)mOptions.actType
<< ", mEltwiseActType: " << (int64_t)mOptions.eltwiseActType
<< ", mTransposeMmaOutput: " << mOptions.transposeMmaOutput
<< ", mRouteAct: " << mOptions.routeAct << ", mFusedAct: " << mOptions.fusedAct
<< ", mIsStaticBatch: " << mOptions.staticBatch << ", mTileSize: " << mOptions.tileSize;
Expand Down Expand Up @@ -219,6 +223,8 @@ void TrtllmGenBatchedGemmRunner::run(
gemmData.mInputBuffers.mPtrSfB = mOptions.transposeMmaOutput ? sfA : sfB;
gemmData.mInputBuffers.mPtrScaleC = scaleC;
gemmData.mInputBuffers.mPtrScaleGate = scaleGateC;
// For simplicity, set scaleAct to the same buffer as scaleGateC
gemmData.mInputBuffers.mPtrScaleAct = scaleGateC;
Copy link
Copy Markdown
Contributor Author

@amitz-nv amitz-nv Jan 7, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Decide whether it's OK or fix in the future?

gemmData.mInputBuffers.mPtrPerTokenSfA =
mOptions.transposeMmaOutput ? perTokensSfB : perTokensSfA;
gemmData.mInputBuffers.mPtrPerTokenSfB =
Expand Down
Loading
Loading