Skip to content

Commit 5f2751e

Browse files
committed
support Sigmoid (sigmoid+topk) routing function
Signed-off-by: EdalatiAli <aliedalati@cohere.com>
1 parent 97b6f0e commit 5f2751e

6 files changed

Lines changed: 147 additions & 5 deletions

File tree

csrc/trtllm_fused_moe_kernel_launcher.cu

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -887,7 +887,9 @@ class Fp8BlockScaleLauncher : public FusedMoeLauncher {
887887
static_cast<RoutingMethodType>(routing_method_type) ==
888888
RoutingMethodType::RenormalizeNaive ||
889889
static_cast<RoutingMethodType>(routing_method_type) ==
890-
RoutingMethodType::SigmoidRenorm) {
890+
RoutingMethodType::SigmoidRenorm ||
891+
static_cast<RoutingMethodType>(routing_method_type) ==
892+
RoutingMethodType::Sigmoid) {
891893
TVM_FFI_ICHECK(args->top_k <= 10 && args->top_k > 0)
892894
<< "Current routing kernel (no groups) only supports top_k<=10 && top_k>0.";
893895
} else if (static_cast<RoutingMethodType>(routing_method_type) == RoutingMethodType::Llama4) {

csrc/trtllm_fused_moe_runner.cu

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,8 @@ void Runner::run(void* routingLogits, void* routingBias, int32_t numTokens, int3
138138
|| routingMethodType == RoutingMethodType::Renormalize /* TopK -> Softmax */
139139
|| routingMethodType == RoutingMethodType::RenormalizeNaive /* Softmax -> TopK -> Renormalize */
140140
|| routingMethodType == RoutingMethodType::TopK /* TopK only (no softmax) */
141-
|| routingMethodType == RoutingMethodType::SigmoidRenorm /* Sigmoid -> TopK -> Renormalize */) {
141+
|| routingMethodType == RoutingMethodType::SigmoidRenorm /* Sigmoid -> TopK -> Renormalize */
142+
|| routingMethodType == RoutingMethodType::Sigmoid /* Sigmoid -> TopK */) {
142143
using namespace moe::dev::routing;
143144
routingCustom::Data routingData;
144145

@@ -164,6 +165,11 @@ void Runner::run(void* routingLogits, void* routingBias, int32_t numTokens, int3
164165
routingData.mPreprocessType = RoutingPreprocessType::Sigmoid;
165166
routingData.mPostprocessType = RoutingPostprocessType::SumNormalize;
166167
routingData.mNormTopkProb = normTopkProb;
168+
} else if (routingMethodType == RoutingMethodType::Sigmoid) {
169+
// Sigmoid -> TopK (no renormalization)
170+
routingData.mPreprocessType = RoutingPreprocessType::Sigmoid;
171+
routingData.mPostprocessType = RoutingPostprocessType::SumNormalize;
172+
routingData.mNormTopkProb = false;
167173
} else if (routingMethodType == RoutingMethodType::Renormalize ||
168174
routingMethodType == RoutingMethodType::RenormalizeNaive) {
169175
// TopK -> Softmax (also used for RenormalizeNaive, see comment above)

flashinfer/fused_moe/core.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,10 @@ class RoutingMethodType(IntEnum):
7373
TopK = (5,)
7474
# SigmoidRenorm: Sigmoid -> TopK -> Renormalize (divide by sum of top-K weights)
7575
SigmoidRenorm = (6,)
76+
# Sigmoid: Sigmoid -> TopK (no renormalization)
77+
Sigmoid = (7,)
7678
# Unspecified
77-
Unspecified = (7,)
79+
Unspecified = (8,)
7880

7981

8082
# Copied from csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/include/common.h
@@ -2302,6 +2304,8 @@ def trtllm_bf16_moe(
23022304
- 2: DeepSeekV3 (Sigmoid -> RoutingBiasAdd -> Top2 in group -> Top4 groups -> Top8 experts)
23032305
- 3: Llama4 (Top1 -> Sigmoid)
23042306
- 4: RenormalizeNaive (Softmax -> TopK -> Renormalize)
2307+
- 6: SigmoidRenorm (Sigmoid -> TopK -> Renormalize)
2308+
- 7: Sigmoid (Sigmoid -> TopK)
23052309
use_shuffled_weight: Whether to use shuffled weight layout for optimization (default: True).
23062310
weight_layout: Weight layout format (default: WeightLayout.BlockMajorK).
23072311
- 0: MajorK - K-major layout [Mn, K]
@@ -2397,6 +2401,8 @@ def trtllm_bf16_routed_moe(
23972401
- 2: DeepSeekV3 (Sigmoid -> RoutingBiasAdd -> Top2 in group -> Top4 groups -> Top8 experts)
23982402
- 3: Llama4 (Top1 -> Sigmoid)
23992403
- 4: RenormalizeNaive (Softmax -> TopK -> Renormalize)
2404+
- 6: SigmoidRenorm (Sigmoid -> TopK -> Renormalize)
2405+
- 7: Sigmoid (Sigmoid -> TopK)
24002406
use_shuffled_weight: Whether to use shuffled weight layout for optimization (default: True).
24012407
weight_layout: Weight layout format (default: WeightLayout.BlockMajorK).
24022408
- 0: MajorK - K-major layout [Mn, K]
@@ -2832,6 +2838,8 @@ def trtllm_fp4_block_scale_moe(
28322838
- 2: DeepSeekV3 (Sigmoid -> RoutingBiasAdd -> Top2 in group -> Top4 groups -> Top8 experts)
28332839
- 3: Llama4 (Top1 -> Sigmoid)
28342840
- 4: RenormalizeNaive (Softmax -> TopK -> Renormalize)
2841+
- 6: SigmoidRenorm (Sigmoid -> TopK -> Renormalize)
2842+
- 7: Sigmoid (Sigmoid -> TopK)
28352843
do_finalize (bool): Whether to finalize the output (default: False)
28362844
enable_pdl (Optional[bool]): Whether to enable Programmatic Dependent Launch (PDL). Auto-enabled for >= sm90.
28372845
activation_type (int): Type of activation function (default: 3 - Swiglu)
@@ -2967,6 +2975,8 @@ def trtllm_fp4_block_scale_routed_moe(
29672975
- 2: DeepSeekV3 (Sigmoid -> RoutingBiasAdd -> Top2 in group -> Top4 groups -> Top8 experts)
29682976
- 3: Llama4 (Top1 -> Sigmoid)
29692977
- 4: RenormalizeNaive (Softmax -> TopK -> Renormalize)
2978+
- 6: SigmoidRenorm (Sigmoid -> TopK -> Renormalize)
2979+
- 7: Sigmoid (Sigmoid -> TopK)
29702980
do_finalize (bool): Whether to finalize the output (default: False)
29712981
activation_type (int): Type of activation function (default: 3 - Swiglu)
29722982
- 3: Swiglu
@@ -3082,6 +3092,8 @@ def trtllm_mxint4_block_scale_moe(
30823092
- 2: DeepSeekV3 (Sigmoid -> RoutingBiasAdd -> Top2 in group -> Top4 groups -> Top8 experts)
30833093
- 3: Llama4 (Top1 -> Sigmoid)
30843094
- 4: RenormalizeNaive (Softmax -> TopK -> Renormalize)
3095+
- 6: SigmoidRenorm (Sigmoid -> TopK -> Renormalize)
3096+
- 7: Sigmoid (Sigmoid -> TopK)
30853097
do_finalize (bool): Whether to finalize the output (default: False)
30863098
enable_pdl (Optional[bool]): Whether to enable Programmatic Dependent Launch (PDL). Auto-enabled for >= sm90.
30873099
tune_max_num_tokens(int): Maximum number of tokens for tuning. (default: 8192)

include/flashinfer/trtllm/fused_moe/RoutingCustomPolicy.cuh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -453,7 +453,8 @@ struct PolicyTraits<NoOpPreprocess, SoftmaxPostprocess> {
453453
>;
454454
};
455455

456-
/// Sigmoid + SumNormalize (SigmoidRenorm: Sigmoid -> TopK -> Renormalize).
456+
/// Sigmoid + SumNormalize (SigmoidRenorm: Sigmoid -> TopK -> Renormalize,
457+
/// Sigmoid: Sigmoid -> TopK with normTopkProb=false).
457458
template <>
458459
struct PolicyTraits<SigmoidPreprocess, SumNormalizePostprocess> {
459460
using Pairs = TierList<Tier<128, 8>, // Small expert counts (≤128 experts)

include/flashinfer/trtllm/fused_moe/runner.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,10 @@ enum class RoutingMethodType : int64_t {
5050
TopK = 5,
5151
// SigmoidRenorm: Sigmoid -> TopK -> Renormalize (divide by sum of top-K weights)
5252
SigmoidRenorm = 6,
53+
// Sigmoid: Sigmoid -> TopK (no renormalization)
54+
Sigmoid = 7,
5355
// Unspecified
54-
Unspecified = 7,
56+
Unspecified = 8,
5557
};
5658

5759
inline int32_t maybeGetMinTokenCount(int32_t numPaddedTokens, int32_t hiddenSize,
@@ -77,6 +79,8 @@ inline std::string serializeMoeRoutingMethodType(RoutingMethodType routingMethod
7779
return "TopK";
7880
case RoutingMethodType::SigmoidRenorm:
7981
return "SigmoidRenorm";
82+
case RoutingMethodType::Sigmoid:
83+
return "Sigmoid";
8084
default:
8185
return "InvalidRountingMethod"; // TODO throw error
8286
};

tests/moe/test_trtllm_gen_fused_moe.py

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2748,6 +2748,10 @@ def run_moe_test(
27482748
permute_info, scores = routing_reference_sigmoid_renorm(
27492749
expert_logits, top_k, num_experts, padding, norm_topk_prob=norm_topk_prob
27502750
)
2751+
elif routing_method_type == RoutingMethodType.Sigmoid:
2752+
permute_info, scores = routing_reference_sigmoid_renorm(
2753+
expert_logits, top_k, num_experts, padding, norm_topk_prob=False
2754+
)
27512755
elif routing_method_type == RoutingMethodType.Llama4:
27522756
permute_info, scores = routing_reference_no_aux(
27532757
expert_logits,
@@ -3242,6 +3246,119 @@ def test_sigmoid_renorm_routing(
32423246
)
32433247

32443248

3249+
# Test: Sigmoid routing (Sigmoid -> TopK, no renormalization)
3250+
@pytest.mark.parametrize("num_tokens", [8, 768, 3072])
3251+
@pytest.mark.parametrize("hidden_size", [1024])
3252+
@pytest.mark.parametrize("intermediate_size", [1024, 768, 512, 384])
3253+
@pytest.mark.parametrize(
3254+
"moe_impl",
3255+
[
3256+
pytest.param(BF16Moe(), id="BF16xBF16"),
3257+
pytest.param(
3258+
FP8BlockScaleMoe(fp8_quantization_type=QuantMode.FP8_BLOCK_SCALE_DEEPSEEK),
3259+
id="FP8_Block_DeepSeek",
3260+
),
3261+
pytest.param(
3262+
FP8BlockScaleMoe(fp8_quantization_type=QuantMode.FP8_BLOCK_SCALE_MXFP8),
3263+
id="FP8_Block_MxFp8",
3264+
),
3265+
pytest.param(FP8PerTensorMoe(), id="FP8_Tensor"),
3266+
pytest.param(FP4Moe(quant_mode=QuantMode.FP4_NVFP4_NVFP4), id="NvFP4xNvFP4"),
3267+
pytest.param(FP4Moe(quant_mode=QuantMode.FP4_MXFP4_MXFP8), id="MxFP4xMxFP8"),
3268+
pytest.param(FP4Moe(quant_mode=QuantMode.FP4_MXFP4_Bf16), id="MxFP4xBf16"),
3269+
pytest.param(MxInt4BlockScaleMoe(), id="MxInt4xBf16"),
3270+
],
3271+
)
3272+
@pytest.mark.parametrize(
3273+
"routing_config",
3274+
[
3275+
pytest.param(
3276+
{
3277+
"num_experts": 128,
3278+
"top_k": 8,
3279+
"padding": 8,
3280+
"n_groups": None,
3281+
"top_k_groups": None,
3282+
"routed_scaling": None,
3283+
"has_routing_bias": False,
3284+
"routing_method_type": RoutingMethodType.Sigmoid,
3285+
"compatible_moe_impls": [
3286+
FP8PerTensorMoe,
3287+
FP8BlockScaleMoe,
3288+
FP4Moe,
3289+
BF16Moe,
3290+
MxInt4BlockScaleMoe,
3291+
],
3292+
"compatible_intermediate_size": [384, 768, 1024],
3293+
"enable_autotune": True,
3294+
},
3295+
id="Sigmoid_128e_top8",
3296+
),
3297+
],
3298+
)
3299+
@pytest.mark.parametrize(
3300+
"weight_processing",
3301+
[
3302+
pytest.param(
3303+
{
3304+
"use_shuffled_weight": False,
3305+
"layout": WeightLayout.MajorK,
3306+
"compatible_moe_impls": [FP8BlockScaleMoe],
3307+
},
3308+
id="NoShuffle_MajorK",
3309+
),
3310+
pytest.param(
3311+
{
3312+
"use_shuffled_weight": True,
3313+
"layout": WeightLayout.MajorK,
3314+
"compatible_moe_impls": [FP4Moe, FP8PerTensorMoe, FP8BlockScaleMoe],
3315+
},
3316+
id="Shuffled_MajorK",
3317+
),
3318+
pytest.param(
3319+
{
3320+
"use_shuffled_weight": True,
3321+
"layout": WeightLayout.BlockMajorK,
3322+
"compatible_moe_impls": [
3323+
FP8BlockScaleMoe,
3324+
BF16Moe,
3325+
MxInt4BlockScaleMoe,
3326+
],
3327+
},
3328+
id="Shuffled_BlockMajorK",
3329+
),
3330+
],
3331+
)
3332+
@pytest.mark.parametrize(
3333+
"activation_type",
3334+
[
3335+
pytest.param(ActivationType.Swiglu.value, id="Swiglu"),
3336+
pytest.param(ActivationType.Geglu.value, id="Geglu"),
3337+
],
3338+
)
3339+
def test_sigmoid_routing(
3340+
num_tokens,
3341+
hidden_size,
3342+
intermediate_size,
3343+
moe_impl,
3344+
routing_config,
3345+
weight_processing,
3346+
activation_type,
3347+
cache_permute_indices,
3348+
):
3349+
"""Test Sigmoid routing configurations (Sigmoid -> TopK, no renormalization)."""
3350+
run_moe_test(
3351+
num_tokens,
3352+
hidden_size,
3353+
intermediate_size,
3354+
moe_impl,
3355+
routing_config,
3356+
weight_processing,
3357+
activation_type,
3358+
cache_permute_indices,
3359+
)
3360+
3361+
32453362
# Test: DeepSeekV3 routing
32463363
@pytest.mark.parametrize("num_tokens", [8, 768, 3072])
32473364
@pytest.mark.parametrize("hidden_size", [1024])

0 commit comments

Comments (0)