Skip to content

Commit 2d20085

Browse files
[FusedMoE] fix topk>1 acc issue and support different activation (vllm-project#66)
* fix topk>1 acc issue Signed-off-by: Ma, Liangliang <liangliang.ma@intel.com> * format Signed-off-by: Ma, Liangliang <liangliang.ma@intel.com> * Update fused_moe_interface.py * add activation types support Signed-off-by: Ma, Liangliang <liangliang.ma@intel.com> * fix test Signed-off-by: Ma, Liangliang <liangliang.ma@intel.com> --------- Signed-off-by: Ma, Liangliang <liangliang.ma@intel.com>
1 parent 2d03d25 commit 2d20085

2 files changed

Lines changed: 31 additions & 22 deletions

File tree

tests/fused_moe/test_fused_moe.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -82,12 +82,7 @@ def test_grouped_gemm(m, n, k, e, topk, dtype, has_bias):
8282
pre_token_sum += cur_token_num
8383
ref = torch.cat(ref, dim=0)
8484

85-
try:
86-
torch.testing.assert_close(output, ref, rtol=1e-2, atol=1e-2)
87-
print("a and b close enough")
88-
except AssertionError as e:
89-
print("a and b diffs")
90-
print(e)
85+
torch.testing.assert_close(output, ref, rtol=2e-2, atol=1e-2)
9186

9287

9388
def ref_fused_moe(x, w13, w13_bias, w2, w2_bias, flat_expert_weights,

vllm_xpu_kernels/fused_moe_interface.py

Lines changed: 30 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import torch
33

44
try:
5+
from . import _C # noqa: F401
56
from . import _xpu_C # noqa: F401
67
FUSEDMOE_UNAVAILABLE_REASON = None
78
FUSEDMOE_AVAILABLE = True
@@ -57,6 +58,10 @@ def xpu_fused_moe(hidden_states, w13, w13_bias, w2, w2_bias, topk_weights,
5758
topk_ids, n_experts_per_token, activation, num_experts):
5859

5960
output = torch.zeros_like(hidden_states)
61+
if w13.is_contiguous():
62+
# transpose and replace original data once
63+
w13.data = w13.transpose(-1, -2).contiguous().transpose(-1, -2)
64+
w2.data = w2.transpose(-1, -2).contiguous().transpose(-1, -2)
6065

6166
# TODO: will all integrated in Cpp func. Temporary expose before gemm fusion
6267
num_rows, hidden_size = list(hidden_states.shape)
@@ -106,6 +111,8 @@ def config_ws(name, size):
106111
workspace = torch.zeros(map_offset,
107112
dtype=torch.uint8,
108113
device=hidden_states.device)
114+
if topk_ids.dtype == torch.int32:
115+
topk_ids = topk_ids.to(torch.int64)
109116
torch.ops._xpu_C.fused_moe(output=output,
110117
input=hidden_states,
111118
token_selected_experts=topk_ids,
@@ -143,13 +150,12 @@ def config_ws(name, size):
143150
w2_bias = w2_bias.repeat_interleave(expert_token_count,
144151
dim=0).float()
145152
expert_token_count = expert_token_count.cpu()
146-
147153
gemm1_output = torch.empty((num_moe_inputs, 2 * inter_size),
148154
dtype=hidden_states.dtype,
149155
device=hidden_states.device)
150156

151157
########### gemm1 ##################
152-
input_B = w13.transpose(-1, -2).contiguous().transpose(-1, -2)
158+
input_B = w13
153159

154160
torch.ops._xpu_C.cutlass_grouped_gemm(
155161
ptr_A=gemm1_input,
@@ -163,13 +169,21 @@ def config_ws(name, size):
163169
groups=num_experts_per_node)
164170

165171
# act
166-
gate, up_ = torch.split(gemm1_output, inter_size, dim=1)
167-
act = torch.nn.SiLU()
168-
act_output = act(gate) * up_
172+
act_output = torch.empty((num_moe_inputs, inter_size),
173+
dtype=gemm1_output.dtype,
174+
device=gemm1_output.device)
175+
if activation == "silu":
176+
torch.ops._C.silu_and_mul(act_output, gemm1_output)
177+
elif activation == "gelu":
178+
torch.ops._C.gelu_and_mul(act_output, gemm1_output)
179+
elif activation == "swigluoai":
180+
torch.ops._C.swigluoai_and_mul(act_output, gemm1_output, 1.702, 7.0)
181+
else:
182+
raise ValueError(f"Unsupported FusedMoe activation: {activation}.")
169183

170184
########### gemm2 ##################
171185
input_A = act_output.contiguous()
172-
input_B = w2.transpose(-1, -2).contiguous().transpose(-1, -2)
186+
input_B = w2
173187
gemm2_output = torch.empty((num_moe_inputs, hidden_size),
174188
dtype=hidden_states.dtype,
175189
device=hidden_states.device)
@@ -184,23 +198,23 @@ def config_ws(name, size):
184198
K=inter_size,
185199
groups=num_experts_per_node)
186200

187-
topk_weights = topk_weights.view(-1, 1)
188201
expert_cache = output
189202

190-
for expert_id, end_idx in enumerate(expert_first_token_offset):
191-
start_idx = 0 if expert_id == 0 else expert_first_token_offset[
192-
expert_id - 1]
203+
iter_for_weight_apply = expert_first_token_offset[1:]
204+
for expert_id, end_idx in enumerate(iter_for_weight_apply):
205+
start_idx = 0 if expert_id == 0 else iter_for_weight_apply[expert_id -
206+
1]
193207
if start_idx == end_idx:
194208
continue
195209

196-
exp_token_idxs = permuted_row_to_unpermuted_row[
197-
start_idx:end_idx] % num_rows
210+
exp_token_idxs = permuted_row_to_unpermuted_row[start_idx:end_idx]
211+
scores_token_ids = exp_token_idxs % num_rows
212+
scores_k_slot = exp_token_idxs // num_rows
213+
scores = topk_weights[scores_token_ids, scores_k_slot]
198214
expert_out = gemm2_output[start_idx:end_idx]
199-
expert_out.mul_(
200-
topk_weights[permuted_row_to_unpermuted_row[start_idx:end_idx] %
201-
num_rows])
215+
expert_out.mul_(scores.view(-1, 1))
202216
expert_cache.scatter_reduce_(0,
203-
exp_token_idxs.view(-1, 1).repeat(
217+
scores_token_ids.view(-1, 1).repeat(
204218
1, hidden_size),
205219
expert_out,
206220
reduce='sum')

0 commit comments

Comments (0)