Skip to content

Commit 377e7eb

Browse files
remove xpu_fused_moe weights handling (vllm-project#163)
Signed-off-by: mayuyuace <qiming1.zhang@intel.com>
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
1 parent bded432 commit 377e7eb

2 files changed

Lines changed: 13 additions & 11 deletions

File tree

tests/fused_moe/test_fused_moe.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,9 @@ def test_fused_moe(m, n, k, e, topk, dtype, w_dtype, has_bias):
231231
flat_expert_weights, flat_expert_indices, topk,
232232
"silu", e)
233233

234+
w13.data = w13.transpose(-1, -2).contiguous()
235+
w2.data = w2.transpose(-1, -2).contiguous()
236+
234237
output = xpu_fused_moe(hidden_states=a,
235238
w13=w13,
236239
w13_scales=w13_scales,
@@ -562,6 +565,9 @@ def test_fused_moe_ep(m, n, k, e, topk, ep_rank, ep_size, dtype, w_dtype,
562565
expert_start_id = e * ep_rank
563566
expert_end_id = expert_start_id + e
564567

568+
w13.data = w13.transpose(-1, -2).contiguous()
569+
w2.data = w2.transpose(-1, -2).contiguous()
570+
565571
output = xpu_fused_moe(hidden_states=a,
566572
w13=w13[expert_start_id:expert_end_id],
567573
w13_scales=w13_scales[expert_start_id:expert_end_id]

vllm_xpu_kernels/fused_moe_interface.py

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -154,21 +154,17 @@ def xpu_fused_moe(hidden_states,
154154
else:
155155
assert output.shape == hidden_states.shape, \
156156
"output shape must be the same as hidden_states shape"
157-
inter_size = list(w13.shape)[-2] // 2
158-
159-
assert w13.is_contiguous() and w2.is_contiguous()
160157

161158
# 4bits support [E, N, K]
162159
# other types [E, K, N]
163160
if not is_int4 and not is_mxfp4:
164-
if not hasattr(w13, 'xpu_fused_moe'):
165-
w13.data = w13.transpose(-1, -2).contiguous()
166-
w2.data = w2.transpose(-1, -2).contiguous()
167-
w13.xpu_fused_moe = True
168-
w13.inter_size = inter_size
169-
else:
170-
inter_size = w13.inter_size
161+
inter_size = list(w13.shape)[-1] // 2
162+
else:
163+
inter_size = list(w13.shape)[-2] // 2
164+
165+
assert w13.is_contiguous() and w2.is_contiguous()
171166

167+
# FIXME: move this to vllm
172168
if is_int4 and not hasattr(w13, 'xpu_fused_moe'):
173169
w13_tmp = torch.empty_like(w13)
174170
w2_tmp = torch.empty_like(w2)
@@ -257,7 +253,7 @@ def xpu_fused_moe(hidden_states,
257253
torch.ops._C.silu_and_mul(act_output, gemm1_output)
258254
elif activation == "gelu":
259255
torch.ops._C.gelu_and_mul(act_output, gemm1_output)
260-
elif activation == "swigluoai":
256+
elif activation == "swigluoai" or ("SWIGLUOAI" in str(activation)):
261257
torch.ops._C.swigluoai_and_mul(act_output, gemm1_output, 1.702, 7.0)
262258
else:
263259
raise ValueError(f"Unsupported FusedMoe activation: {activation}.")

0 commit comments

Comments (0)