diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py b/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
index f2621473cb49..3467b22ee2d6 100644
--- a/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
@@ -678,11 +678,14 @@ def fused_experts_impl(
                 routed_scaling_factor,
             )
         elif _is_xpu:
-            moe_sum_reduce(
-                intermediate_cache3.view(*intermediate_cache3.shape),
-                out_hidden_states[begin_chunk_idx:end_chunk_idx],
-                routed_scaling_factor,
-            )
+            if topk_ids.shape[1] == 1 and routed_scaling_factor == 1.0:
+                pass  # we write directly into out_hidden_states
+            else:
+                moe_sum_reduce(
+                    intermediate_cache3.view(*intermediate_cache3.shape),
+                    out_hidden_states[begin_chunk_idx:end_chunk_idx],
+                    routed_scaling_factor,
+                )
         else:
             if _has_vllm_ops:
                 vllm_ops.moe_sum(