diff --git a/tests/kernels/moe/test_unquantized_backend_selection.py b/tests/kernels/moe/test_unquantized_backend_selection.py
index bf5a547fe3df..c04994a8718f 100644
--- a/tests/kernels/moe/test_unquantized_backend_selection.py
+++ b/tests/kernels/moe/test_unquantized_backend_selection.py
@@ -133,7 +133,48 @@ def test_select_cuda_flashinfer_cutlass_backend(
         selected_backend = select_unquantized_moe_backend(
             moe_config=moe_config,
             use_ep=True,  # CUTLASS requires EP
-            use_dp=False,  # CUTLASS doesn't support DP
+            use_dp=False,
+        )
+
+    assert selected_backend == UnquantizedMoeBackend.FLASHINFER_CUTLASS
+
+
+@patch(
+    "vllm.model_executor.layers.fused_moe.oracle.unquantized.has_flashinfer",
+    return_value=True,
+)
+@patch(
+    "vllm.model_executor.layers.fused_moe.oracle.unquantized.is_supported_config_trtllm_bf16",
+    return_value=(False, None),
+)
+@pytest.mark.skipif(
+    not current_platform.is_cuda(), reason="Only supported on NVIDIA platforms."
+)
+def test_select_cuda_flashinfer_cutlass_backend_with_dp(
+    mock_has_flashinfer, mock_is_supported_trtllm, monkeypatch
+):
+    """Test CUDA backend selection picks FlashInfer CUTLASS when both
+    DP and EP are enabled. The runner handles DP communication externally
+    so the kernel works correctly with DP+EP."""
+    with patch(
+        "vllm.model_executor.layers.fused_moe.oracle.unquantized.current_platform"
+    ) as mock_platform:
+        mock_platform.is_cuda.return_value = True
+        mock_platform.is_rocm.return_value = False
+        mock_platform.is_cpu.return_value = False
+        mock_platform.is_xpu.return_value = False
+        mock_platform.is_tpu.return_value = False
+        mock_platform.is_out_of_tree.return_value = False
+        mock_platform.has_device_capability.return_value = True  # SM90+
+
+        monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP16", "1")
+
+        moe_config = make_dummy_moe_config()
+
+        selected_backend = select_unquantized_moe_backend(
+            moe_config=moe_config,
+            use_ep=True,
+            use_dp=True,
         )
 
     assert selected_backend == UnquantizedMoeBackend.FLASHINFER_CUTLASS
diff --git a/vllm/model_executor/layers/fused_moe/oracle/unquantized.py b/vllm/model_executor/layers/fused_moe/oracle/unquantized.py
index 9c31da10dd94..9d724dcd961e 100644
--- a/vllm/model_executor/layers/fused_moe/oracle/unquantized.py
+++ b/vllm/model_executor/layers/fused_moe/oracle/unquantized.py
@@ -97,7 +97,6 @@ def _make_log_backend(backend: UnquantizedMoeBackend):
     flashinfer_cutlass_available = (
         has_flashinfer_cutlass_fused_moe()
         and use_ep
-        and (not use_dp)
         and current_platform.has_device_capability(90)
     )
     flashinfer_trtllm_moe_enabled = (
@@ -161,18 +160,13 @@ def _make_log_backend(backend: UnquantizedMoeBackend):
                 "to enable it for better performance.",
                 scope="local",
             )
-        elif use_ep and (not use_dp):
+        elif use_ep:
             logger.info_once(
                 "FlashInfer MoE is available for EP"
                 " but not enabled, consider setting"
                 " VLLM_USE_FLASHINFER_MOE_FP16=1 to enable it.",
                 scope="local",
             )
-        elif use_dp:
-            logger.info_once(
-                "FlashInfer CUTLASS MoE is currently not available for DP.",
-                scope="local",
-            )
         backend = UnquantizedMoeBackend.TRITON
     if current_platform.is_xpu():
         backend = UnquantizedMoeBackend.XPU