[Perf][gpt-oss] Downgrade triton_kernels to v3.5.1 (vllm-project#43135)

mgoin · Liuweixiong0118 · commit bab9e360d68f · 2026-06-01T10:56:03.000+08:00
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Liuweixiong0118 <lwx34158427@gmail.com>
diff --git a/cmake/external_projects/triton_kernels.cmake b/cmake/external_projects/triton_kernels.cmake
@@ -1,6 +1,6 @@
 # Install OpenAI triton_kernels from https://github.com/triton-lang/triton/tree/main/python/triton_kernels
 
-set(DEFAULT_TRITON_KERNELS_TAG "v3.6.0")
+set(DEFAULT_TRITON_KERNELS_TAG "v3.5.1")
 
 # Set TRITON_KERNELS_SRC_DIR for use with local development with vLLM. We expect TRITON_KERNELS_SRC_DIR to
 # be directly set to the triton_kernels python directory.
diff --git a/tests/kernels/quantization/test_mxfp4_triton_ep.py b/tests/kernels/quantization/test_mxfp4_triton_ep.py
@@ -4,35 +4,33 @@
 Tests that triton_kernel_moe_forward correctly applies expert_map
 remapping when expert parallelism (EP) is enabled.
 
-Both EP and non-EP paths use topk + make_routing_data. When expert_map
-is provided, global expert IDs are remapped to local IDs before building
-routing structures.
+When expert_map is provided, triton_kernel_moe_forward runs topk, remaps
+the global expert IDs to local IDs through expert_map, and then builds the
+routing structures with make_routing_data. The expert_map passed downstream
+to triton_kernel_fused_experts is None (already applied).
 """
 
 from unittest.mock import MagicMock, patch
 
-import pytest
 import torch
 
 
 class TestTritonMoeForwardExpertMap:
     """Test that triton_kernel_moe_forward applies expert_map remapping
     when expert_map is provided (EP active)."""
 
-    @pytest.mark.parametrize("expert_map_present", [False, True])
-    def test_routing_path_selection(self, expert_map_present):
-        """Verify that both EP and non-EP paths use topk + make_routing_data,
-        and that expert_map remapping is applied when present."""
-
+    def test_expert_map_remap(self):
         device = "cuda" if torch.cuda.is_available() else "cpu"
-        mock_expert_map = (
-            torch.tensor([0, -1, 1, -1], device=device) if expert_map_present else None
-        )
+        mock_expert_map = torch.tensor([0, -1, 1, -1], device=device)
 
         from vllm.utils.import_utils import import_triton_kernels
 
         import_triton_kernels()
 
+        mock_routing_data = MagicMock()
+        mock_gather = MagicMock()
+        mock_scatter = MagicMock()
+
         with (
             patch("triton_kernels.topk.topk") as mock_topk,
             patch(
@@ -48,14 +46,11 @@ def test_routing_path_selection(self, expert_map_present):
                 triton_kernel_moe_forward,
             )
 
-            mock_routing_data = MagicMock()
-            mock_gather = MagicMock()
-            mock_scatter = MagicMock()
-
             sparse_result = MagicMock()
             sparse_result.indx = torch.tensor([[0, 2]], dtype=torch.int32)
             sparse_result.vals = torch.tensor([[0.6, 0.4]])
             mock_topk.return_value = sparse_result
+
             mock_make_routing.return_value = (
                 mock_routing_data,
                 mock_gather,
@@ -79,14 +74,10 @@ def test_routing_path_selection(self, expert_map_present):
                 expert_map=mock_expert_map,
             )
 
-            # Both paths use topk + make_routing_data
             mock_topk.assert_called_once()
             mock_make_routing.assert_called_once()
 
-            if expert_map_present:
-                # expert_map should be None in the fused_experts call
-                # (already applied)
-                call_kwargs = mock_fused_experts.call_args
-                assert call_kwargs[1].get("expert_map") is None or (
-                    len(call_kwargs[0]) > 0
-                )
+            # expert_map should be None in the fused_experts call
+            # (already applied).
+            call_kwargs = mock_fused_experts.call_args
+            assert call_kwargs[1].get("expert_map") is None or (len(call_kwargs[0]) > 0)
diff --git a/vllm/model_executor/layers/fused_moe/experts/gpt_oss_triton_kernels_moe.py b/vllm/model_executor/layers/fused_moe/experts/gpt_oss_triton_kernels_moe.py
@@ -209,6 +209,16 @@ def _make_bitmatrix_metadata_pow2_safe(nonzero_indx, bitmatrix):
     _bm.make_bitmatrix_metadata = _make_bitmatrix_metadata_pow2_safe
 
 
+# Two API generations of triton_kernels are supported:
+#   - v3.5.1 (the version bundled with vLLM): exposes `routing()` and
+#     `routing_from_bitmatrix()` in triton_kernels.routing; the `Bitmatrix`
+#     constructor takes a `scratchpad` argument.
+#   - v3.6.0+: removes the `routing` module in favor of a `SparseMatrix`
+#     based path, and adds a `dtype=BIT` kwarg to `Bitmatrix`. Used only
+#     when the user has triton_kernels installed system-wide at v3.6.0+.
+#
+# `use_legacy_triton_kernels` selects between them at import time based on
+# whether `SparseMatrix` is importable.
 use_legacy_triton_kernels = False
 
 if has_triton_kernels():
@@ -233,11 +243,10 @@ def _make_bitmatrix_metadata_pow2_safe(nonzero_indx, bitmatrix):
                 make_ragged_tensor_metadata,
             )
         except ImportError:
-            if current_platform.is_rocm():
-                logger.warning_once("Using legacy triton_kernels on ROCm")
-                use_legacy_triton_kernels = True
-            else:
-                raise
+            # TODO(mgoin): drop the v3.5.1 pin and remove this fallback once
+            # the gpt-oss perf regression in v3.6.0+ is resolved upstream.
+            # Tracking: https://github.com/triton-lang/triton/issues/9969
+            use_legacy_triton_kernels = True
         if not use_legacy_triton_kernels:
             _patch_make_bitmatrix_metadata()
     except (AttributeError, ImportError) as e:
@@ -311,38 +320,54 @@ def triton_kernel_moe_forward(
     unpadded_N_w2=None,
     unpadded_K_w2=None,
 ) -> torch.Tensor:
-    from triton_kernels.topk import topk as topk_fn
-
     sm_first = not renormalize
-    logits = gating_output
-    if sm_first:
-        logits = torch.softmax(logits, dim=-1)
-    topk_result = topk_fn(logits, topk, apply_softmax=not sm_first)
-    # topk may return a tuple (vals, indx, bitmatrix) or a
-    # SparseMatrix depending on the triton_kernels version.
-    if isinstance(topk_result, tuple):
-        topk_weights, topk_ids_raw, _ = topk_result
-    else:
-        topk_weights = topk_result.vals
-        topk_ids_raw = topk_result.indx
 
-    if expert_map is not None:
-        # topk_ids_raw contains global expert IDs - remap to local.
-        topk_ids = expert_map[topk_ids_raw.to(torch.long)]
-        local_num_experts = w1.shape[0]
-        routing_data, gather_idx, scatter_idx = make_routing_data(
-            topk_ids, topk_weights, local_num_experts
+    # When no expert map is provided (no EP), call the fused `routing()`
+    # kernel directly. It combines softmax, topk, bitmatrix packing, and
+    # routing-metadata construction in a single launch, instead of the
+    # three separate kernels used by the generic path below.
+    # Only available in the legacy (v3.5.1) API; the v3.6.0+ path inlines
+    # equivalent logic via SparseMatrix in `make_routing_data`.
+    if use_legacy_triton_kernels and expert_map is None:
+        from triton_kernels.routing import routing as fused_routing
+
+        routing_data, gather_idx, scatter_idx = fused_routing(
+            gating_output, topk, sm_first=sm_first
         )
-        # expert_map already applied; pass None downstream.
         effective_expert_map = None
-        effective_global_num_experts = local_num_experts
-    else:
-        topk_ids = topk_ids_raw.to(torch.long)
-        routing_data, gather_idx, scatter_idx = make_routing_data(
-            topk_ids, topk_weights, gating_output.shape[-1]
-        )
-        effective_expert_map = expert_map
         effective_global_num_experts = global_num_experts
+    else:
+        from triton_kernels.topk import topk as topk_fn
+
+        logits = gating_output
+        if sm_first:
+            logits = torch.softmax(logits, dim=-1)
+        topk_result = topk_fn(logits, topk, apply_softmax=not sm_first)
+        # topk may return a tuple (vals, indx, bitmatrix) or a
+        # SparseMatrix depending on the triton_kernels version.
+        if isinstance(topk_result, tuple):
+            topk_weights, topk_ids_raw, _ = topk_result
+        else:
+            topk_weights = topk_result.vals
+            topk_ids_raw = topk_result.indx
+
+        if expert_map is not None:
+            # topk_ids_raw contains global expert IDs - remap to local.
+            topk_ids = expert_map[topk_ids_raw.to(torch.long)]
+            local_num_experts = w1.shape[0]
+            routing_data, gather_idx, scatter_idx = make_routing_data(
+                topk_ids, topk_weights, local_num_experts
+            )
+            # expert_map already applied; pass None downstream.
+            effective_expert_map = None
+            effective_global_num_experts = local_num_experts
+        else:
+            topk_ids = topk_ids_raw.to(torch.long)
+            routing_data, gather_idx, scatter_idx = make_routing_data(
+                topk_ids, topk_weights, gating_output.shape[-1]
+            )
+            effective_expert_map = expert_map
+            effective_global_num_experts = global_num_experts
 
     output = torch.empty_like(hidden_states)
     effective_quant_config = (