Skip to content

Commit b2d9ec4

Browse files
authored
[BENCH] fix swiglu routing and simplify args (#6743)
bench_mlp.py was incorrect because it did not pass num_experts to the swiglu kernel. That argument defaulted to zero, causing the kernel to load the wrong slot in the expert data. This commit fixes the call and removes the num_experts argument entirely so the API is more foolproof: the number of experts can be derived from the routing data instead.
1 parent 81f93f2 commit b2d9ec4

3 files changed

Lines changed: 11 additions & 12 deletions

File tree

bench/tests/test_swiglu.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,6 @@ def test_op(M, N, limit, device, alpha=0.5):
3737
# initialize data
3838
x = alloc_rand([n_tokens, N], device=device, dtype=torch.bfloat16)
3939
precision_config = PrecisionConfig(limit=limit)
40-
tri_y = swiglu(x, alpha, precision_config, routing_data, n_expts_tot)
40+
tri_y = swiglu(x, alpha, precision_config, routing_data)
4141
ref_y = swiglu_torch(x, alpha, precision_config)
4242
assert_close(tri_y, ref_y)

bench/triton_bench/swiglu.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ class PrecisionConfig:
2323
class SwiGLU(torch.autograd.Function):
2424

2525
@staticmethod
26-
def forward(ctx, a, alpha, precision_config, routing_data, num_experts):
26+
def forward(ctx, a, alpha, precision_config, routing_data):
2727
N = a.shape[-1]
2828
M = a.numel() // N
2929
assert a.stride()[-1] == 1
@@ -48,9 +48,9 @@ def forward(ctx, a, alpha, precision_config, routing_data, num_experts):
4848
grid = (8 * num_sms, )
4949
else:
5050
grid = (min(M_BLOCKS * N_BLOCKS, 4 * num_sms), )
51-
expt_data = None
51+
n_tokens = None
5252
if routing_data is not None:
53-
expt_data = compute_metadata(routing_data, M, BLOCK_M).buffer
53+
n_tokens = compute_metadata(routing_data, M, BLOCK_M).offs[routing_data.n_expts_tot]
5454
_swiglu[grid](
5555
flex_ctx.out_data.reinterpret(out),
5656
flex_ctx.out_data.expected_scale,
@@ -66,8 +66,7 @@ def forward(ctx, a, alpha, precision_config, routing_data, num_experts):
6666
out.shape[-1],
6767
1,
6868
precision_config.limit,
69-
expt_data,
70-
num_experts,
69+
n_tokens,
7170
BLOCK_M=BLOCK_M,
7271
BLOCK_N=BLOCK_N,
7372
EVEN_N=(N // 2) % 2 == 0,
@@ -81,8 +80,8 @@ def forward(ctx, a, alpha, precision_config, routing_data, num_experts):
8180
return out
8281

8382

84-
def swiglu(a, alpha, precision_config, routing_data=None, num_experts=0):
85-
return SwiGLU.apply(a, alpha, precision_config, routing_data, num_experts)
83+
def swiglu(a, alpha, precision_config, routing_data=None):
84+
return SwiGLU.apply(a, alpha, precision_config, routing_data)
8685

8786

8887
def swiglu_torch(a, alpha, precision_config):

bench/triton_bench/swiglu_details/_swiglu.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -36,10 +36,10 @@ def swiglu_launch_metadata(grid, kernel, args):
3636

3737
@triton.jit(repr=swiglu_repr, launch_metadata=swiglu_launch_metadata)
3838
def _swiglu(Out, OutExpectedScale, OutActualScale, OutChecksumScale, A, AScale, alpha, M, N, stride_am, stride_an,
39-
stride_outm, stride_outn, limit: tl.constexpr, ExptData, NUM_EXPERTS: tl.constexpr, BLOCK_M: tl.constexpr,
40-
BLOCK_N: tl.constexpr, EVEN_N: tl.constexpr, M_BLOCKS, N_BLOCKS, flexpoint_saturate_inf: tl.constexpr):
41-
if ExptData is not None:
42-
M = tl.load(ExptData + 2 * NUM_EXPERTS)
39+
stride_outm, stride_outn, limit: tl.constexpr, NTokens, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr,
40+
EVEN_N: tl.constexpr, M_BLOCKS, N_BLOCKS, flexpoint_saturate_inf: tl.constexpr):
41+
if NTokens is not None:
42+
M = tl.load(NTokens)
4343
M_BLOCKS = (M + BLOCK_M - 1) // BLOCK_M
4444

4545
local_max = tl.full([tl.extra.cuda.num_threads()], 0.0, tl.float32)

0 commit comments

Comments
 (0)