Skip to content

Commit fa4db31

Browse files
authored
Fix bench_mlp.py (#9919)
Fixes API issues in the bench_mlp.py script. `python/triton_kernels/bench/bench_mlp.py` no longer ran with current Triton code. Running: ``` torchrun --nproc-per-node=1 python/triton_kernels/bench/bench_mlp.py ``` fails with ``` [rank0]: Traceback (most recent call last): [rank0]: File "/workspace/triton-source/python/triton_kernels/bench/bench_mlp.py", line 230, in <module> [rank0]: roofline_mlp(batch_sizes, 5760, 5760, 128, 4, dtypes[0], dtypes[1], ep, name="mlp_moe") [rank0]: File "/workspace/triton-source/python/triton_kernels/bench/bench_mlp.py", line 194, in roofline_mlp [rank0]: csv_path = roofline.compute_roofline(dim1, dim2, n_expts_tot, n_expts_act, parse_dtype(x_dtype), [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank0]: File "/usr/local/lib/python3.12/dist-packages/triton_kernels/roofline.py", line 73, in compute_roofline [rank0]: perf = inject_proxy_and_call(val, args, kwargs) [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank0]: File "/usr/local/lib/python3.12/dist-packages/triton_kernels/roofline.py", line 64, in inject_proxy_and_call [rank0]: return bench_fn(*args_list, **kwargs) [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank0]: File "/workspace/triton-source/python/triton_kernels/bench/bench_mlp.py", line 100, in bench_mlp [rank0]: symm_mem_pool = SymmetricMemoryPool() [rank0]: ^^^^^^^^^^^^^^^^^^^^^ [rank0]: TypeError: SymmetricMemoryPool.__init__() missing 1 required positional argument: 'mesh' E0403 20:14:38.021000 2225 torch/distributed/elastic/multiprocessing/api.py:988] failed (exitcode: 1) local_rank: 0 (pid: 2258) of binary: /usr/bin/python3 Traceback (most recent call last): File "/usr/local/bin/torchrun", line 6, in <module> sys.exit(main()) ^^^^^^ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 367, in wrapper return f(*args, **kwargs) ^^^^^^^^^^^^^^^^^^ File 
"/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 1016, in main run(args) File "/usr/local/lib/python3.12/dist-packages/torch/distributed/run.py", line 1007, in run elastic_launch( File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 184, in __call__ return launch_agent(self._config, self._entrypoint, list(args)) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/torch/distributed/launcher/api.py", line 332, in launch_agent raise ChildFailedError( torch.distributed.elastic.multiprocessing.errors.ChildFailedError: ============================================================ python/triton_kernels/bench/bench_mlp.py FAILED ------------------------------------------------------------ Failures: <NO_OTHER_FAILURES> ------------------------------------------------------------ Root Cause (first observed failure): [0]: time : 2026-04-03_20:14:38 host : ab3ee0d0c408 rank : 0 (local_rank: 0) exitcode : 1 (pid: 2258) error_file: <N/A> traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ```
1 parent eb5efe2 commit fa4db31

File tree

2 files changed

+9
-10
lines changed

2 files changed

+9
-10
lines changed

python/triton_kernels/bench/bench_mlp.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ def bench_mlp(batch_per_expt, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_d
116116
n_expts_act=n_expts_act,
117117
n_expts_tot=n_expts_tot,
118118
dtype=x_dtype,
119-
device=torch.cuda.current_device(),
119+
device=torch.device(dev),
120120
)
121121

122122
# -- init prameters --
@@ -137,10 +137,10 @@ def bench_mlp(batch_per_expt, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_d
137137
if w_dtype == FP4:
138138
num_warps = 4 if batch <= 512 else 8
139139
value_layout = layout.make_default_matmul_mxfp4_w_layout(
140-
mx_axis=1,
140+
mx_axis=-2,
141141
allow_blackwell_value_shuffle=shuffle_mx4,
142142
)
143-
scale_layout = layout.make_default_matmul_mxfp4_w_scale_layout(mx_axis=1, num_warps=num_warps)
143+
scale_layout = layout.make_default_matmul_mxfp4_w_scale_layout(mx_axis=-2, num_warps=num_warps)
144144
opt1 = {
145145
"value_layout": value_layout,
146146
"scale_layout": scale_layout,
@@ -187,7 +187,8 @@ def bench_mlp(batch_per_expt, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_d
187187
fc2_constraints["num_stages"] = num_stages_fc2
188188

189189
fpath = Path(f"profile_{rank}")
190-
# warmup
190+
# Compile and warm up outside the profiler so subsequent profiled launches
191+
# retain launch metadata needed by roofline.parse_profile.
191192
run_mlp(x_dp_local_bf16, x_dp_local_fp8, #
192193
wg_global, bg_global, pcg, #
193194
w1_ep_local, b1_ep_local, pc1, act1, #

python/triton_kernels/bench/bench_utils.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,8 @@ def _quantize_weight(w, dtype, **opt):
2626
assert dtype == "mx4", f"{dtype=}"
2727
w, w_scale = downcast_to_mxfp(w.to(torch.bfloat16), torch.uint8, axis=1)
2828
if opt:
29-
w = convert_layout(wrap_torch_tensor(w, dtype=FP4), opt["value_layout"], **opt["value_layout_opts"])
30-
w_scale = convert_layout(wrap_torch_tensor(w_scale), opt["scale_layout"], **opt["scale_layout_opts"])
29+
w = convert_layout(wrap_torch_tensor(w, dtype=FP4), opt["value_layout"])
30+
w_scale = convert_layout(wrap_torch_tensor(w_scale), opt["scale_layout"])
3131
return w, InFlexData(), w_scale
3232

3333

@@ -53,13 +53,11 @@ def _make_mx4_quantization_opts(batch: int, w_dtype: str) -> dict:
5353
if w_dtype != "mx4" or is_hip():
5454
return {}
5555
num_warps = 4 if batch <= 512 and cuda_capability_geq(10, 0) else 8
56-
value_layout, value_layout_opts = layout.make_default_matmul_mxfp4_w_layout(mx_axis=1)
57-
scale_layout, scale_layout_opts = layout.make_default_matmul_mxfp4_w_scale_layout(mx_axis=1, num_warps=num_warps)
56+
value_layout = layout.make_default_matmul_mxfp4_w_layout(mx_axis=-2)
57+
scale_layout = layout.make_default_matmul_mxfp4_w_scale_layout(mx_axis=-2, num_warps=num_warps)
5858
return {
5959
"value_layout": value_layout,
60-
"value_layout_opts": value_layout_opts,
6160
"scale_layout": scale_layout,
62-
"scale_layout_opts": scale_layout_opts,
6361
}
6462

6563

0 commit comments

Comments
 (0)