Skip to content

Commit cdcb2bd

Browse files
jananisriram authored and facebook-github-bot committed
Add Diode Inductor max-autotune benchmarks to gemm, addmm, bmm
Summary: Add Inductor max-autotune benchmarking using Diode to TritonBench for the `gemm`, `addmm`, and `bmm` operators. This diff reduces friction in verifying Diode results against the base max-autotune results. NOTE: The TritonBench benchmark currently includes the overhead of setting up and tearing down Diode within the benchmark, i.e. metrics like tflops and latency. Actual Diode benchmarking numbers should be slightly better than what TritonBench reports. Differential Revision: D89398916
1 parent af04af0 commit cdcb2bd

3 files changed

Lines changed: 93 additions & 0 deletions

File tree

tritonbench/operators/addmm/operator.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import argparse
22
import itertools
3+
import logging
34
from typing import Any, Callable, Generator, List, Optional, Tuple
45

56
import torch
@@ -16,6 +17,7 @@
1617
with try_import("HAS_STREAMK"):
1718
from tritonbench.operators.gemm.stream_k import streamk_cuda_matmul
1819

20+
from tritonbench.utils.diode_utils import setup_diode_model, teardown_diode_model
1921
from tritonbench.utils.triton_op import (
2022
BenchmarkOperator,
2123
BenchmarkOperatorMetrics,
@@ -85,6 +87,9 @@
8587

8688
BATCH_SCALING_SHAPES = [(1 << i, 512, 512, False) for i in range(6, 21)]
8789

90+
logger = logging.getLogger(__name__)
91+
logger.setLevel(logging.INFO)
92+
8893

8994
class Operator(BenchmarkOperator):
9095
DEFAULT_METRICS = ["tflops", "best_config"]
@@ -149,6 +154,24 @@ def pt2_addmm_maxautotune(self, a, mat1, mat2) -> Callable:
149154
compiled(a, mat1, mat2)
150155
return lambda: compiled(a, mat1, mat2)
151156

157+
@register_benchmark(enabled=False)
def pt2_addmm_maxautotune_diode(self, a, mat1, mat2) -> Callable:
    """Benchmark PT2 ``torch.addmm`` under max-autotune with Diode config pruning.

    Installs the Diode model as Inductor's autotune-choice handler, compiles
    and warms up ``addmm`` under ``max_autotune``, then tears Diode down so
    later benchmarks in the same process see the default handler again.

    NOTE: Diode setup/teardown happens inside this function, so its overhead
    is included in the reported metrics (tflops, latency); standalone Diode
    numbers should be slightly better.

    Args:
        a: bias input to ``torch.addmm``.
        mat1: left matrix operand.
        mat2: right matrix operand.

    Returns:
        A zero-argument callable invoking the already-compiled kernel.
    """
    torch._dynamo.reset()
    logger.info("[DIODE][TritonBench] Run PT2 addmm Max-Autotune Diode benchmark")
    old_diode_configs = setup_diode_model()
    try:
        with inductor_config.patch(
            max_autotune=True,
            max_autotune_gemm_backends="ATEN,TRITON",
            autotune_num_choices_displayed=None,
        ):

            def f(a, mat1, mat2):
                return torch.addmm(a, mat1, mat2)

            compiled = torch.compile(f, dynamic=False)
            # Warmup call triggers compilation (and Diode-pruned autotuning)
            # here, so the returned lambda measures only the compiled kernel.
            compiled(a, mat1, mat2)
    finally:
        # Restore Diode config and the default Inductor handler even if
        # compilation fails, so subsequent benchmarks are unaffected.
        teardown_diode_model(old_diode_configs)
    return lambda: compiled(a, mat1, mat2)
174+
152175
@register_metric()
153176
def gbps(
154177
self, fn_name: str, example_inputs: Any, metrics: BenchmarkOperatorMetrics

tritonbench/operators/gemm/operator.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import contextlib
33
import csv
44
import itertools
5+
import logging
56
import os
67
from typing import Any, Callable, Generator, List, Optional, Tuple
78

@@ -19,6 +20,7 @@
1920
blackwell_matmul_tma,
2021
blackwell_matmul_tma_persistent,
2122
)
23+
from tritonbench.utils.diode_utils import setup_diode_model, teardown_diode_model
2224
from tritonbench.utils.triton_utils import has_tlx
2325

2426
if has_tlx():
@@ -136,6 +138,9 @@ def _tlx_matmul(*args, **kwargs):
136138

137139
PERSISTENT_TUTORIAL_SHAPES = [(8192, 8192, 1 << k, None) for k in range(9, 15)]
138140

141+
logger = logging.getLogger(__name__)
142+
logger.setLevel(logging.INFO)
143+
139144

140145
@contextlib.contextmanager
141146
def set_env_variable(key, value):
@@ -365,6 +370,27 @@ def pt2_matmul_maxautotune(self, a, b, bias) -> Callable:
365370

366371
return lambda: compiled(a, b)
367372

373+
@register_benchmark(enabled=False)
def pt2_matmul_maxautotune_diode(self, a, b, bias) -> Callable:
    """Benchmark PT2 matmul under max-autotune with Diode config pruning.

    Installs the Diode model as Inductor's autotune-choice handler, compiles
    and warms up the (optionally biased) matmul under ``max_autotune``, then
    tears Diode down so later benchmarks see the default handler again.

    NOTE: Diode setup/teardown happens inside this function, so its overhead
    is included in the reported metrics (tflops, latency); standalone Diode
    numbers should be slightly better.

    Args:
        a: left matrix operand.
        b: right matrix operand.
        bias: optional bias added to the product; skipped when ``None``.

    Returns:
        A zero-argument callable invoking the already-compiled kernel.
    """
    torch._dynamo.reset()
    logger.info("[DIODE][TritonBench] Run PT2 gemm Max-Autotune Diode benchmark")
    old_diode_configs = setup_diode_model()
    try:
        with inductor_config.patch(
            max_autotune=True,
            max_autotune_gemm_backends="ATEN,TRITON",
            autotune_num_choices_displayed=self.inductor_autotune_num_choices_displayed,
        ):
            if bias is not None:

                def f(a, b):
                    return a.matmul(b) + bias

            else:

                def f(a, b):
                    return a.matmul(b)

            compiled = torch.compile(f, dynamic=False)
            # Warmup call triggers compilation (and Diode-pruned autotuning)
            # here, so the returned lambda measures only the compiled kernel.
            compiled(a, b)
    finally:
        # Restore Diode config and the default Inductor handler even if
        # compilation fails, so subsequent benchmarks are unaffected.
        teardown_diode_model(old_diode_configs)
    return lambda: compiled(a, b)
393+
368394
@register_benchmark(enabled=not is_cuda())
369395
def streamk_matmul(self, a, b, bias) -> Callable:
370396
return lambda: (

tritonbench/utils/diode_utils.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
"""Diode (ML model for pruning autotuning configs) utils for TritonBench operators."""
2+
3+
import diode.torch_diode.config as diode_config
4+
from diode.torch_diode.choices import DiodeInductorChoices
5+
from diode.torch_diode.models.triton_gemm.model import (
6+
GEMMModelV2,
7+
MODEL_CONFIGS,
8+
)
9+
from diode.torch_diode.registry import register, get_registry
10+
import logging
11+
from torch._inductor.virtualized import V
12+
from torch._inductor.choices import InductorChoices
13+
14+
DIODE_MODEL_CONFIGS_VERSION = "v3_12_04_2025"
15+
16+
logger = logging.getLogger(__name__)
17+
logger.setLevel(logging.INFO)
18+
19+
def setup_diode_model(
    topk: int = 1, expand_search_space: bool = True
) -> tuple[int, bool]:
    """Install the Diode GEMM model as Inductor's autotune-choice handler.

    Overwrites the module-level Diode config with *topk* /
    *expand_search_space*, registers the pinned GEMM model version, and
    routes Inductor's choice selection through ``DiodeInductorChoices``.

    Args:
        topk: number of autotune configs Diode keeps.
        expand_search_space: whether Diode expands the config search space.

    Returns:
        The previous ``(topk, expand_search_space)`` pair, suitable for
        passing back to ``teardown_diode_model``.
    """
    logger.info("[DIODE][TritonBench] Setup Diode model.")

    previous = (diode_config.topk, diode_config.expand_search_space)
    diode_config.topk = topk
    diode_config.expand_search_space = expand_search_space

    # Register the pinned model version with the Diode registry.
    model: GEMMModelV2 = GEMMModelV2(
        model_config=MODEL_CONFIGS[DIODE_MODEL_CONFIGS_VERSION]
    )
    register(model)

    # From here on, Inductor autotuning choices go through Diode.
    V.set_choices_handler(DiodeInductorChoices())

    return previous
36+
37+
def teardown_diode_model(old_configs: tuple[int, bool]) -> None:
    """Undo ``setup_diode_model``: restore config and the default handler.

    Args:
        old_configs: the ``(topk, expand_search_space)`` pair previously
            returned by ``setup_diode_model``.
    """
    logger.info("[DIODE][TritonBench] Teardown Diode model.")

    diode_config.topk, diode_config.expand_search_space = old_configs
    get_registry().clear()
    # NOTE(review): installs a fresh default InductorChoices rather than
    # restoring whatever handler was active before setup — confirm no other
    # custom choices handler is expected to survive a setup/teardown cycle.
    V.set_choices_handler(InductorChoices())

0 commit comments

Comments
 (0)