@@ -4,6 +4,8 @@
 
 from triton import autotune, cdiv, Config, heuristics, jit, language as tl
 
+from ..triton_matmul_configs import get_full_amd_config_space, init_to_zero
+
 from .matmul_perf_model import early_config_prune, estimate_matmul_time
 
 _ordered_datatypes = [torch.int8, torch.float16, torch.bfloat16, torch.float32]
@@ -31,10 +33,6 @@ def get_higher_dtype(a, b):
     return a
 
 
-def init_to_zero(name):
-    return lambda nargs: nargs[name].zero_()
-
-
 def get_configs_io_bound():
     configs = []
     for num_stages in [2, 3, 4, 5, 6]:
@@ -85,9 +83,10 @@ def get_configs_io_bound():
         else {}
     )
 
-
-@autotune(
-    configs=[
+if os.environ.get("FULL_AUTOTUNING_AMD", "0") == "1" and torch.version.hip is not None:
+    tuning_configs = get_full_amd_config_space(True)
+else:
+    tuning_configs = [
         # basic configs for compute-bound matmuls
         Config(
             {"BLOCK_M": 128, "BLOCK_N": 256, "BLOCK_K": 32, "SPLIT_K": 1, "GROUP_M": 8},
@@ -198,8 +197,11 @@ def get_configs_io_bound():
             num_stages=5,
             num_warps=2,
         ),
-    ]
-    + get_configs_io_bound(),
+    ] + get_configs_io_bound()
+
+
+@autotune(
+    configs=tuning_configs,
     key=["M", "N", "K"],
     prune_configs_by=prune_configs_by,
 )
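For context, both the deleted helper and the new import point at a shared triton_matmul_configs module that is not part of this diff. Below is a minimal sketch of what that module could contain: init_to_zero is copied verbatim from the removed lines, while the body of get_full_amd_config_space (including its parameter name, here called use_split_k, and the output-argument name "C") is an assumption; the diff only guarantees that the function takes one boolean argument and returns configs usable with @autotune.

    # triton_matmul_configs.py -- hypothetical sketch, not the actual module.
    import itertools

    from triton import Config


    def init_to_zero(name):
        # Pre-hook for SPLIT_K > 1 configs: zero the named output buffer before
        # the kernel accumulates partial results into it (body taken from the diff).
        return lambda nargs: nargs[name].zero_()


    def get_full_amd_config_space(use_split_k):
        # Assumed implementation: sweep a broad grid of tile shapes for ROCm.
        # The grid, warp/stage counts, and the "C" argument name are guesses.
        configs = []
        block_sizes = [32, 64, 128, 256]
        split_k_values = [1, 2, 4, 8] if use_split_k else [1]
        for block_m, block_n, block_k in itertools.product(block_sizes, repeat=3):
            for split_k in split_k_values:
                pre_hook = init_to_zero("C") if split_k > 1 else None
                configs.append(
                    Config(
                        {"BLOCK_M": block_m, "BLOCK_N": block_n, "BLOCK_K": block_k,
                         "SPLIT_K": split_k, "GROUP_M": 8},
                        num_stages=2,
                        num_warps=4,
                        pre_hook=pre_hook,
                    )
                )
        return configs

With this change, the expanded search space is only used on a ROCm build of PyTorch (torch.version.hip is not None) and only when the process is launched with FULL_AUTOTUNING_AMD=1; in every other case the hand-written config list plus get_configs_io_bound() is used, exactly as before.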