
Commit 62d4b4f

PaulZhang12 authored and facebook-github-bot committed
Add L2 Cache optimization to PartitionK matmul kernel
Summary: Implement the L2 cache optimization for the PartitionK matmul kernel, which was previously missing, and tune over GROUP_SIZE_M. Benchmark results from TritonBench for several partitionK values are in the test plan; large partitionK values lead to accuracy issues and poor performance.

TODO: Currently unable to tune on the partitionK parameter. Asked a question in Slack along the lines of the following:

> For context, I am working with a version of the partition-K matmul kernel in Triton, which contains an intermediate tensor of shape (M, N, partitionK) for storing the partial result of each K partition before reduction. I want to be able to autotune on this value of partitionK, allocating the intermediate buffer like the following:

```
def allocate_c_buf(nargs, **kwargs):
    nargs["c_buf_ptr"] = torch.empty(
        (nargs["M"], nargs["N"], nargs["PARTITION_SIZE_K"]),
        device=nargs["c_buf_ptr"].device,
        dtype=nargs["c_buf_ptr"].dtype,
    )
```

> It seems like the Triton autotuner does not use the modified nargs dictionary (https://github.com/triton-lang/triton/blob/main/python/triton/runtime/autotuner.py#L151-L154), just the original args, so the pre_hook doesn't actually do anything here. What is the recommended approach? Can the autotuner code be modified to take the modified version of full_nargs and run the kernel with that?

TL;DR: Triton autotuning appears to only support modifying tensor args in place; there is no way to grow a tensor's memory in place.

Reviewed By: sijiac

Differential Revision: D71368870

fbshipit-source-id: 93312b1763317a670099f528aa6f369717d3104d
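For illustration, a minimal sketch of the pattern the summary describes: attaching the quoted pre_hook to candidate configs so that partitionK could, in principle, vary per config. This is not code from this commit; it assumes a Triton version where triton.Config accepts a pre_hook, the block sizes are illustrative, and PARTITION_SIZE_K mirrors the quoted snippet. Per the TL;DR above, the reallocation inside the hook never reaches the actual kernel launch.

```python
# Sketch only (not part of this commit).
import torch
import triton


def allocate_c_buf(nargs, **kwargs):
    # Re-allocate the intermediate (M, N, partitionK) buffer for the candidate
    # PARTITION_SIZE_K. As noted above, the autotuner keeps passing the original
    # args, so this mutation of nargs never reaches the kernel launch.
    nargs["c_buf_ptr"] = torch.empty(
        (nargs["M"], nargs["N"], nargs["PARTITION_SIZE_K"]),
        device=nargs["c_buf_ptr"].device,
        dtype=nargs["c_buf_ptr"].dtype,
    )


partition_k_candidates = [
    triton.Config(
        {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64,
         "PARTITION_SIZE_K": psk},
        num_stages=4,
        num_warps=4,
        pre_hook=allocate_c_buf,  # hypothetical: one buffer size per config
    )
    for psk in (16, 32, 64)
]
```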
1 parent e5be656 commit 62d4b4f

File tree

1 file changed: +45 -22 lines changed

tritonbench/operators/gemm/partition_k.py (+45 -22)

@@ -4,8 +4,8 @@
 import triton.language as tl


-@triton.autotune(
-    configs=[
+def get_mm_configs():
+    configs = [
         triton.Config(
             {
                 "BLOCK_SIZE_M": 32,
@@ -60,7 +60,27 @@
             num_stages=6,
             num_warps=2,
         ),
-    ],
+    ]
+
+    partition_k_configs = []
+    for config in configs:
+        for GROUP_SIZE_M in [1, 4, 8]:
+            partition_k_configs.append(
+                triton.Config(
+                    {
+                        **config.kwargs,
+                        "GROUP_SIZE_M": GROUP_SIZE_M,
+                    },
+                    num_stages=config.num_stages,
+                    num_warps=config.num_warps,
+                )
+            )
+
+    return partition_k_configs
+
+
+@triton.autotune(
+    configs=get_mm_configs(),
     key=["M", "N", "K", "PK"],
 )
 @triton.jit
@@ -89,6 +109,7 @@ def _matmul_partition_k(
     BLOCK_SIZE_M: tl.constexpr,
     BLOCK_SIZE_N: tl.constexpr,
     BLOCK_SIZE_K: tl.constexpr,  #
+    GROUP_SIZE_M: tl.constexpr,
 ):
     """Kernel for computing the matmul C = A x B.
     A has shape (M, K), B has shape (K, N) and C has shape (M, N)
@@ -97,21 +118,22 @@ def _matmul_partition_k(
     # Map program ids `pid` to the block of C it should compute.
     # This is done in a grouped ordering to promote L2 data reuse.
     # See above `L2 Cache Optimizations` section for details.
-    pid_m = tl.program_id(axis=0)
-    pid_n = tl.program_id(axis=1)
-    pid_pk = tl.program_id(axis=2)
-    # num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-    # num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
-    # num_pid_pk = PK
-    # num_pid_nk = num_pid_n * num_pid_pk
-    # num_pid_in_group = GROUP_SIZE_M * num_pid_nk
-    # group_id = pid // num_pid_in_group
-    # first_pid_m = group_id * GROUP_SIZE_M
-    # group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    # pid_m = first_pid_m + (pid % group_size_m)
-    # pid_nk = (pid % num_pid_in_group) // group_size_m
-    # pid_n = pid_nk // num_pid_n
-    # pid_pk = pid_nk % num_pid_n
+    # pid_m = tl.program_id(axis=0)
+    # pid_n = tl.program_id(axis=1)
+    # pid_pk = tl.program_id(axis=2)
+    pid = tl.program_id(0)
+    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
+    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
+    num_pid_pk = PK
+    num_pid_nk = num_pid_n * num_pid_pk
+    num_pid_in_group = GROUP_SIZE_M * num_pid_nk
+    group_id = pid // num_pid_in_group
+    first_pid_m = group_id * GROUP_SIZE_M
+    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
+    pid_m = first_pid_m + (pid % group_size_m)
+    pid_nk = (pid % num_pid_in_group) // group_size_m
+    pid_n = pid_nk // num_pid_pk
+    pid_pk = pid_nk % num_pid_pk

     # ----------------------------------------------------------
     # Create pointers for the first blocks of A and B.
@@ -198,7 +220,8 @@ def matmul_partition_k(a, b, triton_reduce=False):
     assert a.is_contiguous(), "Matrix A must be contiguous"
     assert b.is_contiguous(), "Matrix B must be contiguous"

-    partitionK = 64
+    # TODO: Tune on this parameter, currently 32 is best performing
+    partitionK = 32

     M, K = a.shape
     K, N = b.shape
@@ -210,9 +233,9 @@ def matmul_partition_k(a, b, triton_reduce=False):
     # 1D launch kernel where each block gets its own program.

     grid = lambda META: (
-        triton.cdiv(M, META["BLOCK_SIZE_M"]),
-        triton.cdiv(N, META["BLOCK_SIZE_N"]),
-        partitionK,
+        triton.cdiv(M, META["BLOCK_SIZE_M"])
+        * triton.cdiv(N, META["BLOCK_SIZE_N"])
+        * partitionK,
     )
     _matmul_partition_k[grid](
         a,
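The core of the change is the grouped pid decomposition in the kernel. A small host-side sketch (plain Python, mirroring the arithmetic added above; the toy sizes are made up) can be used to print which (pid_m, pid_n, pid_pk) tile each flat program id maps to:

```python
# Host-side sketch of the grouped ordering added in this diff (toy sizes).
import math


def tile_for_pid(pid, M, N, PK, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M):
    # Same index math as the kernel, on the host.
    num_pid_m = math.ceil(M / BLOCK_SIZE_M)
    num_pid_n = math.ceil(N / BLOCK_SIZE_N)
    num_pid_pk = PK
    num_pid_nk = num_pid_n * num_pid_pk
    num_pid_in_group = GROUP_SIZE_M * num_pid_nk
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % group_size_m)
    pid_nk = (pid % num_pid_in_group) // group_size_m
    pid_n = pid_nk // num_pid_pk
    pid_pk = pid_nk % num_pid_pk
    return pid_m, pid_n, pid_pk


for pid in range(16):
    print(pid, tile_for_pid(pid, M=128, N=128, PK=2,
                            BLOCK_SIZE_M=32, BLOCK_SIZE_N=32, GROUP_SIZE_M=4))
```

With GROUP_SIZE_M = 4, consecutive program ids keep the same (pid_n, pid_pk) and walk down a group of row blocks before moving to the next column/partition, which is the access pattern the "grouped ordering to promote L2 data reuse" comment refers to.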

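For completeness, a minimal usage sketch of the changed entry point. The import path, fp16 CUDA inputs, and the loose tolerances are assumptions (partition-K changes the reduction order, and the summary notes accuracy issues for large partitionK).

```python
# Usage sketch; import path and tolerances are assumptions.
import torch

from tritonbench.operators.gemm.partition_k import matmul_partition_k

a = torch.randn(512, 4096, device="cuda", dtype=torch.float16)
b = torch.randn(4096, 512, device="cuda", dtype=torch.float16)

c = matmul_partition_k(a, b)  # autotunes over get_mm_configs() on first call
torch.testing.assert_close(c, (a @ b).to(c.dtype), rtol=2e-2, atol=2e-2)
```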