
Commit 7e3982d

PaulZhang12 authored and facebook-github-bot committed

torch.compile reduction + cast

Summary: torch.compile the reduction + cast to fuse the two kernels. PartitionK now performs better than cuBLAS.

Reviewed By: sijiac

Differential Revision: D71483304

fbshipit-source-id: 060d4a8c1be2fe3487876c6b7993e50e7d90fc1a
1 parent b69a816 commit 7e3982d
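For context, a partition-K (split-K) GEMM splits the K dimension across kernel instances, each writing a float32 partial product into a buffer that is reduced afterwards; that reduction + cast is the step this commit compiles. A hypothetical pure-PyTorch sketch of the idea (the function name and `pk` are illustrative, not the Triton kernel itself):

```python
import torch

def matmul_partition_k_reference(a, b, pk=4):
    # Sketch of split-K matmul: partition K into `pk` chunks, compute one
    # partial GEMM per chunk, then reduce the partials and cast back.
    M, K = a.shape
    K2, N = b.shape
    assert K == K2 and K % pk == 0, "K must be divisible by pk"
    chunk = K // pk
    # Accumulate each K-partition's partial product in float32.
    c_buf = torch.stack(
        [
            a[:, i * chunk : (i + 1) * chunk].float()
            @ b[i * chunk : (i + 1) * chunk, :].float()
            for i in range(pk)
        ],
        dim=2,
    )  # shape (M, N, pk)
    # The reduction + cast this commit moves under torch.compile.
    return c_buf.sum(dim=2).to(a.dtype)
```

In the real kernel each partition is a separate Triton program instance writing into `c_buf`; the loop above only mimics the resulting buffer layout.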

File tree

1 file changed: +8 −1 lines changed


tritonbench/operators/gemm/partition_k.py (+8 −1)

```diff
@@ -213,6 +213,13 @@ def _reduce(
     tl.store(c_ptrs, reduced_k)
 
 
+def torch_reduction(c_buf, a):
+    return c_buf.sum(dim=2).to(a.dtype)
+
+
+compiled_reduction = torch.compile(torch_reduction)
+
+
 def matmul_partition_k(a, b, triton_reduce=False):
     # Check constraints.
     assert a.shape[1] == b.shape[0], "Incompatible dimensions"
@@ -276,4 +283,4 @@ def matmul_partition_k(a, b, triton_reduce=False):
         )
         return c
     else:
-        return c_buf.sum(dim=2).to(a.dtype)
+        return compiled_reduction(c_buf, a)
```
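The change can be sketched in isolation: wrapping the sum-then-cast function in torch.compile lets the compiler fuse the reduction and the dtype cast into a single kernel instead of launching one kernel per op (a minimal sketch, assuming `c_buf` is the float32 partial-product buffer of shape `(M, N, PARTITION_K)` produced by the split-K kernel):

```python
import torch

def torch_reduction(c_buf, a):
    # Sum the partial products over the partition-K axis, then cast the
    # float32 accumulator back to the input dtype (e.g. fp16/bf16).
    return c_buf.sum(dim=2).to(a.dtype)

# torch.compile traces the function so the reduction and the cast can be
# fused into one generated kernel; compilation happens lazily on first call.
compiled_reduction = torch.compile(torch_reduction)
```

Defining `compiled_reduction` at module scope, as the diff does, means the compilation cost is paid once on the first invocation and cached for subsequent calls.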
