
Commit f2895fa

[Kernels] Enable persistent matmul for fp32 inputs (#9393)
1 parent 69d5bc2 commit f2895fa

5 files changed: 22 additions & 4 deletions


python/triton_kernels/triton_kernels/matmul_details/_common.py

Lines changed: 1 addition & 0 deletions

@@ -9,6 +9,7 @@
 @triton.constexpr_function
 def get_scaled_dot_format_string(dtype: tl.dtype):
     mapping = {
+        tl.float32: "fp32",
         tl.float16: "fp16",
         tl.bfloat16: "bf16",
         tl.uint8: "e2m1",

python/triton_kernels/triton_kernels/matmul_details/_matmul.py

Lines changed: 2 additions & 1 deletion

@@ -20,7 +20,8 @@

 @triton.jit
 def round_f32_to_tf32(x: tl.tensor):
-    ASM: tl.constexpr = "cvt.rna.tf32.f32 $0, $1;"
+    # use cvt.rn on Hopper+ to match the rounding of TMA.
+    ASM: tl.constexpr = "cvt.rn.tf32.f32 $0, $1;" if cuda_capability_geq(9, 0) else "cvt.rna.tf32.f32 $0, $1;"
     return tl.inline_asm_elementwise(ASM, "=r, r", [x], dtype=tl.float32, is_pure=True, pack=1)

 _matmul_repr = make_matmul_repr("_matmul", [0, 1, 2])
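What tf32 rounding does to an fp32 value, and where the two PTX modes disagree, can be reproduced on the host. The sketch below is not part of the commit; the helper names are made up, it only handles finite values, and it simply mirrors the round-to-nearest-even ("rn") versus ties-away-from-zero ("rna") choice that the ASM string switches on.

import struct

def tf32_round_nearest_even(x: float) -> float:
    # Reinterpret the fp32 value as raw bits; tf32 keeps 10 mantissa bits,
    # so the low 13 bits of the fp32 mantissa are rounded away.
    bits = struct.unpack("<I", struct.pack("<f", x))[0]
    keep_lsb = (bits >> 13) & 1                      # LSB of the surviving mantissa (ties-to-even)
    bits = (bits + 0x0FFF + keep_lsb) & 0xFFFFE000   # round, then zero the 13 discarded bits
    return struct.unpack("<f", struct.pack("<I", bits & 0xFFFFFFFF))[0]

def tf32_round_ties_away(x: float) -> float:
    bits = struct.unpack("<I", struct.pack("<f", x))[0]
    bits = (bits + 0x1000) & 0xFFFFE000              # add half a tf32 ulp: ties round away from zero
    return struct.unpack("<f", struct.pack("<I", bits & 0xFFFFFFFF))[0]

# A tie in the discarded bits is where the two modes differ:
x = struct.unpack("<f", struct.pack("<I", 0x3F801000))[0]  # discarded bits are exactly half an ulp
print(tf32_round_nearest_even(x), tf32_round_ties_away(x))  # 1.0 1.0009765625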

python/triton_kernels/triton_kernels/matmul_details/_p_matmul.py

Lines changed: 9 additions & 1 deletion

@@ -46,6 +46,12 @@ def _load_writeback_idx_and_mask(WriteBackIndx, writeback_size, offs, mask):
     return (offs, mask)


+@triton.jit
+def round_f32_to_tf32(x: tl.tensor):
+    ASM: tl.constexpr = "cvt.rn.tf32.f32 $0, $1;" if cuda_capability_geq(9, 0) else "cvt.rna.tf32.f32 $0, $1;"
+    return tl.inline_asm_elementwise(ASM, "=r, r", [x], dtype=tl.float32, is_pure=True, pack=1)
+
+
 _matmul_repr = make_matmul_repr("_p_matmul", [0, 1, 2])
 @triton.jit(do_not_specialize=["TOKENS_PER_EXPT_FOR_ANNOTATION"],
             repr=_matmul_repr, launch_metadata=matmul_launch_metadata)

@@ -312,7 +318,9 @@ def _p_matmul(
             x = tl.load(XPtrs)
         else:
             x = tl.load(XPtrs, mask=mask_k[None, :], other=0.0)
-
+        if x.dtype == tl.float32 and ALLOW_TF32:
+            # since data are not loaded from TMA we need to explicitly round to tf32.
+            x = round_f32_to_tf32(x)
     # --- load x_scale ---
     x_format: tl.constexpr = get_scaled_dot_format_string(x.dtype)
     if is_x_microscaled:
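The second hunk only touches the non-TMA branch: per the commit's comments, the TMA path already delivers tf32-rounded data, so explicit rounding is needed only for fp32 tiles fetched with a plain tl.load while ALLOW_TF32 is set. A hypothetical one-line restatement of that gating rule (not code from the repo):

def needs_explicit_tf32_round(loaded_via_tma: bool, dtype_is_fp32: bool, allow_tf32: bool) -> bool:
    # Round on the host-managed load path only; TMA loads are assumed already rounded.
    return dtype_is_fp32 and allow_tf32 and not loaded_via_tma

assert needs_explicit_tf32_round(loaded_via_tma=False, dtype_is_fp32=True, allow_tf32=True)
assert not needs_explicit_tf32_round(loaded_via_tma=True, dtype_is_fp32=True, allow_tf32=True)
assert not needs_explicit_tf32_round(loaded_via_tma=False, dtype_is_fp32=True, allow_tf32=False)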

python/triton_kernels/triton_kernels/matmul_details/opt_flags.py

Lines changed: 1 addition & 1 deletion

@@ -248,7 +248,7 @@ def _is_layout_strided(layout: Layout | None) -> bool:
         is_persistent = True
     else:
         has_simple_epilogue = precision_config.max_num_imprecise_acc is None
-        is_persistent = supports_persistent and has_simple_epilogue and (tiles_per_sm >= 2.0 or lhs_dtype.bitwidth <= 8) and out_dtype.bitwidth < 32
+        is_persistent = supports_persistent and has_simple_epilogue and (tiles_per_sm >= 2.0 or lhs_dtype.bitwidth <= 8) and (out_dtype.bitwidth < 32 or lhs_dtype.bitwidth == 32 or rhs_dtype.bitwidth == 32)
     # TMA is slower for batched matmuls with small m/n/k.
     if m * n * k < 131072:
         is_persistent = False
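Restated as plain Python (a sketch with hypothetical scalar arguments, not the repo's actual code), the change is that 32-bit lhs/rhs dtypes now satisfy the dtype check even when the output is also 32-bit, which is what lets fp32 matmuls take the persistent path:

def may_be_persistent(supports_persistent: bool, has_simple_epilogue: bool,
                      tiles_per_sm: float, lhs_bits: int, rhs_bits: int, out_bits: int) -> bool:
    enough_work = tiles_per_sm >= 2.0 or lhs_bits <= 8
    # Before this commit only out_bits < 32 qualified; 32-bit (fp32) inputs now also opt in.
    dtype_ok = out_bits < 32 or lhs_bits == 32 or rhs_bits == 32
    return supports_persistent and has_simple_epilogue and enough_work and dtype_ok

# fp32 x fp32 -> fp32 with enough tiles per SM is now eligible for the persistent kernel:
print(may_be_persistent(True, True, tiles_per_sm=4.0, lhs_bits=32, rhs_bits=32, out_bits=32))  # True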

python/triton_kernels/triton_kernels/matmul_details/opt_flags_details/opt_flags_nvidia.py

Lines changed: 9 additions & 1 deletion

@@ -2,7 +2,7 @@
 import triton
 from triton_kernels import target_info
 from triton_kernels.numerics_details.mxfp_details._downcast_to_mxfp import MXFP_BLOCK_SIZE
-from triton_kernels.tensor import FP4, Tensor, FP16, BF16
+from triton_kernels.tensor import FP4, FP16, FP32, BF16, Tensor
 from triton_kernels.tensor_details.layout import HopperMXScaleLayout
 from triton_kernels.tensor_details.layout_details.blackwell_scale import BlackwellActMXScaleLayout


@@ -143,7 +143,15 @@ def compute_num_stages(
     if x_transpose:
         smem_capacity -= block_m * block_k * (max(8, lhs_dtype.bitwidth) // 8)

+    # Persistent fp32 kernels need extra smem headroom (metadata/barriers/TMA state)
+    # that is not fully captured by the simple stage_size model above.
+    if is_persistent and (lhs_dtype == FP32 or rhs_dtype == FP32):
+        smem_capacity -= 32 * 1024
+        smem_capacity = max(smem_capacity, 0)
     num_stages = min(smem_capacity // int(stage_size), 4)
+    # Keep one stage of headroom for persistent fp32 to avoid launch-time OOR.
+    if is_persistent and (lhs_dtype == FP32 or rhs_dtype == FP32):
+        num_stages = min(num_stages, 3)
     if num_stages == 0:
         num_stages = 1
     return num_stages
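Taken together, the two hunks shrink the usable shared-memory budget and cap the pipeline depth whenever a persistent kernel has fp32 inputs. A standalone sketch with made-up numbers (hypothetical function, not the repo's compute_num_stages) shows the effect on the chosen stage count:

def sketch_num_stages(smem_capacity: int, stage_size: int,
                      is_persistent: bool, fp32_inputs: bool) -> int:
    if is_persistent and fp32_inputs:
        # Mirror the hunk: carve out 32 KiB of headroom, never going negative.
        smem_capacity = max(smem_capacity - 32 * 1024, 0)
    num_stages = min(smem_capacity // stage_size, 4)
    if is_persistent and fp32_inputs:
        num_stages = min(num_stages, 3)
    return max(num_stages, 1)

# Example with made-up sizes: 228 KiB of smem and a 48 KiB per-stage footprint.
print(sketch_num_stages(228 * 1024, 48 * 1024, is_persistent=True, fp32_inputs=False))  # 4
print(sketch_num_stages(228 * 1024, 48 * 1024, is_persistent=True, fp32_inputs=True))   # 3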
