Skip to content

Commit 47d41ab

Browse files
yzhang93 and claude authored
[GPUHeuristics] Improve large GEMM intrinsic selection on CDNA4 (#24115)
Extend the compute-throughput-first intrinsic preference to LargeGemm shapes, preferring MFMA_F32_32x32x16_F16 over MFMA_F32_16x16x32_F16 (4x more output per instruction). Add VGPR pressure cap to prevent spilling when MNT boost sets high tile counts with 32x32 intrinsics. Top GEMM improvements on MI355X: ``` 4096x1024x150000: 2112us -> 1538us (1.37x) 2268x4096x150000: 11359us -> 8529us (1.33x) 1024x4096x150000: 1982us -> 1573us (1.26x) 4096x2048x150000: 4015us -> 3307us (1.21x) 2048x8192x4096: 183us -> 154us (1.19x) ``` Top conv improvements on MI355X (NHWC, fp16): ``` n32 c256 H100xW100 k2376 3x3 wgrad: 7983us -> 6634us (1.20x) n32 c256 H25xW25 k2376 3x3 wgrad: 777us -> 664us (1.17x) n32 c256 H100xW100 k2376 3x3 fwd: 7042us -> 6122us (1.15x) n32 c256 H25xW25 k2376 3x3 fwd: 452us -> 405us (1.12x) n32 c256 H50xW50 k2376 3x3 fwd: 1711us -> 1541us (1.11x) ``` Overall GEMM benchmark: **+6.3%** geomean speedup. Overall Proxy conv benchmark: **+2.5%** geomean speedup. Some regressions exist in K-dominated wgrad shapes due to larger workgroup tiles, but overall improvements outweigh regressions across all benchmarks. --------- Signed-off-by: yzhang93 <zhyuhang88@gmail.com> Co-authored-by: Claude <noreply@anthropic.com>
1 parent 89b536e commit 47d41ab

4 files changed

Lines changed: 73 additions & 26 deletions

File tree

compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp

Lines changed: 48 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -697,7 +697,8 @@ static double computeMNUtilization(const GPUMatmulShapeType &problem,
697697
/// returns true if the lhs is ordered before rhs.
698698
static bool compareIntrinsics(const GPUMatmulShapeType &problem,
699699
const GPUIntrinsicType &lhs,
700-
const GPUIntrinsicType &rhs) {
700+
const GPUIntrinsicType &rhs,
701+
bool preferHighComputeIntrinsic = false) {
701702
// When both M and N need padding, prefer the intrinsic with better M*N
702703
// utilization. This targets grouped convolutions where per-group channels
703704
// are small (e.g., 8x8 problem: 16x16 at 25% util >> 32x32 at 6.25%).
@@ -775,7 +776,7 @@ static bool compareIntrinsics(const GPUMatmulShapeType &problem,
775776
// (compute=8192, area=512) because throughput matters more. Among
776777
// 16x16x32 and 32x32x16 (both area=1024), prefer smaller K (16 vs 32)
777778
// for less operand staging pressure.
778-
if (problem.gemmSize == GemmSizeKind::VeryLargeGemm) {
779+
if (preferHighComputeIntrinsic) {
779780
int64_t lhsCompute = intrinsicCompute(lhs);
780781
int64_t rhsCompute = intrinsicCompute(rhs);
781782
if (lhsCompute != rhsCompute) {
@@ -806,11 +807,12 @@ static bool compareIntrinsics(const GPUMatmulShapeType &problem,
806807

807808
static SmallVector<GPUIntrinsicType>
808809
sortMMAIntrinsics(GPUMatmulShapeType problem,
809-
ArrayRef<GPUIntrinsicType> intrinsics) {
810+
ArrayRef<GPUIntrinsicType> intrinsics,
811+
bool preferHighComputeIntrinsic = false) {
810812
SmallVector<GPUIntrinsicType> sortedIntrinsics(intrinsics);
811813
llvm::stable_sort(sortedIntrinsics, [&](const GPUIntrinsicType &lhs,
812814
const GPUIntrinsicType &rhs) {
813-
return compareIntrinsics(problem, lhs, rhs);
815+
return compareIntrinsics(problem, lhs, rhs, preferHighComputeIntrinsic);
814816
});
815817
return sortedIntrinsics;
816818
}
@@ -834,14 +836,16 @@ static int64_t computeEstimatedWorkgroupCount(const GPUMMAHeuristicSeeds &seeds,
834836
}
835837

836838
/// Adjust M*N tile-count (bestMNTileCountPerSubgroup) seeds based on target
837-
/// hardware and problem characteristics. Three independent adjustments, applied
839+
/// hardware and problem characteristics. Four independent adjustments, applied
838840
/// in order:
839841
/// 1. Baseline (all targets): reduces bestMNTileCountPerSubgroup until the
840842
/// estimated workgroup count fills all CUs.
841843
/// 2. Tile-count boost (when boostMNTileCountPerSubgroup is set): for GEMMs
842844
/// with balanced K, boosts tile count to the architecture-specific target.
843845
/// 3. Utilization guard (when minUtilizationThreshold is set): halves tile
844846
/// count until GPU utilization meets the threshold.
847+
/// 4. VGPR pressure cap: limits MN tile count based on per-thread output
848+
/// register pressure from the selected intrinsic, preventing spilling.
845849
static void adjustSeedsForTarget(GPUMMAHeuristicSeeds &seeds,
846850
const GPUMatmulShapeType &problem,
847851
const GPUIntrinsicType &intrinsic,
@@ -898,6 +902,12 @@ static void adjustSeedsForTarget(GPUMMAHeuristicSeeds &seeds,
898902
std::max(seeds.bestMNTileCountPerSubgroup, boostMNT);
899903
LDBG() << "Boosting MNT to " << seeds.bestMNTileCountPerSubgroup
900904
<< " for balanced large gemm";
905+
// Halve subgroup count to offset the MNT boost, keeping the total
906+
// workgroup resource footprint (threads, LDS) in check for occupancy.
907+
seeds.bestSubgroupCountPerWorkgroup =
908+
std::max<int64_t>(1, seeds.bestSubgroupCountPerWorkgroup / 2);
909+
LDBG() << "Halving subgroup count to "
910+
<< seeds.bestSubgroupCountPerWorkgroup << " to offset MNT boost";
901911
}
902912
}
903913

@@ -928,6 +938,27 @@ static void adjustSeedsForTarget(GPUMMAHeuristicSeeds &seeds,
928938
<< seeds.bestMNTileCountPerSubgroup;
929939
}
930940
}
941+
942+
// Cap per-subgroup MN tile count based on output VGPR pressure from the
943+
// selected intrinsic. Only applies when the MNT boost (step 2) is
944+
// configured, since the boost can push MN tile counts high enough to
945+
// cause spilling with large-output intrinsics (32x32). Capping at 128
946+
// output VGPRs per thread (8 MN tiles for 32x32, 32 for 16x16) prevents
947+
// spilling while preserving the boost for intrinsics that can handle
948+
// higher tile counts.
949+
if (seeds.maxOutputVGPRsPerThread) {
950+
int64_t subgroupSize = target.getPreferredSubgroupSize();
951+
int64_t outputVGPRsPerTile =
952+
(intrinsic.mSizes[0] * intrinsic.nSizes[0]) / subgroupSize;
953+
int64_t maxMNTiles = *seeds.maxOutputVGPRsPerThread / outputVGPRsPerTile;
954+
if (seeds.bestMNTileCountPerSubgroup > maxMNTiles) {
955+
LDBG() << "VGPR cap: reducing bestMNTileCountPerSubgroup from "
956+
<< seeds.bestMNTileCountPerSubgroup << " to " << maxMNTiles
957+
<< " (intrinsic " << intrinsic.mSizes[0] << "x"
958+
<< intrinsic.nSizes[0] << ")";
959+
seeds.bestMNTileCountPerSubgroup = maxMNTiles;
960+
}
961+
}
931962
}
932963

933964
FailureOr<GPUMMASchedule> deduceMMASchedule(
@@ -938,8 +969,19 @@ FailureOr<GPUMMASchedule> deduceMMASchedule(
938969
bool useDirectLoad, int64_t prefetchNumStages, bool mustBeAligned,
939970
bool doCPromotion, int64_t splitReductionTripCnt) {
940971

972+
// Prefer higher-compute intrinsics (e.g., 32x32x16 over 16x16x32) for:
973+
// - VeryLargeGemm: always compute-bound, higher throughput wins.
974+
// - LargeGemm on architectures with MNT boost (e.g., CDNA4): the boost
975+
// indicates the target benefits from larger output tiles. Gated by
976+
// !doCPromotion to avoid regressing addmm shapes that need accumulator
977+
// promotion to shared memory.
978+
bool isLargeGemmWithBoost = problem.gemmSize == GemmSizeKind::LargeGemm &&
979+
seeds.boostMNTileCountPerSubgroup.has_value() &&
980+
!doCPromotion;
981+
bool preferHighComputeIntrinsic =
982+
problem.gemmSize == GemmSizeKind::VeryLargeGemm || isLargeGemmWithBoost;
941983
SmallVector<GPUIntrinsicType> sortedIntrinsics =
942-
sortMMAIntrinsics(problem, intrinsics);
984+
sortMMAIntrinsics(problem, intrinsics, preferHighComputeIntrinsic);
943985

944986
// Compute product of M and N problem sizes to decide if block intrinsics
945987
// should be considered. If both M and N products exceed the threshold, skip

compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,10 @@ struct GPUMMAHeuristicSeeds {
102102
// per workgroup), which can improve performance when the GPU has enough work
103103
// to stay saturated.
104104
std::optional<int64_t> boostMNTileCountPerSubgroup = std::nullopt;
105+
// Maximum output VGPRs per thread for the VGPR pressure cap. When set,
106+
// adjustSeedsForTarget will reduce bestMNTileCountPerSubgroup to keep
107+
// per-thread output register pressure within this limit.
108+
std::optional<int64_t> maxOutputVGPRsPerThread = std::nullopt;
105109
};
106110

107111
struct GPUMMASchedule {

compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1351,9 +1351,10 @@ static constexpr ArchSeedSet kCDNA4Seeds = {
13511351
/*gemm=*/{
13521352
/*SmallGemm=*/ {2, 2, 4, 2 * kCacheLineSizeBits},
13531353
/*MediumGemm=*/ {4, 8, 4, 2 * kCacheLineSizeBits},
1354-
/*LargeGemm=*/ {4, 16, 2, kCacheLineSizeBits / 2,
1354+
/*LargeGemm=*/ {8, 16, 2, kCacheLineSizeBits / 2,
13551355
/*minUtilizationThreshold=*/0.50,
1356-
/*boostMNTileCountPerSubgroup=*/32},
1356+
/*boostMNTileCountPerSubgroup=*/32,
1357+
/*maxOutputVGPRsPerThread=*/128},
13571358
/*VeryLargeGemm=*/ {4, 16, 2, kCacheLineSizeBits / 2},
13581359
},
13591360
/*scaledGemm=*/{

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse_gfx950.mlir

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -412,13 +412,13 @@ func.func @matmul_large_symmetric_f16(
412412

413413
// MI355X-LABEL: func.func @matmul_large_symmetric_f16
414414
// MI355X-SAME: #iree_codegen.translation_info<pipeline = #iree_gpu.pipeline<TileAndFuse>
415-
// MI355X-SAME: workgroup_size = [256, 1, 1] subgroup_size = 64
415+
// MI355X-SAME: workgroup_size = [512, 1, 1] subgroup_size = 64
416416
// MI355X: linalg.matmul {{.*}}lowering_config = #iree_gpu.lowering_config
417-
// MI355X-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F16>
417+
// MI355X-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_32x32x16_F16>
418418
// MI355X-SAME: promote_operands = [0, 1]
419-
// MI355X-SAME: reduction = [0, 0, 1]
420-
// MI355X-SAME: subgroup = [4, 8, 0]
421-
// MI355X-SAME: workgroup = [128, 256, 0]
419+
// MI355X-SAME: reduction = [0, 0, 2]
420+
// MI355X-SAME: subgroup = [2, 4, 0]
421+
// MI355X-SAME: workgroup = [256, 256, 0]
422422

423423
// -----
424424

@@ -437,13 +437,13 @@ func.func @matmul_large_tall_m_f16(
437437

438438
// MI355X-LABEL: func.func @matmul_large_tall_m_f16
439439
// MI355X-SAME: #iree_codegen.translation_info<pipeline = #iree_gpu.pipeline<TileAndFuse>
440-
// MI355X-SAME: workgroup_size = [256, 1, 1] subgroup_size = 64
440+
// MI355X-SAME: workgroup_size = [512, 1, 1] subgroup_size = 64
441441
// MI355X: linalg.matmul {{.*}}lowering_config = #iree_gpu.lowering_config
442-
// MI355X-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F16>
442+
// MI355X-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_32x32x16_F16>
443443
// MI355X-SAME: promote_operands = [0, 1]
444-
// MI355X-SAME: reduction = [0, 0, 1]
445-
// MI355X-SAME: subgroup = [4, 8, 0]
446-
// MI355X-SAME: workgroup = [128, 256, 0]
444+
// MI355X-SAME: reduction = [0, 0, 2]
445+
// MI355X-SAME: subgroup = [2, 4, 0]
446+
// MI355X-SAME: workgroup = [256, 256, 0]
447447

448448
// -----
449449

@@ -462,13 +462,13 @@ func.func @matmul_large_wide_n_f16(
462462

463463
// MI355X-LABEL: func.func @matmul_large_wide_n_f16
464464
// MI355X-SAME: #iree_codegen.translation_info<pipeline = #iree_gpu.pipeline<TileAndFuse>
465-
// MI355X-SAME: workgroup_size = [256, 1, 1] subgroup_size = 64
465+
// MI355X-SAME: workgroup_size = [512, 1, 1] subgroup_size = 64
466466
// MI355X: linalg.matmul {{.*}}lowering_config = #iree_gpu.lowering_config
467-
// MI355X-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F16>
467+
// MI355X-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_32x32x16_F16>
468468
// MI355X-SAME: promote_operands = [0, 1]
469-
// MI355X-SAME: reduction = [0, 0, 1]
470-
// MI355X-SAME: subgroup = [4, 8, 0]
471-
// MI355X-SAME: workgroup = [128, 256, 0]
469+
// MI355X-SAME: reduction = [0, 0, 2]
470+
// MI355X-SAME: subgroup = [2, 4, 0]
471+
// MI355X-SAME: workgroup = [256, 256, 0]
472472

473473
// -----
474474

@@ -490,11 +490,11 @@ func.func @matmul_large_very_tall_m_f16(
490490
// MI355X-SAME: #iree_codegen.translation_info<pipeline = #iree_gpu.pipeline<TileAndFuse>
491491
// MI355X-SAME: workgroup_size = [256, 1, 1] subgroup_size = 64
492492
// MI355X: linalg.matmul {{.*}}lowering_config = #iree_gpu.lowering_config
493-
// MI355X-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F16>
493+
// MI355X-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_32x32x16_F16>
494494
// MI355X-SAME: padding = [128, 256, 32]
495495
// MI355X-SAME: promote_operands = [0, 1]
496-
// MI355X-SAME: reduction = [0, 0, 1]
497-
// MI355X-SAME: subgroup = [4, 8, 0]
496+
// MI355X-SAME: reduction = [0, 0, 2]
497+
// MI355X-SAME: subgroup = [2, 4, 0]
498498
// MI355X-SAME: workgroup = [128, 256, 0]
499499

500500
// -----

0 commit comments

Comments (0)