Skip to content

Commit d84dc73

Browse files
[Codegen][GPU] Fix f32 attention compilation failure when head_dim=128 (#24138)
Fix attention schedule deduction where the QK intrinsic N was incorrectly set to the PV intrinsic K, causing the K2 tile to be smaller than the QK accumulator inner tile for f32, which later fails the packing in `GPUPackToIntrinsics`. Added a regression test for f32 attention with head_dim=128. Fixes #24135 --------- Signed-off-by: Keshav Vinayak Jha <keshavvinayakjha@gmail.com>
1 parent 445cf9c commit d84dc73

3 files changed

Lines changed: 73 additions & 6 deletions

File tree

compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1206,10 +1206,22 @@ FailureOr<std::pair<GPUMMASchedule, GPUMMASchedule>> deduceAttentionSchedule(
12061206
int64_t intrinsicAN = intrinsicA.nSizes[0];
12071207
int64_t intrinsicAK = intrinsicA.kSizes[0];
12081208
auto isValidSchedule = [&](const GPUMMASchedule &schedule) -> bool {
1209+
// The output of the QK matmul must be a valid LHS of the PV matmul.
1210+
// The total LHS tile (M x K) of the PV matmul must be a multiple of
1211+
// the output tile (M x N) of the intrinsic used for the QK matmul.
1212+
int64_t pvMTile = schedule.getTotalMTileSize() *
1213+
schedule.getTotalMSize() *
1214+
schedule.getTotalMSubgroupCount();
1215+
int64_t pvKTile = schedule.getTotalKTileSize() * schedule.getTotalKSize();
1216+
if (pvMTile % intrinsicAM != 0 || pvKTile % intrinsicAN != 0) {
1217+
return false;
1218+
}
1219+
12091220
// Create a mma schedule for qkMatmul in attention.
12101221
// qkMatmul.M = pvMatmul.M
12111222
// qkMatmul.N = pvMatmul.K
1212-
// qkMatmul.K = problem.K
1223+
// qkMatmul.K = problem.K1
1224+
int64_t qkNTiles = pvKTile / intrinsicAN;
12131225
SmallVector<int64_t, 2> qkKSizes = qkMatmul.kSizes;
12141226
qkKSizes.back() = qkMatmul.kSizes.back() / intrinsicAK;
12151227
GPUMMASchedule qkSchedule{
@@ -1220,7 +1232,7 @@ FailureOr<std::pair<GPUMMASchedule, GPUMMASchedule>> deduceAttentionSchedule(
12201232
/*mSubgroupCount=*/schedule.mSubgroupCounts,
12211233
/*nSubgroupCount=*/SmallVector<int64_t>(qkMatmul.nSizes.size(), 1),
12221234
schedule.mTileSizes,
1223-
schedule.kTileSizes,
1235+
{qkNTiles},
12241236
qkKSizes};
12251237

12261238
bool isQKAligned =
@@ -1262,18 +1274,21 @@ FailureOr<std::pair<GPUMMASchedule, GPUMMASchedule>> deduceAttentionSchedule(
12621274
// Create a mma schedule for qkMatmul in attention.
12631275
// qkMatmul.M = pvMatmul.M
12641276
// qkMatmul.N = pvMatmul.K
1265-
// qkMatmul.K = problem.K
1277+
// qkMatmul.K = problem.K1
1278+
int64_t pvKTile =
1279+
pvSchedule->getTotalKTileSize() * pvSchedule->getTotalKSize();
1280+
int64_t qkNTiles = pvKTile / intrinsicAN;
12661281
SmallVector<int64_t, 2> qkKSizes = qkMatmul.kSizes;
12671282
qkKSizes.back() = qkMatmul.kSizes.back() / intrinsicAK;
12681283
GPUMMASchedule qkSchedule{
12691284
intrinsicA.mmaKind,
12701285
pvSchedule->mSizes,
1271-
pvSchedule->kSizes,
1286+
{intrinsicAN},
12721287
{intrinsicAK},
12731288
/*mSubgroupCount=*/pvSchedule->mSubgroupCounts,
12741289
/*nSubgroupCount=*/SmallVector<int64_t>(qkMatmul.nSizes.size(), 1),
12751290
pvSchedule->mTileSizes,
1276-
pvSchedule->kTileSizes,
1291+
{qkNTiles},
12771292
qkKSizes};
12781293

12791294
return std::pair(qkSchedule, pvSchedule.value());

compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -869,7 +869,7 @@ static LogicalResult setAttentionIntrinsicBasedVectorDistributionConfig(
869869
Type f32Type = b.getF32Type();
870870
GPUMatmulShapeType qkMatmul{
871871
/*m=*/getDimBounds(mDims),
872-
/*n=*/getDimBounds(nDims),
872+
/*n=*/getDimBounds(k2Dims),
873873
/*k=*/getDimBounds(k1Dims),
874874
/*batch=*/getDimBounds(batchDims),
875875
/*a=*/qElementType,

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute_gfx942.mlir

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -524,3 +524,55 @@ func.func @attention_multi_m_dynamic(%arg0 : tensor<20x8x?x16x64xf16>, %arg1 : t
524524
// CHECK-SAME: #iree_gpu.lowering_config
525525
// CHECK-SAME: reduction = [0, 0, 0, 0, 0, 64, 0]
526526
// CHECK-SAME: workgroup = [1, 4, 1, 16, 0, 0, 64]
527+
528+
// -----
529+
530+
// CHECK: #iree_codegen.translation_info<pipeline = #iree_gpu.pipeline<VectorDistribute>
531+
532+
// CHECK-LABEL: func.func @attention_f32_20x4096x128x4096x128()
533+
534+
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>
535+
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>
536+
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>
537+
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()>
538+
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>
539+
#map5 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1)>
540+
#pipeline_layout = #hal.pipeline.layout<bindings = [
541+
#hal.pipeline.binding<storage_buffer>,
542+
#hal.pipeline.binding<storage_buffer>,
543+
#hal.pipeline.binding<storage_buffer>,
544+
#hal.pipeline.binding<storage_buffer>
545+
]>
546+
func.func @attention_f32_20x4096x128x4096x128() {
547+
%cst = arith.constant 1.250000e-01 : f32
548+
%c0 = arith.constant 0 : index
549+
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<20x4096x128xf32>>
550+
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<20x4096x128xf32>>
551+
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<20x4096x128xf32>>
552+
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(3) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<20x4096x128xf32>>
553+
%4 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 128], strides = [1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<20x4096x128xf32>> -> tensor<20x4096x128xf32>
554+
%5 = iree_tensor_ext.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 128], strides = [1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<20x4096x128xf32>> -> tensor<20x4096x128xf32>
555+
%6 = iree_tensor_ext.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 128], strides = [1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<20x4096x128xf32>> -> tensor<20x4096x128xf32>
556+
%7 = tensor.empty() : tensor<20x4096x128xf32>
557+
%8 = tensor.empty() : tensor<20x4096xf32>
558+
%cst_0 = arith.constant 0.000000e+00 : f32
559+
%cst_1 = arith.constant -3.40282347E+38 : f32
560+
%cst_2 = arith.constant 0.000000e+00 : f32
561+
%9 = linalg.fill ins(%cst_0 : f32) outs(%7 : tensor<20x4096x128xf32>) -> tensor<20x4096x128xf32>
562+
%10 = linalg.fill ins(%cst_1 : f32) outs(%8 : tensor<20x4096xf32>) -> tensor<20x4096xf32>
563+
%11 = linalg.fill ins(%cst_2 : f32) outs(%8 : tensor<20x4096xf32>) -> tensor<20x4096xf32>
564+
%12:3 = iree_linalg_ext.online_attention {indexing_maps = [#map, #map1, #map2, #map3, #map4, #map5, #map5]} ins(%4, %5, %6, %cst : tensor<20x4096x128xf32>, tensor<20x4096x128xf32>, tensor<20x4096x128xf32>, f32) outs(%9, %10, %11 : tensor<20x4096x128xf32>, tensor<20x4096xf32>, tensor<20x4096xf32>) {
565+
^bb0(%arg0: f32):
566+
iree_linalg_ext.yield %arg0 : f32
567+
} -> tensor<20x4096x128xf32>, tensor<20x4096xf32>, tensor<20x4096xf32>
568+
iree_tensor_ext.dispatch.tensor.store %12#0, %3, offsets = [0, 0, 0], sizes = [20, 4096, 128], strides = [1, 1, 1] : tensor<20x4096x128xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<20x4096x128xf32>>
569+
return
570+
}
571+
572+
// CHECK: #iree_gpu.lowering_config
573+
// CHECK-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>
574+
// CHECK-SAME: #iree_gpu.lowering_config
575+
// CHECK-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>
576+
// CHECK-SAME: #iree_gpu.lowering_config
577+
// CHECK-SAME: reduction = [0, 0, 0, 16, 0]
578+
// CHECK-SAME: workgroup = [1, 64, 0, 0, 64]

0 commit comments

Comments (0)