
Commit 17cc629

njriasan authored and meta-codesync[bot] committed
[AutoWS] Fix AutoWS Hopper GEMM partition structure with DP=2 (#1322)
Summary: Fixes the partition structure with DP=2. Previously the two data-partitioned dots were mapped to the same partition, so there was no benefit. Now they are mapped to separate partitions, allowing greater parallelism and the opportunity to use ping-pong. This also fixes several structural bugs in the compiler.

Pull Request resolved: #1322
Reviewed By: manman-ren
Differential Revision: D102372962
Pulled By: njriasan
fbshipit-source-id: 33ee700faa08086a798dd21f0d078264d61e3f78
1 parent 495023b · commit 17cc629

6 files changed

Lines changed: 327 additions & 70 deletions


lib/Dialect/TritonGPU/Transforms/WarpSpecialization/OptimizePartitionWarps.cpp

Lines changed: 4 additions & 2 deletions
@@ -164,6 +164,7 @@ static LogicalResult optimizePartitionNumWarps(ModuleAxisInfoAnalysis &axisInfo,
   SmallVector<unsigned> maxTensorRegs;
   for (Region *partition : wsOp.getPartitionRegions()) {
     unsigned &tensorRegs = maxTensorRegs.emplace_back(0);
+
     partition->walk([&](Operation *op) {
       for (Type type :
            llvm::concat<Type>(op->getOperandTypes(), op->getResultTypes())) {
@@ -207,7 +208,9 @@ static LogicalResult optimizePartitionNumWarps(ModuleAxisInfoAnalysis &axisInfo,
       if (isa<ttng::AsyncTMAGatherOp, ttng::AsyncTMAScatterOp>(op))
         *minWarps = 2;
       // TMEM ops require at least 4 warps to be able to read all lanes.
-      else if (isa<ttng::TMEMLoadOp, ttng::TMEMStoreOp, ttng::TMEMAllocOp>(op))
+      // WarpGroupDotOp requires a full warp group (4 warps).
+      else if (isa<ttng::TMEMLoadOp, ttng::TMEMStoreOp, ttng::TMEMAllocOp,
+               ttng::WarpGroupDotOp>(op))
         *minWarps = 4;
     });
   }
@@ -306,7 +309,6 @@ static LogicalResult optimizePartitionNumWarps(ModuleAxisInfoAnalysis &axisInfo,
   for (auto [partition, newNumWarps, prevNumWarps, tensorRegs, estRegs] :
        llvm::zip(wsOp.getPartitionRegions(), partitionNumWarps,
                  wsOp.getPartitionNumWarps(), maxTensorRegs, estRegUsage)) {
-
     // "Guess" the register usage for each partition.
     estRegs = tensorRegs ? maxRegAutoWS : minRegAutoWS;

python/test/unit/language/test_tutorial09_warp_specialization.py

Lines changed: 6 additions & 0 deletions
@@ -1110,6 +1110,7 @@ def test_hopper_matmul_tma_warp_specialize(
     """Test matmul_kernel_tma with warp_specialize=True on Hopper (K-loop based)."""
     if DATA_PARTITION_FACTOR != 1 and BLOCK_SIZE_M != 128:
         pytest.skip("DATA_PARTITION_FACTOR != 1 requires BLOCK_SIZE_M == 128")
+
     if BLOCK_SIZE_N == 256 and BLOCK_SIZE_K == 128 and not (BLOCK_SIZE_M == 64 and num_stages == 2):
         pytest.skip("OOM: shared memory exceeds H100 limit")

@@ -1169,6 +1170,7 @@ def alloc_fn(size, align, stream):
             num_warps=num_warps,
             early_tma_store_lowering=use_early_tma_store_lowering,
             pingpongAutoWS=enable_pingpong,
+            maxRegAutoWS=208 if DATA_PARTITION_FACTOR > 1 else 252,
         )

         ttgir = kernel.asm["ttgir"]
@@ -1221,6 +1223,7 @@ def test_hopper_matmul_tma_persistent_warp_specialize(
     """
     if DATA_PARTITION_FACTOR != 1 and BLOCK_SIZE_M != 128:
         pytest.skip("DATA_PARTITION_FACTOR != 1 requires BLOCK_SIZE_M == 128")
+
     if BLOCK_SIZE_N == 256 and BLOCK_SIZE_K == 128 and not (BLOCK_SIZE_M == 64 and num_stages == 2):
         pytest.skip("OOM: shared memory exceeds H100 limit")

@@ -1295,6 +1298,7 @@ def alloc_fn(size, align, stream):
             num_warps=num_warps,
             early_tma_store_lowering=use_early_tma_store_lowering,
             pingpongAutoWS=enable_pingpong,
+            maxRegAutoWS=208 if DATA_PARTITION_FACTOR > 1 else 252,
         )

         ttgir = kernel.asm["ttgir"]
@@ -1348,6 +1352,7 @@ def test_hopper_matmul_descriptor_persistent_warp_specialize(
     """
     if DATA_PARTITION_FACTOR != 1 and BLOCK_SIZE_M != 128:
         pytest.skip("DATA_PARTITION_FACTOR != 1 requires BLOCK_SIZE_M == 128")
+
     if BLOCK_SIZE_N == 256 and BLOCK_SIZE_K == 128 and not (BLOCK_SIZE_M == 64 and num_stages == 2):
         pytest.skip("OOM: shared memory exceeds H100 limit")

@@ -1407,6 +1412,7 @@ def alloc_fn(size, align, stream):
             num_warps=num_warps,
             early_tma_store_lowering=use_early_tma_store_lowering,
             pingpongAutoWS=enable_pingpong,
+            maxRegAutoWS=208 if DATA_PARTITION_FACTOR > 1 else 252,
         )

         ttgir = kernel.asm["ttgir"]
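
The new maxRegAutoWS values are consistent with a simple register-file budget. A back-of-the-envelope check, assuming an sm_90 SM with 65,536 32-bit registers, 4-warp (128-thread) partitions, and an illustrative low budget for the load partition (the load-partition figure and partition shapes are assumptions, not read from this diff):

REGS_PER_SM = 64 * 1024         # 32-bit registers per SM on H100 (sm_90)
THREADS_PER_PARTITION = 4 * 32  # a full warp group
LOAD_PARTITION_REGS = 40        # assumed low budget for the TMA/load partition

def total_regs(dp_factor, max_reg_auto_ws):
    compute = dp_factor * THREADS_PER_PARTITION * max_reg_auto_ws
    load = THREADS_PER_PARTITION * LOAD_PARTITION_REGS
    return compute + load

assert total_regs(1, 252) <= REGS_PER_SM  # 32,256 + 5,120 = 37,376: fits
assert total_regs(2, 208) <= REGS_PER_SM  # 53,248 + 5,120 = 58,368: fits
assert total_regs(2, 252) > REGS_PER_SM   # 69,632: why DP=2 needs the lower cap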
Lines changed: 134 additions & 0 deletions
// RUN: triton-opt %s --nvgpu-partition-scheduling-meta --verify-each=false | FileCheck %s

// Tests that on Hopper (cuda:90) with DATA_PARTITION_FACTOR=2 and
// WarpGroupDotOp, the partition scheduler correctly creates per-dpId
// computation partitions using the WarpGroupDotOp fallback (since
// WSDataPartition already split the dots, leaving no DataPartition-
// categorized ops in backward slices). Epilogue is merged into
// computation partitions so each MMA's truncf + TMA store lives
// alongside it.

#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
#blocked1 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [2, 16], warpsPerCTA = [4, 1], order = [1, 0]}>
#mma = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 128, 16]}>
#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}>
#shared1 = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = true, elementBitWidth = 16}>
#smem = #ttg.shared_memory

module attributes {"ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {

// CHECK-LABEL: hopper_data_partitioned_gemm
//
// --- Inner k-loop: descriptor_loads and local_allocs → load partition ---
// CHECK: descriptor_load{{.*}}ttg.partition = array<i32: [[LOAD:[0-9]+]]>
// CHECK: descriptor_load{{.*}}ttg.partition = array<i32: [[LOAD]]>
// CHECK: descriptor_load{{.*}}ttg.partition = array<i32: [[LOAD]]>
// CHECK: local_alloc{{.*}}ttg.partition = array<i32: [[LOAD]]>
// CHECK: local_alloc{{.*}}ttg.partition = array<i32: [[LOAD]]>
// CHECK: local_alloc{{.*}}ttg.partition = array<i32: [[LOAD]]>
//
// --- Inner k-loop: each warp_group_dot in its own computation partition ---
// CHECK: warp_group_dot{{.*}}ttg.partition = array<i32: [[COMP_A:[0-9]+]]>
// CHECK: warp_group_dot{{.*}}ttg.partition = array<i32: [[COMP_B:[0-9]+]]>
//
// --- Epilogue: each half's truncf + TMA store in same partition as its MMA ---
// CHECK: truncf{{.*}}ttg.partition = array<i32: [[COMP_A]]>
// CHECK: truncf{{.*}}ttg.partition = array<i32: [[COMP_B]]>
// CHECK: async_tma_copy_local_to_global{{.*}}ttg.partition = array<i32: [[COMP_A]]>
// CHECK: async_tma_copy_local_to_global{{.*}}ttg.partition = array<i32: [[COMP_B]]>
//
// --- Partition types: computation partitions before load ---
// CHECK: partition.types = ["computation", "computation", "load"
tt.func public @hopper_data_partitioned_gemm(
    %a_desc: !tt.tensordesc<tensor<64x64xf16, #shared>>,
    %b_desc: !tt.tensordesc<tensor<128x64xf16, #shared>>,
    %c_desc: !tt.tensordesc<tensor<64x128xf16, #shared>>,
    %M: i32 {tt.divisibility = 16 : i32},
    %N: i32 {tt.divisibility = 16 : i32},
    %K: i32 {tt.divisibility = 16 : i32}
) {
  %c132_i32 = arith.constant 132 : i32
  %c8_i32 = arith.constant 8 : i32
  %c128_i32 = arith.constant 128 : i32
  %c64_i32 = arith.constant 64 : i32
  %c0_i32 = arith.constant 0 : i32
  %c1_i32 = arith.constant 1 : i32
  %c127_i32 = arith.constant 127 : i32
  %cst = arith.constant dense<0.000000e+00> : tensor<64x128xf32, #mma>

  %start_pid = tt.get_program_id x : i32
  %num_pid_m = arith.addi %M, %c127_i32 : i32
  %num_pid_m_div = arith.divsi %num_pid_m, %c128_i32 : i32
  %num_pid_n = arith.addi %N, %c127_i32 : i32
  %num_pid_n_div = arith.divsi %num_pid_n, %c128_i32 : i32
  %k_tiles = arith.addi %K, %c64_i32 : i32
  %k_tiles_div = arith.divsi %k_tiles, %c64_i32 : i32
  %num_tiles = arith.muli %num_pid_m_div, %num_pid_n_div : i32
  %tile_id_c_init = arith.subi %start_pid, %c132_i32 : i32
  %num_pid_in_group = arith.muli %num_pid_n_div, %c8_i32 : i32

  %tile_id_c_out = scf.for %tile_id = %start_pid to %num_tiles step %c132_i32
      iter_args(%tile_id_c = %tile_id_c_init) -> (i32) : i32 {
    %group_id = arith.divsi %tile_id, %num_pid_in_group : i32
    %first_pid_m = arith.muli %group_id, %c8_i32 : i32
    %group_size_m = arith.subi %num_pid_m_div, %first_pid_m : i32
    %group_size_m_clamped = arith.minsi %group_size_m, %c8_i32 : i32
    %pid_m = arith.remsi %tile_id, %group_size_m_clamped : i32
    %pid_m_final = arith.addi %first_pid_m, %pid_m : i32
    %pid_n_tmp = arith.remsi %tile_id, %num_pid_in_group : i32
    %pid_n = arith.divsi %pid_n_tmp, %group_size_m_clamped : i32
    %offs_am = arith.muli %pid_m_final, %c128_i32 : i32
    %offs_am_1 = arith.addi %offs_am, %c64_i32 : i32
    %offs_bn = arith.muli %pid_n, %c128_i32 : i32

    // Inner k-loop with two WarpGroupDotOps (data-partitioned)
    %acc:2 = scf.for %ki = %c0_i32 to %k_tiles_div step %c1_i32
        iter_args(%acc0 = %cst, %acc1 = %cst) -> (tensor<64x128xf32, #mma>, tensor<64x128xf32, #mma>) : i32 {
      %offs_k = arith.muli %ki, %c64_i32 {loop.cluster = 1 : i32, loop.stage = 0 : i32} : i32

      %a0 = tt.descriptor_load %a_desc[%offs_am, %offs_k] {loop.cluster = 1 : i32, loop.stage = 0 : i32} : !tt.tensordesc<tensor<64x64xf16, #shared>> -> tensor<64x64xf16, #blocked>
      %a1 = tt.descriptor_load %a_desc[%offs_am_1, %offs_k] {loop.cluster = 1 : i32, loop.stage = 0 : i32} : !tt.tensordesc<tensor<64x64xf16, #shared>> -> tensor<64x64xf16, #blocked>
      %b = tt.descriptor_load %b_desc[%offs_bn, %offs_k] {loop.cluster = 1 : i32, loop.stage = 0 : i32} : !tt.tensordesc<tensor<128x64xf16, #shared>> -> tensor<128x64xf16, #blocked>

      %a0_smem = ttg.local_alloc %a0 {loop.cluster = 0 : i32, loop.stage = 1 : i32} : (tensor<64x64xf16, #blocked>) -> !ttg.memdesc<64x64xf16, #shared, #smem>
      %a1_smem = ttg.local_alloc %a1 {loop.cluster = 0 : i32, loop.stage = 1 : i32} : (tensor<64x64xf16, #blocked>) -> !ttg.memdesc<64x64xf16, #shared, #smem>
      %b_smem = ttg.local_alloc %b {loop.cluster = 0 : i32, loop.stage = 1 : i32} : (tensor<128x64xf16, #blocked>) -> !ttg.memdesc<128x64xf16, #shared, #smem>
      %b_trans = ttg.memdesc_trans %b_smem {loop.cluster = 0 : i32, loop.stage = 1 : i32, order = array<i32: 1, 0>} : !ttg.memdesc<128x64xf16, #shared, #smem> -> !ttg.memdesc<64x128xf16, #shared1, #smem>

      %dot0 = ttng.warp_group_dot %a0_smem, %b_trans, %acc0 {inputPrecision = 0 : i32, loop.cluster = 0 : i32, loop.stage = 1 : i32} : !ttg.memdesc<64x64xf16, #shared, #smem> * !ttg.memdesc<64x128xf16, #shared1, #smem> -> tensor<64x128xf32, #mma>
      %dot1 = ttng.warp_group_dot %a1_smem, %b_trans, %acc1 {inputPrecision = 0 : i32, loop.cluster = 0 : i32, loop.stage = 1 : i32} : !ttg.memdesc<64x64xf16, #shared, #smem> * !ttg.memdesc<64x128xf16, #shared1, #smem> -> tensor<64x128xf32, #mma>

      scf.yield %dot0, %dot1 : tensor<64x128xf32, #mma>, tensor<64x128xf32, #mma>
    } {tt.scheduled_max_stage = 1 : i32}

    // Epilogue
    %tile_id_c_next = arith.addi %tile_id_c, %c132_i32 : i32
    %group_id_c = arith.divsi %tile_id_c_next, %num_pid_in_group : i32
    %first_pid_m_c = arith.muli %group_id_c, %c8_i32 : i32
    %group_size_m_c = arith.subi %num_pid_m_div, %first_pid_m_c : i32
    %group_size_m_c_clamped = arith.minsi %group_size_m_c, %c8_i32 : i32
    %pid_m_c = arith.remsi %tile_id_c_next, %group_size_m_c_clamped : i32
    %pid_m_c_final = arith.addi %first_pid_m_c, %pid_m_c : i32
    %pid_n_c_tmp = arith.remsi %tile_id_c_next, %num_pid_in_group : i32
    %pid_n_c = arith.divsi %pid_n_c_tmp, %group_size_m_c_clamped : i32
    %offs_am_c = arith.muli %pid_m_c_final, %c128_i32 : i32
    %offs_am_c_1 = arith.addi %offs_am_c, %c64_i32 : i32
    %offs_bn_c = arith.muli %pid_n_c, %c128_i32 : i32

    %c0_f16 = arith.truncf %acc#0 : tensor<64x128xf32, #mma> to tensor<64x128xf16, #mma>
    %c1_f16 = arith.truncf %acc#1 : tensor<64x128xf32, #mma> to tensor<64x128xf16, #mma>
    %c0_cvt = ttg.convert_layout %c0_f16 : tensor<64x128xf16, #mma> -> tensor<64x128xf16, #blocked1>
    %c1_cvt = ttg.convert_layout %c1_f16 : tensor<64x128xf16, #mma> -> tensor<64x128xf16, #blocked1>
    %c0_smem = ttg.local_alloc %c0_cvt : (tensor<64x128xf16, #blocked1>) -> !ttg.memdesc<64x128xf16, #shared, #smem, mutable>
    %store_tok0 = ttng.async_tma_copy_local_to_global %c_desc[%offs_am_c, %offs_bn_c] %c0_smem : !tt.tensordesc<tensor<64x128xf16, #shared>>, !ttg.memdesc<64x128xf16, #shared, #smem, mutable> -> !ttg.async.token
    ttng.async_tma_store_token_wait %store_tok0 : !ttg.async.token
    %c1_smem = ttg.local_alloc %c1_cvt : (tensor<64x128xf16, #blocked1>) -> !ttg.memdesc<64x128xf16, #shared, #smem, mutable>
    %store_tok1 = ttng.async_tma_copy_local_to_global %c_desc[%offs_am_c_1, %offs_bn_c] %c1_smem : !tt.tensordesc<tensor<64x128xf16, #shared>>, !ttg.memdesc<64x128xf16, #shared, #smem, mutable> -> !ttg.async.token
    ttng.async_tma_store_token_wait %store_tok1 : !ttg.async.token

    scf.yield %tile_id_c_next : i32
  } {tt.data_partition_factor = 2 : i32, tt.smem_alloc_algo = 0 : i32, tt.warp_specialize}
  tt.return
}

} // module
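
The CHECK lines above encode the shape of the expected schedule rather than its mechanics. As a toy model (a deliberate simplification under assumed op tags, not the scheduler's actual algorithm): loads and their local_allocs share one load partition, each warp_group_dot anchors a computation partition keyed by its dpId, and epilogue ops follow the dot whose result they consume.

def toy_schedule(ops):
    # ops: (op_name, dp_id) pairs; dp_id is None for loads shared by both halves.
    parts = {"load": [], "computation": {}}
    for name, dp_id in ops:
        if dp_id is None:
            parts["load"].append(name)
        else:
            parts["computation"].setdefault(dp_id, []).append(name)
    return parts

parts = toy_schedule([
    ("descriptor_load", None), ("local_alloc", None),
    ("warp_group_dot", 0), ("warp_group_dot", 1),
    ("truncf", 0), ("truncf", 1),
    ("async_tma_copy_local_to_global", 0), ("async_tma_copy_local_to_global", 1),
])
assert len(parts["computation"]) == 2  # one computation partition per dpId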

third_party/nvidia/backend/compiler.py

Lines changed: 1 addition & 3 deletions
@@ -393,8 +393,6 @@ def make_ttgir(mod, metadata, opt, capability):
         passes.ttgpuir.add_assign_latencies(pm, opt.num_stages, use_meta_swp_schedule)
         passes.ttgpuir.add_schedule_loops(pm, opt.num_stages, use_meta_swp_schedule)
         passes.ttgpuir.add_pipeline(pm, opt.num_stages, dump_enabled)
-        if knobs.nvidia.use_meta_ws:
-            passes.ttgpuir.add_optimize_partition_warps(pm)
     elif capability // 10 >= 10:
         if not knobs.nvidia.use_modulo_schedule:
             passes.ttgpuir.add_fuse_nested_loops(pm)
@@ -464,7 +462,7 @@ def make_ttgir(mod, metadata, opt, capability):
     passes.common.add_symbol_dce(pm)
     # Optimize the number of warps and registers after TMA lowering, so
     # that any local loads eliminated by TMA lowering do not inflate them.
-    if capability // 10 >= 10 and knobs.nvidia.use_meta_ws:
+    if capability // 10 >= 9 and knobs.nvidia.use_meta_ws:
         passes.ttgpuir.add_optimize_partition_warps(pm)
     nvidia.passes.ttnvgpuir.add_fence_insertion(pm, capability)
     nvidia.passes.ttnvgpuir.add_lower_mma(pm)
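
The net effect: the post-TMA-lowering invocation of optimize_partition_warps was Blackwell-only; now Hopper qualifies too, and the redundant Hopper-path invocation before pipelining is dropped. A small sketch of the new gate (the helper name is hypothetical; only the expression itself comes from the diff):

def runs_late_optimize_partition_warps(capability, use_meta_ws):
    # capability = 10 * major + minor, e.g. 90 for Hopper, 100 for Blackwell.
    return capability // 10 >= 9 and use_meta_ws

assert runs_late_optimize_partition_warps(90, True)       # Hopper: newly enabled
assert runs_late_optimize_partition_warps(100, True)      # Blackwell: unchanged
assert not runs_late_optimize_partition_warps(80, True)   # Ampere: still excluded
assert not runs_late_optimize_partition_warps(90, False)  # knob off: excluded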

third_party/nvidia/hopper/lib/Transforms/WarpSpecialization/CodePartitionUtility.cpp

Lines changed: 19 additions & 2 deletions
@@ -2717,8 +2717,6 @@ static void createChannelPost(Operation *allocOp, mlir::DominanceInfo &dom,
   if (!producerOp)
     return;
   auto producerTaskIds = getAsyncTaskIds(producerOp);
-  assert(producerTaskIds.size() == 1);
-  auto producerTaskId = producerTaskIds.front();
   // Collect consumer task IDs from all consumers. With data partitioning,
   // different consumers may have different task IDs (e.g., K/V buffers
   // consumed by multiple computation partitions).
@@ -2730,6 +2728,25 @@ static void createChannelPost(Operation *allocOp, mlir::DominanceInfo &dom,
       consumerTaskIds.push_back(id);
     }
   }
+
+  // When a producer has multiple task IDs (e.g., a shared local_alloc
+  // consumed by data-partitioned computation groups), no channel is needed
+  // for any producer that is co-located with a consumer. It is unclear if
+  // this is sufficient when there are multiple consumers.
+  AsyncTaskId producerTaskId = -1;
+  if (producerTaskIds.size() > 1 && consumerTaskIds.size() == 1) {
+    auto consumerTaskId = consumerTaskIds.front();
+    for (auto id : producerTaskIds) {
+      if (id != consumerTaskId) {
+        assert(producerTaskId == -1 &&
+               "Multiple producers encountered for 1 consumer");
+        producerTaskId = id;
+      }
+    }
+  } else {
+    assert(producerTaskIds.size() == 1);
+    producerTaskId = producerTaskIds.front();
+  }
   // Remove producer task id from consumerTaskIds.
   auto iter = std::remove(consumerTaskIds.begin(), consumerTaskIds.end(),
                           producerTaskId);
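
The selection logic added above can be restated compactly. A Python sketch of the same decision (illustrative; plain lists of ints stand in for the C++ task-ID containers):

def pick_producer_task_id(producer_task_ids, consumer_task_ids):
    # New case: a producer shared by data-partitioned groups carries several
    # task IDs. With exactly one consumer, the producer ID co-located with
    # that consumer needs no channel, so pick the single ID that differs.
    if len(producer_task_ids) > 1 and len(consumer_task_ids) == 1:
        consumer = consumer_task_ids[0]
        others = [tid for tid in producer_task_ids if tid != consumer]
        assert len(others) == 1, "Multiple producers encountered for 1 consumer"
        return others[0]
    # Original case: exactly one producer task ID.
    assert len(producer_task_ids) == 1
    return producer_task_ids[0]

assert pick_producer_task_id([0, 2], [0]) == 2  # skip the co-located ID 0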
