Skip to content

Commit 8f34e7f

Browse files
committed
[AMD][TDM] Add lit tests for warp_bases and rename to "partial TDM copy"
Add verifier negative tests (wrong size, non-contiguous prefix, greedy mismatch) and lowering tests (predication logic, partitioned layout instruction count) for the warp_bases attribute. Rename "warp specialization" to "partial TDM copy" in all TDM warp_bases-related comments and docs to better describe the mechanism.
1 parent a6f4e3a commit 8f34e7f

7 files changed

Lines changed: 104 additions & 12 deletions

File tree

python/triton/experimental/gluon/language/amd/gfx1250/tdm.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ def make_tensor_descriptor(base: ttgl.tensor, shape: List[ttgl.constexpr | ttgl.
144144

145145

146146
def _validate_warp_bases(warp_bases, block_shape, num_warps):
147-
"""Validate warp_bases for TDM warp specialization.
147+
"""Validate warp_bases for partial TDM copy.
148148
149149
warp_bases must be log2(num_warps) entries where the non-zero entries form
150150
a contiguous prefix matching the greedy distribution for block_shape over
@@ -214,7 +214,7 @@ def async_load(src: tensor_descriptor, offsets: List[ttgl.constexpr | ttgl.tenso
214214
dest (shared_memory_descriptor): the shared memory destination to store the loaded data.
215215
pred (int, optional): Predicate to enable or disable the load. Defaults to 1.
216216
mbarrier (shared_memory_descriptor, optional): The barrier object to signal "arrive" on.
217-
warp_bases (List[List[int]], optional): Per-bit warp-to-offset mapping for TDM warp specialization.
217+
warp_bases (List[List[int]], optional): Per-bit warp-to-offset mapping for partial TDM copy.
218218
Each entry maps one bit of warpId to an element offset in the tensor coordinate space.
219219
A zero basis means that bit contributes no offset (duplicate warp, gets pred=0).
220220
"""

test/Conversion/amd/tritongpu_tdm_to_llvm.mlir

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,3 +234,55 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
234234
tt.return
235235
}
236236
}
237+
238+
// -----
239+
240+
// Partial TDM copy: 4 active warps out of 8, verify predication logic
241+
#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
242+
#smem = #ttg.shared_memory
243+
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 32 : i32} {
244+
// CHECK-LABEL: tdm_load_warp_bases_predication
245+
tt.func public @tdm_load_warp_bases_predication(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}) {
246+
%c_shape = arith.constant 256 : i32
247+
%c_stride0 = arith.constant 256 : i64
248+
%c_stride1 = arith.constant 1 : i64
249+
%c_offset = arith.constant 0 : i32
250+
%c_pred = arith.constant 1 : i32
251+
%0 = tt.make_tensor_descriptor %arg0, [%c_shape, %c_shape], [%c_stride0, %c_stride1] : <f16>, <256x64xf16, #shared>
252+
%1 = ttg.local_alloc : () -> !ttg.memdesc<256x64xf16, #shared, #smem, mutable>
253+
// warp_bases for 4 active warps: pred = user_pred AND (warpId < 4)
254+
// CHECK-DAG: %[[C4:.*]] = llvm.mlir.constant(4 : i32) : i32
255+
// CHECK: %[[IS_ACTIVE:.*]] = llvm.icmp "ult" %{{.*}}, %[[C4]] : i32
256+
// CHECK: %[[LAYOUT_PRED:.*]] = llvm.select %[[IS_ACTIVE]], %{{.*}}, %{{.*}} : i1, i32
257+
// CHECK: llvm.and %{{.*}}, %[[LAYOUT_PRED]] : i32
258+
// CHECK: "llvm.amdgcn.tensor.load.to.lds"
259+
%2 = amdg.async_tdm_copy_global_to_local %0[%c_offset, %c_offset] into %1, pred = %c_pred {warp_bases = array<i64: 64, 0, 128, 0, 0, 0>} : !tt.tensordesc<256x64xf16, #shared> -> !ttg.memdesc<256x64xf16, #shared, #smem, mutable>
260+
tt.return
261+
}
262+
}
263+
264+
// -----
265+
266+
// Partial TDM copy with partitioned layout: effectiveWarps controls TDM instruction count.
267+
// Without warp_bases (4 warps), all 4 logical pieces fit in 1 instruction.
268+
// With warp_bases for 2 active warps, gcd(2,4)=2 → ceil(4/2)=2 instructions.
269+
#shared_inner = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
270+
#partitioned = #ttg.partitioned_shared<{numPartitions = 2, numGroups = 2, partitionDim = 0, partitionLayout = #shared_inner}>
271+
#smem = #ttg.shared_memory
272+
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
273+
// CHECK-LABEL: tdm_load_warp_bases_partitioned
274+
tt.func public @tdm_load_warp_bases_partitioned(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}) {
275+
%c_shape = arith.constant 256 : i32
276+
%c_stride0 = arith.constant 256 : i64
277+
%c_stride1 = arith.constant 1 : i64
278+
%c_offset = arith.constant 0 : i32
279+
%c_pred = arith.constant 1 : i32
280+
%0 = tt.make_tensor_descriptor %arg0, [%c_shape, %c_shape], [%c_stride0, %c_stride1] : <f16>, <128x16xf16, #partitioned>
281+
%1 = ttg.local_alloc : () -> !ttg.memdesc<128x16xf16, #partitioned, #smem, mutable>
282+
// 2 active warps on partitioned layout → 2 TDM instructions
283+
// CHECK: llvm.icmp "ult"
284+
// CHECK-COUNT-2: "llvm.amdgcn.tensor.load.to.lds"
285+
%2 = amdg.async_tdm_copy_global_to_local %0[%c_offset, %c_offset] into %1, pred = %c_pred {warp_bases = array<i64: 64, 0, 0, 0>} : !tt.tensordesc<128x16xf16, #partitioned> -> !ttg.memdesc<128x16xf16, #partitioned, #smem, mutable>
286+
tt.return
287+
}
288+
}

test/TritonGPU/amd/invalid.mlir

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,3 +282,43 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
282282
tt.return
283283
}
284284
}
285+
286+
// -----
287+
288+
// warp_bases validation tests
289+
#shared_wb = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
290+
#smem_wb = #ttg.shared_memory
291+
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "hip:gfx1250", "ttg.threads-per-warp" = 32 : i32} {
292+
tt.func @warp_bases_wrong_size(
293+
%tensorDesc: !tt.tensordesc<256x64xf16>,
294+
%memDesc: !ttg.memdesc<256x64xf16, #shared_wb, #smem_wb, mutable>,
295+
%pred: i32
296+
) {
297+
%c0 = arith.constant 0 : i32
298+
// expected-error @+1 {{warp_bases must have log2(num_warps) * ndim = 6 elements, got 4}}
299+
%0 = amdg.async_tdm_copy_global_to_local %tensorDesc[%c0, %c0] into %memDesc, pred = %pred {warp_bases = array<i64: 64, 0, 128, 0>} : !tt.tensordesc<256x64xf16> -> !ttg.memdesc<256x64xf16, #shared_wb, #smem_wb, mutable>
300+
tt.return
301+
}
302+
303+
tt.func @warp_bases_non_contiguous_prefix(
304+
%tensorDesc: !tt.tensordesc<256x64xf16>,
305+
%memDesc: !ttg.memdesc<256x64xf16, #shared_wb, #smem_wb, mutable>,
306+
%pred: i32
307+
) {
308+
%c0 = arith.constant 0 : i32
309+
// expected-error @+1 {{warp_bases non-zero entries must form a contiguous prefix; found non-zero basis at bit 1 after a zero basis}}
310+
%0 = amdg.async_tdm_copy_global_to_local %tensorDesc[%c0, %c0] into %memDesc, pred = %pred {warp_bases = array<i64: 0, 0, 64, 0, 0, 0>} : !tt.tensordesc<256x64xf16> -> !ttg.memdesc<256x64xf16, #shared_wb, #smem_wb, mutable>
311+
tt.return
312+
}
313+
314+
tt.func @warp_bases_greedy_mismatch(
315+
%tensorDesc: !tt.tensordesc<256x64xf16>,
316+
%memDesc: !ttg.memdesc<256x64xf16, #shared_wb, #smem_wb, mutable>,
317+
%pred: i32
318+
) {
319+
%c0 = arith.constant 0 : i32
320+
// expected-error @+1 {{warp_bases mismatch at bit 0 dim 0: expected 64 but got 0; non-zero bases must match the greedy distribution for block_shape over active_warps=4}}
321+
%0 = amdg.async_tdm_copy_global_to_local %tensorDesc[%c0, %c0] into %memDesc, pred = %pred {warp_bases = array<i64: 0, 32, 0, 64, 0, 0>} : !tt.tensordesc<256x64xf16> -> !ttg.memdesc<256x64xf16, #shared_wb, #smem_wb, mutable>
322+
tt.return
323+
}
324+
}

third_party/amd/include/Dialect/TritonAMDGPU/IR/TritonAMDGPUOps.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -751,7 +751,7 @@ def AsyncTDMCopyGlobalToLocalOp : TT_AMDGPU_Op<"async_tdm_copy_global_to_local",
751751
The operation can also take an optional 64bit LDS barrier address, in which case
752752
it sends an "LDS atomic arrive" to signal its completion.
753753

754-
`warp_bases` is an optional attribute for TDM warp specialization.
754+
`warp_bases` is an optional attribute for partial TDM copy.
755755
Each entry maps one bit of warpId to an element offset in the tensor
756756
coordinate space. A `[0, ..., 0]` basis means that bit of warpId
757757
contributes no offset (degenerate / duplicate warp). Duplicate warps

third_party/amd/lib/TritonAMDGPUToLLVM/TDMUtility.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -534,7 +534,7 @@ swapOutDimSemantics(const triton::LinearLayout &layout, StringAttr dimA,
534534
// Fill TDM descriptor for regular load/store operations (1D-5D tensors).
535535
// activeWarps: number of warps that actually issue TDM copies (power of two,
536536
// <= numWarps). Warps with warpId >= activeWarps get pred=0 (hardware no-op).
537-
// A value of 0 means all warps are active (no warp specialization).
537+
// A value of 0 means all warps are active (no partial TDM copy).
538538
void fillTDMDescriptor(
539539
RewriterBase &rewriter, Location loc,
540540
const LLVMTypeConverter *typeConverter, Type elementType,
@@ -584,7 +584,7 @@ void fillTDMDescriptor(
584584
: std::nullopt,
585585
numDims);
586586

587-
// When warp specialization is active, the per-warp block shape differs from
587+
// When partial TDM copy is active, the per-warp block shape differs from
588588
// what createTDMDescriptor encoded (which used numWarps). Re-encode the
589589
// correct per-warp tile dimensions based on warpsPerCTA (from activeWarps).
590590
if (activeWarps > 0) {
@@ -737,7 +737,7 @@ void fillTDMDescriptor(
737737
Value globalAddr = b.ptrtoint(i64_ty, srcPtr);
738738
Value ldsAddr = b.ptrtoint(i32_ty, dstPtr);
739739

740-
// Combine user predicate with layout predicate for warp specialization.
740+
// Combine user predicate with layout predicate for partial TDM copy.
741741
// Duplicate warps (warpId >= activeWarps) get pred=0 (hardware no-op).
742742
if (activeWarps > 0 && activeWarps < numWarps) {
743743
Value isActive = b.icmp_ult(warpId, b.i32_val(activeWarps));
@@ -1105,7 +1105,7 @@ void emitTDMLoadStore(RewriterBase &rewriter, Location loc,
11051105
activeWarps = 1 << activeCount;
11061106
}
11071107

1108-
// When warp specialization is active, compute the warp distribution based
1108+
// When partial TDM copy is active, compute the warp distribution based
11091109
// on activeWarps instead of numWarps.
11101110
int effectiveWarps = (activeWarps > 0) ? activeWarps : numWarps;
11111111

third_party/amd/lib/TritonAMDGPUToLLVM/TDMUtility.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ TDMDescriptor createTDMDescriptor(RewriterBase &rewriter, Location loc,
4444
// For partitioned shared memory, dstPtrs contains multiple base pointers and
4545
// the correct one is selected based on sharedLayout's partition dimension.
4646
// activeWarps: number of warps that actually issue TDM copies (power of two,
47-
// <= numWarps). 0 means all warps are active (no warp specialization).
47+
// <= numWarps). 0 means all warps are active (no partial TDM copy).
4848
void fillTDMDescriptor(
4949
RewriterBase &rewriter, Location loc,
5050
const LLVMTypeConverter *typeConverter, Type elementType,

third_party/amd/python/examples/gluon/f16_gemm_warp_pipeline_gfx1250.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ def gemm_tdm_pipelined_warp_pipelined_kernel(a_ptr, b_ptr, c_ptr, #
100100

101101

102102
# ---------------------------------------------------------------------------
103-
# TDM warp-specialized variant: only a subset of warps issue TDM copies.
103+
# Partial TDM copy variant: only a subset of warps issue TDM copies.
104104
# Duplicate warps get pred=0 (hardware no-op), freeing TDM bandwidth.
105105
# ---------------------------------------------------------------------------
106106

@@ -194,7 +194,7 @@ def gemm_tdm_specialized_pipelined_warp_pipelined_kernel(a_ptr, b_ptr, c_ptr, #
194194
# ---------------------------------------------------------------------------
195195

196196
def _compute_tdm_warp_bases(block_shape, num_warps, active_warps):
197-
"""Compute warp_bases for TDM specialization with the given active warp count.
197+
"""Compute warp_bases for partial TDM copy with the given active warp count.
198198
199199
Returns a tuple of tuples suitable for passing as a constexpr.
200200
"""
@@ -311,7 +311,7 @@ def test_runtime_gemm_tdm_specialized_pipelined(BLOCK_M, BLOCK_N, BLOCK_K, NUM_B
311311
num_warps = 8
312312
WARP_BASES = [(0, 1), (1, 0), (2, 0)]
313313

314-
# 4-warp TDM specialization: warps 4-7 duplicate 0-3 (pred=0, hardware no-op)
314+
# 4-warp partial TDM copy: warps 4-7 duplicate 0-3 (pred=0, hardware no-op)
315315
tdm_warp_bases = _compute_tdm_warp_bases([BLOCK_M, BLOCK_K], num_warps, 4)
316316

317317
warp_bases = tuple(WARP_BASES)
@@ -348,7 +348,7 @@ def test_runtime_gemm_tdm_specialized_pipelined(BLOCK_M, BLOCK_N, BLOCK_K, NUM_B
348348
parser.add_argument("-K", type=int, default=1024, help='problem K size')
349349
parser.add_argument("--num-buffers", type=int, choices=[2, 3, 4], default=3, help='num shared memory buffers')
350350
parser.add_argument("--4warp-tdm", action="store_true", dest="four_warp_tdm",
351-
help="Use 4-warp TDM specialization (warps 4-7 skip TDM copies)")
351+
help="Use 4-warp partial TDM copy (warps 4-7 skip TDM copies)")
352352
parser.add_argument("--dump", action="store_true", help="Print out result/golden tensors")
353353
args = parser.parse_args()
354354

0 commit comments

Comments (0)