[AMD][gluon][gfx1250] Add tensor async gather support using TDM (#9313)

jammm · web-flow · commit 72d0d9090550 · 2026-02-01T11:06:21.000-08:00
Implements tensor async_gather using TDM in a similar fashion to #9299 on Gluon.
diff --git a/python/src/gluon_ir.cc b/python/src/gluon_ir.cc
@@ -981,6 +981,12 @@ void init_gluon_ir(py::module &&m) {
              self.create<ttag::AsyncTDMScatterOp>(descPtr, dstRowIndices,
                                                   dstColOffset, src, barrier);
            })
+      // Python-facing builder hook for the TDM gather op. Forwards the five
+      // values unchanged to AsyncTDMGatherOp and returns nothing to Python.
+      // `barrier` corresponds to the op's Optional operand; the Python caller
+      // passes an empty ir value when no mbarrier is given (presumably a null
+      // Value -- TODO confirm).
+      .def("create_async_tdm_gather",
+           [](GluonOpBuilder &self, Value descPtr, Value srcRowIndices,
+              Value srcColOffset, Value dst, Value barrier) {
+             self.create<ttag::AsyncTDMGatherOp>(descPtr, srcRowIndices,
+                                                 srcColOffset, dst, barrier);
+           })
       .def("create_tdm_prefetch",
            [](GluonOpBuilder &self, Value descPtr, std::vector<Value> &indices,
               Value pred, bool speculative, bool returnOffsets) -> Value {
diff --git a/python/triton/experimental/gluon/language/amd/gfx1250/tdm.py b/python/triton/experimental/gluon/language/amd/gfx1250/tdm.py
@@ -215,6 +215,44 @@ def async_scatter(desc: tensor_descriptor, dst_row_indices: ttgl.tensor, dst_col
                                                mbarrier_handle)
 
 
+@builtin
+def async_gather(desc: tensor_descriptor, src_row_indices: ttgl.tensor, src_col_offset, dst: shared_memory_descriptor,
+                 mbarrier: shared_memory_descriptor = None, _semantic=None) -> None:
+    """Gather data from non-contiguous rows in global memory to shared memory asynchronously.
+
+    This operation uses TDM gather mode to read data from non-contiguous rows in global memory.
+    Unlike async_load which reads from contiguous rows, gather allows reading from arbitrary
+    rows specified by the src_row_indices tensor.
+
+    The dtype of src_row_indices determines the index size:
+    - int16: up to 16 rows can be gathered per TDM instruction
+    - int32: up to 8 rows can be gathered per TDM instruction
+    If more rows are needed, multiple TDM instructions will be automatically issued.
+
+    Constraints enforced by the AsyncTDMGatherOp verifier (not re-checked in Python):
+    - src_row_indices must be a 1D tensor whose length is a power of 2.
+    - dst must not use a padded shared layout, nor a swizzled layout whose max phase is not 1.
+
+    Args:
+        desc (tensor_descriptor): the source tensor descriptor. Must be 2D.
+        src_row_indices (tensor): 1D tensor of row indices (int16 or int32) in the source tensor.
+        src_col_offset (int or tensor): the starting column offset in the source tensor
+                                        for all gathered rows.
+        dst (shared_memory_descriptor): the shared memory destination to store gathered data. Must be 2D.
+        mbarrier (shared_memory_descriptor, optional): The barrier object to signal "arrive" on.
+            When None, the op is created without a barrier operand.
+
+    Returns:
+        None. The op is emitted into the current builder for its side effect only.
+    """
+    # Rank checks are done here, before the op is built, so a wrong rank fails
+    # with a Python assertion rather than an op verification error.
+    ndim = len(desc.block_shape)
+    assert ndim == 2, f"TDM gather only supports 2D tensors, got {ndim}D"
+
+    dst_ndim = len(dst.shape)
+    assert dst_ndim == 2, f"TDM gather dst must be 2D, got {dst_ndim}D"
+
+    # Convert src_col_offset to i32
+    # (the op declares src_col_offset as I32, hence require_i64=False).
+    src_col_offset_handle = _semantic._convert_to_ir_values([src_col_offset], require_i64=False)[0]
+
+    # mbarrier may arrive wrapped in a constexpr (presumably the case for the
+    # default None); unwrap it before testing against None.
+    mbarrier = _unwrap_if_constexpr(mbarrier)
+    # With no barrier, hand the builder an empty ir value so the optional
+    # barrier operand is left unset -- TODO confirm ttgl.ir.value() is a null Value.
+    mbarrier_handle = mbarrier.handle if mbarrier is not None else ttgl.ir.value()
+
+    _semantic.builder.create_async_tdm_gather(desc.handle, src_row_indices.handle, src_col_offset_handle, dst.handle,
+                                              mbarrier_handle)
+
+
 @builtin
 def prefetch(src: tensor_descriptor, offsets: List[ttgl.constexpr | ttgl.tensor], pred: bool = True,
              speculative: bool = False, _semantic=None) -> None:
diff --git a/third_party/amd/include/Dialect/TritonAMDGPU/IR/TritonAMDGPUOps.td b/third_party/amd/include/Dialect/TritonAMDGPU/IR/TritonAMDGPUOps.td
@@ -870,6 +870,45 @@ def AsyncTDMScatterOp : TT_AMDGPU_Op<"async_tdm_scatter"> {
   let hasVerifier = 1;
 }
 
+//===----------------------------------------------------------------------===//
+// AsyncTDMGatherOp
+//===----------------------------------------------------------------------===//
+
+// ODS definition of `async_tdm_gather`. The op declares no results; it reads
+// through `desc` (global memory) and writes `dst` (shared memory), as recorded
+// by the MemRead/MemWrite effects on those operands.
+def AsyncTDMGatherOp : TT_AMDGPU_Op<"async_tdm_gather"> {
+  let summary = "Gather data from non-contiguous global memory rows to local memory asynchronously";
+
+  let description = [{
+    This operation gathers data from non-contiguous rows in global memory to local
+    memory using TDM gather mode.
+    Unlike the regular async_tdm_copy_global_to_local which reads from contiguous memory,
+    this operation uses src_row_indices to specify which rows in global memory to read from.
+
+    The descriptor must be 2D. The src_row_indices specify which rows in global memory
+    to read from. The element type of src_row_indices determines the index size:
+    - I16: 16-bit indices, up to 16 rows per instruction
+    - I32: 32-bit indices, up to 8 rows per instruction
+    If more rows are needed, multiple TDM instructions will be issued.
+
+    The src_col_offset specifies the starting column in the source tensor for
+    all gathered rows.
+  }];
+
+  // Operands:
+  //   desc            - source tensor descriptor (global memory read).
+  //   src_row_indices - tensor of i16 or i32 row indices; rank is checked in
+  //                     the C++ verifier, not by this type constraint.
+  //   src_col_offset  - i32 starting column shared by all gathered rows.
+  //   dst             - destination memory descriptor (shared memory write).
+  //   barrier         - optional memory descriptor for the barrier.
+  let arguments = (ins
+    Arg<TT_TensorDescType, "", [MemRead<GlobalMemory>]>:$desc,
+    TensorOf<[I16, I32]>:$src_row_indices,
+    I32:$src_col_offset,
+    Arg<TTG_MemDescType, "", [MemWrite<SharedMemory>]>:$dst,
+    Optional<TTG_MemDescType>:$barrier
+  );
+
+  // The `, barrier = ...` clause and its type are printed only when the
+  // optional barrier operand is present (anchored with `^`). The trailing
+  // `-> type($desc)` spells the descriptor type; it is not a result type.
+  let assemblyFormat = [{
+    $desc `[` $src_row_indices `,` $src_col_offset `]` `to` $dst (`,` `barrier` `=` $barrier^)?
+    attr-dict `:` qualified(type($src_row_indices)) `,` qualified(type($dst)) (`,` qualified(type($barrier))^)? `->` qualified(type($desc))
+  }];
+
+  // Extra checks live in AsyncTDMGatherOp::verify() (Dialect.cpp).
+  let hasVerifier = 1;
+}
+
 //===----------------------------------------------------------------------===//
 // AsyncTDMWait
 //===----------------------------------------------------------------------===//
diff --git a/third_party/amd/lib/Dialect/TritonAMDGPU/IR/Dialect.cpp b/third_party/amd/lib/Dialect/TritonAMDGPU/IR/Dialect.cpp
@@ -812,6 +812,49 @@ LogicalResult AsyncTDMScatterOp::verify() {
   return success();
 }
 
+// Verifies the structural constraints of async_tdm_gather, in this order:
+//   1. the descriptor's block shape is 2D;
+//   2. the block shape passes verifyTDMBlockSize;
+//   3. src_row_indices is a 1D tensor whose length is a power of 2;
+//   4. dst uses a swizzled shared encoding with max phase 1 (i.e. padded
+//      encodings, swizzling with another max phase, and any other encoding
+//      are rejected).
+// Returns success() only when all checks pass; otherwise emits an op error
+// describing the first violated constraint.
+LogicalResult AsyncTDMGatherOp::verify() {
+  auto tensorDescTy = getDesc().getType();
+  auto smemTy = getDst().getType();
+
+  // TDM gather mode only supports 2D tensors
+  auto blockShape = tensorDescTy.getBlockType().getShape();
+  if (blockShape.size() != 2)
+    return emitOpError("TDM gather only supports 2D tensors, got ")
+           << blockShape.size() << "D";
+
+  // Check that every dimension of the block shape is <= 2^16
+  auto verifyResult = verifyTDMBlockSize(getOperation(), blockShape);
+  if (failed(verifyResult))
+    return verifyResult;
+
+  auto srcRowIndicesType = cast<RankedTensorType>(getSrcRowIndices().getType());
+  if (srcRowIndicesType.getRank() != 1)
+    return emitOpError("src_row_indices must be a 1D tensor");
+
+  // Element type (i16 or i32) is already verified by ODS constraint
+  // TensorOf<[I16, I32]>
+
+  // Only the length of the index vector is constrained here; it is not
+  // compared against the shape of dst or of the descriptor block.
+  int64_t numIndices = srcRowIndicesType.getShape()[0];
+  if (!llvm::isPowerOf2_64(numIndices))
+    return emitOpError("src_row_indices size must be a power of 2, got ")
+           << numIndices;
+
+  // A swizzled encoding is accepted only when its max phase is 1.
+  auto swizzledEnc =
+      llvm::dyn_cast<gpu::SwizzledSharedEncodingAttr>(smemTy.getEncoding());
+  if (swizzledEnc && swizzledEnc.getMaxPhase() != 1)
+    return emitOpError("TDM does not support swizzling");
+
+  // Padded encodings are rejected outright for gather.
+  auto paddedEnc =
+      llvm::dyn_cast<gpu::PaddedSharedEncodingAttr>(smemTy.getEncoding());
+  if (paddedEnc)
+    return emitOpError("TDM gather does not support padding");
+
+  // Any encoding that is neither swizzled nor padded is rejected.
+  // NOTE(review): paddedEnc is always null here because of the early return
+  // above, so this condition is equivalent to `!swizzledEnc`.
+  if (!paddedEnc && !swizzledEnc)
+    return emitOpError("Invalid shared memory layout for TDM");
+
+  return success();
+}
+
 // -- InitBarrierOp --
 LogicalResult InitBarrierOp::verify() {
   if (failed(verifyBarrierType(*this, getAlloc().getType())))
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/LoadStoreOpToLLVM.cpp
@@ -1384,10 +1384,89 @@ struct AsyncTDMScatterOpConversion
 
     // Predicate must be i32 (not i1) to match other elements in group0
     Value pred = arith::ConstantIntOp::create(rewriter, loc, 1, 32);
-    mlir::LLVM::AMD::emitTDMScatter(rewriter, loc, getTypeConverter(), desc,
-                                    shapePerCTA, srcPtr, pred, elementType,
-                                    barrierPtr, cgaLayout, ctaId, dstRowIndices,
-                                    dstColOffset, use32BitIndices);
+    mlir::LLVM::AMD::emitTDMGatherScatter(
+        rewriter, loc, getTypeConverter(), desc, shapePerCTA, srcPtr, pred,
+        elementType, barrierPtr, cgaLayout, ctaId, dstRowIndices, dstColOffset,
+        use32BitIndices, /*isGather=*/false);
+
+    rewriter.eraseOp(op);
+    return success();
+  }
+};
+
+struct AsyncTDMGatherOpConversion
+    : public ConvertOpToLLVMPattern<triton::amdgpu::AsyncTDMGatherOp>,
+      public LoadStoreConversionBase {
+  AsyncTDMGatherOpConversion(LLVMTypeConverter &converter,
+                             const AMD::TargetInfo &targetInfo,
+                             ModuleAxisInfoAnalysis &axisAnalysisPass,
+                             PatternBenefit benefit)
+      : ConvertOpToLLVMPattern(converter, benefit),
+        LoadStoreConversionBase(targetInfo, axisAnalysisPass) {}
+
+  LogicalResult
+  matchAndRewrite(triton::amdgpu::AsyncTDMGatherOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto loc = op.getLoc();
+    auto b = TritonLLVMOpBuilder(loc, rewriter);
+
+    auto tensorDescTy = op.getDesc().getType();
+    auto smemTy = op.getDst().getType();
+    Type elementType = getTypeConverter()->convertType(smemTy.getElementType());
+
+    SmallVector<Value> desc =
+        unpackLLElements(loc, adaptor.getDesc(), rewriter);
+
+    SmallVector<int64_t> blockShape =
+        llvm::to_vector(tensorDescTy.getBlockType().getShape());
+
+    // Gather only supports 2D tensors
+    assert(blockShape.size() == 2 &&
+           "TDM gather mode only supports 2D tensors");
+
+    auto dstMemObj = LLVM::getSharedMemoryObjectFromStruct(
+        loc, adaptor.getDst(), elementType, rewriter);
+    Value dstPtr = dstMemObj.getBase();
+    int numWarps = triton::gpu::lookupNumWarps(op);
+
+    Value barrierPtr = nullptr;
+    if (op.getBarrier()) {
+      auto smemObj = LLVM::getSharedMemoryObjectFromStruct(
+          loc, adaptor.getBarrier(),
+          typeConverter->convertType(
+              op.getBarrier().getType().getElementType()),
+          rewriter);
+      barrierPtr = smemObj.getBase();
+    }
+
+    // Get the source row indices for gather
+    SmallVector<Value> srcRowIndices =
+        unpackLLElements(loc, adaptor.getSrcRowIndices(), rewriter);
+
+    auto shapePerCTA = triton::gpu::getShapePerCTA(smemTy);
+
+    // Get the source column offset
+    Value srcColOffset = adaptor.getSrcColOffset();
+
+    // Determine index size from the element type of src_row_indices
+    auto srcRowIndicesType =
+        cast<RankedTensorType>(op.getSrcRowIndices().getType());
+    bool use32BitIndices =
+        srcRowIndicesType.getElementType().getIntOrFloatBitWidth() == 32;
+
+    // Create the CGA layout
+    auto sharedLayout = triton::gpu::toLinearLayout(smemTy);
+    auto kBlock = rewriter.getStringAttr("block");
+    auto cgaLayout = sharedLayout.sublayout(
+        {kBlock}, to_vector(sharedLayout.getOutDimNames()));
+    auto ctaId = targetInfo.getClusterCTAId(rewriter, loc);
+
+    // Predicate must be i32 (not i1) to match other elements in group0
+    Value pred = arith::ConstantIntOp::create(rewriter, loc, 1, 32);
+    mlir::LLVM::AMD::emitTDMGatherScatter(
+        rewriter, loc, getTypeConverter(), desc, shapePerCTA, dstPtr, pred,
+        elementType, barrierPtr, cgaLayout, ctaId, srcRowIndices, srcColOffset,
+        use32BitIndices, /*isGather=*/true);
 
     rewriter.eraseOp(op);
     return success();
@@ -2320,13 +2399,14 @@ void populateLoadStoreOpToLLVMPatterns(LLVMTypeConverter &typeConverter,
                                        RewritePatternSet &patterns,
                                        ModuleAxisInfoAnalysis &axisInfoAnalysis,
                                        PatternBenefit benefit) {
-  patterns.add<
-      AtomicCASOpConversion, AtomicRMWOpConversion, LoadOpConversion,
-      StoreOpConversion, BufferLoadOpConversion, BufferLoadToLocalOpConversion,
-      BufferStoreOpConversion, BufferAtomicRMWOpConversion,
-      AsyncCopyGlobalToLocalOpConversion, AsyncCopyLocalToGlobalOpConversion,
-      BufferAtomicCASOpConversion, AsyncTDMCopyGlobalToLocalOpConversion,
-      AsyncTDMCopyLocalToGlobalOpConversion, AsyncTDMScatterOpConversion>(
+  patterns.add<AtomicCASOpConversion, AtomicRMWOpConversion, LoadOpConversion,
+               StoreOpConversion, BufferLoadOpConversion,
+               BufferLoadToLocalOpConversion, BufferStoreOpConversion,
+               BufferAtomicRMWOpConversion, AsyncCopyGlobalToLocalOpConversion,
+               AsyncCopyLocalToGlobalOpConversion, BufferAtomicCASOpConversion,
+               AsyncTDMCopyGlobalToLocalOpConversion,
+               AsyncTDMCopyLocalToGlobalOpConversion,
+               AsyncTDMScatterOpConversion, AsyncTDMGatherOpConversion>(
       typeConverter, targetInfo, axisInfoAnalysis, benefit);
   patterns.add<AsyncWaitOpConversion>(typeConverter, targetInfo, benefit);
   patterns.add<TDMPrefetchConversion>(typeConverter, targetInfo, benefit);
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/TDMUtility.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/TDMUtility.cpp
@@ -605,9 +605,10 @@ void fillTDMDescriptor(
   }
 }
 
-// Fill TDM descriptor for scatter operation (2D only).
-// Scatter writes data from LDS to non-contiguous rows in global memory.
-void fillTDMDescriptorForScatter(
+// Fill TDM descriptor for gather/scatter operations (2D only).
+// Gather reads from non-contiguous rows in global memory to LDS.
+// Scatter writes from LDS to non-contiguous rows in global memory.
+void fillTDMDescriptorForGatherScatter(
     RewriterBase &rewriter, Location loc,
     const LLVMTypeConverter *typeConverter, Type elementType,
     SmallVector<int64_t> blockShape, SmallVector<Value> &group0,
@@ -616,7 +617,7 @@ void fillTDMDescriptorForScatter(
     Value ldsPtr, Value pred, Value barrierPtr,
     const triton::LinearLayout &cgaLayout, Value ctaId,
     ArrayRef<Value> rowIndices, bool use32BitIndices) {
-  assert(!rowIndices.empty() && "Scatter requires row indices.");
+  assert(!rowIndices.empty() && "Gather/scatter requires row indices.");
 
   auto ctx = rewriter.getContext();
   auto b = TritonLLVMOpBuilder(loc, rewriter);
@@ -649,17 +650,17 @@ void fillTDMDescriptorForScatter(
   Value ldsOffset = b.mul(ldsRowOffset, b.i32_val(blockShape[1]));
   ldsPtr = b.gep(sharedPtrTy, elementType, ldsPtr, ldsOffset);
 
-  // Update group0 with addresses and enable scatter
+  // Update group0 with addresses and enable gather/scatter mode
   Value globalAddr = b.ptrtoint(i64_ty, globalPtr);
   Value ldsAddr = b.ptrtoint(i32_ty, ldsPtr);
 
-  // Set scatter bits: bit 31 = enable, bit 30 = 32-bit indices
-  Value predWithScatter = b.or_(pred, b.i32_val(1 << 31));
+  // Set gather/scatter bits: bit 31 = enable, bit 30 = 32-bit indices
+  Value predWithGatherScatter = b.or_(pred, b.i32_val(1 << 31));
   if (use32BitIndices) {
-    predWithScatter = b.or_(predWithScatter, b.i32_val(1 << 30));
+    predWithGatherScatter = b.or_(predWithGatherScatter, b.i32_val(1 << 30));
   }
 
-  group0[0] = predWithScatter;
+  group0[0] = predWithGatherScatter;
   group0[1] = ldsAddr;
   group0[2] = b.trunc(i32_ty, globalAddr);
 
@@ -784,28 +785,29 @@ void emitTDMLoadStore(RewriterBase &rewriter, Location loc,
   }
 }
 
-// Emit a TDM scatter operation to write non-contiguous rows from LDS to global.
-void emitTDMScatter(RewriterBase &rewriter, Location loc,
-                    const LLVMTypeConverter *typeConverter,
-                    ArrayRef<Value> desc, ArrayRef<int64_t> blockShape,
-                    Value srcPtr, Value pred, Type elementType,
-                    Value barrierPtr, const triton::LinearLayout &cgaLayout,
-                    Value ctaId, ArrayRef<Value> rowIndices, Value colOffset,
-                    bool use32BitIndices) {
+// Emit a TDM gather or scatter operation for non-contiguous row access.
+void emitTDMGatherScatter(RewriterBase &rewriter, Location loc,
+                          const LLVMTypeConverter *typeConverter,
+                          ArrayRef<Value> desc, ArrayRef<int64_t> blockShape,
+                          Value ldsPtr, Value pred, Type elementType,
+                          Value barrierPtr,
+                          const triton::LinearLayout &cgaLayout, Value ctaId,
+                          ArrayRef<Value> rowIndices, Value colOffset,
+                          bool use32BitIndices, bool isGather) {
   auto b = TritonLLVMOpBuilder(loc, rewriter);
 
-  assert(!rowIndices.empty() && "Scatter requires row indices");
-  assert(colOffset && "Scatter requires column offset");
+  assert(!rowIndices.empty() && "Gather/scatter requires row indices");
+  assert(colOffset && "Gather/scatter requires column offset");
 
   // Determine max indices per instruction based on index size
   size_t maxIndicesPerInstr = use32BitIndices ? 8 : 16;
   size_t numIndices = rowIndices.size();
 
-  // Get the descriptor groups (scatter uses 2D format: 12 dwords)
+  // Get the descriptor groups (gather/scatter uses 2D format: 12 dwords)
   auto group0Vec = SmallVector<Value>(desc.begin(), desc.begin() + 4);
   auto group1Vec = SmallVector<Value>(desc.begin() + 4, desc.end());
 
-  // For TDM scatter, we need group2 and group3 for indices
+  // For TDM gather/scatter, we need group2 and group3 for indices
   SmallVector<Value> group2Vec(4, b.i32_val(0));
   SmallVector<Value> group3Vec(4, b.i32_val(0));
 
@@ -824,12 +826,12 @@ void emitTDMScatter(RewriterBase &rewriter, Location loc,
     auto g2 = group2Vec;
     auto g3 = group3Vec;
 
-    // Fill the descriptor for scatter:
+    // Fill the descriptor for gather/scatter:
     // - ldsRowOffset: row offset within shared memory for this batch
     // - colOffset: starting column in global memory
-    fillTDMDescriptorForScatter(
+    fillTDMDescriptorForGatherScatter(
         rewriter, loc, typeConverter, elementType, to_vector(blockShape), g0,
-        g1, g2, g3, b.i32_val(startIdx), colOffset, srcPtr, pred, barrierPtr,
+        g1, g2, g3, b.i32_val(startIdx), colOffset, ldsPtr, pred, barrierPtr,
         cgaLayout, ctaId, batchIndices, use32BitIndices);
 
     // Pack and emit the instruction
@@ -838,10 +840,13 @@ void emitTDMScatter(RewriterBase &rewriter, Location loc,
     auto group2 = packLLVector(loc, g2, rewriter);
     auto group3 = packLLVector(loc, g3, rewriter);
 
-    // Scatter uses tensor.store.from.lds (not the d2 variant) because it
-    // needs group2/group3 for indices
+    // Gather/scatter uses full 4-group format (not the d2 variant) for indices
+    // Gather: tensor.load.to.lds (global -> LDS)
+    // Scatter: tensor.store.from.lds (LDS -> global)
+    const char *intrinsicName = isGather ? "llvm.amdgcn.tensor.load.to.lds"
+                                         : "llvm.amdgcn.tensor.store.from.lds";
     LLVM::createLLVMIntrinsicCallOp(
-        rewriter, loc, "llvm.amdgcn.tensor.store.from.lds", {},
+        rewriter, loc, intrinsicName, {},
         {group0, group1, group2, group3, b.i32_val(0)});
   }
 }
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/TDMUtility.h b/third_party/amd/lib/TritonAMDGPUToLLVM/TDMUtility.h
diff --git a/third_party/amd/python/test/test_gluon_gfx1250.py b/third_party/amd/python/test/test_gluon_gfx1250.py