Skip to content

Commit f0b7641

Browse files
authored
[BACKEND] Consider TMA variants in Triton passes (#10014)
Replacing a TMA load with a gather, or a TMA store with a scatter, should not change the behaviour of our passes in most cases.
1 parent 035bcb5 commit f0b7641

26 files changed

Lines changed: 156 additions & 139 deletions

include/triton/Dialect/Triton/IR/TritonOpInterfaces.td

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -114,6 +114,14 @@ def TT_DescriptorStoreLikeOpInterface : OpInterface<"DescriptorStoreLikeOpInterf
114114
];
115115
}
116116

117+
def TT_DescriptorLoadLikeOpInterface : OpInterface<"DescriptorLoadLikeOpInterface", [TT_DescriptorOpInterface]> {
118+
let description = [{
119+
Common marker interface for operations that load from tensor descriptors.
120+
}];
121+
122+
let cppNamespace = "::mlir::triton";
123+
}
124+
117125
def PredicatedOpInterface : OpInterface<"PredicatedOpInterface"> {
118126
let description = [{
119127
Common interface for operations that carry a predicate or mask operand that

include/triton/Dialect/Triton/IR/TritonOps.td

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1222,7 +1222,7 @@ def ReturnOp : TT_Op<"return", [Pure, HasParent<"FuncOp">, /*MemRefsNormalizable
12221222
}
12231223

12241224

1225-
def TT_DescriptorLoadOp : TT_Op<"descriptor_load", [TT_DescriptorOpInterface]> {
1225+
def TT_DescriptorLoadOp : TT_Op<"descriptor_load", [TT_DescriptorLoadLikeOpInterface]> {
12261226
let summary = "Load from descriptor";
12271227
let description = [{
12281228
This operation will be lowered to Nvidia TMA load operation on targets supporting it.
@@ -1291,7 +1291,7 @@ def TT_DescriptorReduceOp : TT_Op<"descriptor_reduce", [TT_DescriptorStoreLikeOp
12911291
let hasVerifier = 1;
12921292
}
12931293

1294-
def TT_DescriptorGatherOp : TT_Op<"descriptor_gather", [TT_DescriptorOpInterface]> {
1294+
def TT_DescriptorGatherOp : TT_Op<"descriptor_gather", [TT_DescriptorLoadLikeOpInterface]> {
12951295
let summary = "gather multiple rows from a descriptor into a single tensor";
12961296
let description = [{
12971297
The `tt.descriptor_gather` op will be lowered to NVIDIA TMA

include/triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#define TRITON_TRITONGPU_TRANSFORMS_PIPELINER_PIPELINING_UTILITY_H_
33

44
#include "mlir/Dialect/SCF/IR/SCF.h"
5+
#include "triton/Dialect/Triton/IR/Dialect.h"
56
#include "triton/Dialect/TritonGPU/IR/Dialect.h"
67
#include <optional>
78
#include <utility>
@@ -123,7 +124,9 @@ Value createAlloc(Operation *insertBefore, RankedTensorType ty, Location loc,
123124
gpu::SharedEncodingTrait sharedEnc, unsigned distance);
124125

125126
// Determine if the operation is a TMA load.
126-
bool isTMALoad(Operation *op);
127+
inline bool isTMALoad(Operation *op) {
128+
return isa<DescriptorLoadLikeOpInterface>(op);
129+
}
127130

128131
// Determine if the operation can be lowered to an async load.
129132
bool canBeAsyncLoad(Operation *op);

include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOpInterfaces.td

Lines changed: 63 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,69 @@
33

44
include "mlir/IR/OpBase.td"
55

6+
def TMAOpInterface : OpInterface<"TMAOpInterface"> {
7+
let description = [{
8+
Common interface for asynchronous TMA operations.
9+
}];
10+
11+
let cppNamespace = "::mlir::triton::nvidia_gpu";
12+
13+
let methods = [
14+
InterfaceMethod<
15+
/*desc=*/"Get the tensor descriptor",
16+
/*retType=*/"::mlir::Value",
17+
/*methodName=*/"getDesc",
18+
/*args=*/(ins)>,
19+
];
20+
}
21+
22+
def TMALoadLikeOpInterface : OpInterface<"TMALoadLikeOpInterface", [TMAOpInterface]> {
23+
let description = [{
24+
Common interface for asynchronous TMA operations that write shared memory.
25+
}];
26+
27+
let cppNamespace = "::mlir::triton::nvidia_gpu";
28+
29+
let methods = [
30+
InterfaceMethod<
31+
/*desc=*/"Get the destination memory descriptor",
32+
/*retType=*/"::mlir::Value",
33+
/*methodName=*/"getResult",
34+
/*args=*/(ins)>,
35+
InterfaceMethod<
36+
/*desc=*/"Get the completion barrier",
37+
/*retType=*/"::mlir::Value",
38+
/*methodName=*/"getBarrier",
39+
/*args=*/(ins)>,
40+
InterfaceMethod<
41+
/*desc=*/"Get the predicate",
42+
/*retType=*/"::mlir::Value",
43+
/*methodName=*/"getPred",
44+
/*args=*/(ins)>,
45+
];
46+
}
47+
48+
def TMAStoreLikeOpInterface : OpInterface<"TMAStoreLikeOpInterface", [TMAOpInterface]> {
49+
let description = [{
50+
Common interface for asynchronous TMA operations that read shared memory.
51+
}];
52+
53+
let cppNamespace = "::mlir::triton::nvidia_gpu";
54+
55+
let methods = [
56+
InterfaceMethod<
57+
/*desc=*/"Get the source memory descriptor",
58+
/*retType=*/"::mlir::Value",
59+
/*methodName=*/"getSrc",
60+
/*args=*/(ins)>,
61+
InterfaceMethod<
62+
/*desc=*/"Get mutable source memory descriptor",
63+
/*retType=*/"::mlir::OpOperand&",
64+
/*methodName=*/"getSrcMutable",
65+
/*args=*/(ins)>,
66+
];
67+
}
68+
669
def MMAv5OpInterface : OpInterface<"MMAv5OpInterface"> {
770
let description = [{
871
This interface is implemented by MMAv5 dot and dot scaled ops.

include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td

Lines changed: 9 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -411,7 +411,7 @@ def TTNG_AsyncCopyMbarrierArriveOp : TTNG_Op<"async_copy_mbarrier_arrive", [
411411

412412
def TTNG_AsyncTMACopyGlobalToLocalOp : TTNG_Op<"async_tma_copy_global_to_local", [
413413
AttrSizedOperandSegments, DeclareOpInterfaceMethods<MBarrierOpInterface>,
414-
DeclareOpInterfaceMethods<PredicatedOpInterface>]> {
414+
DeclareOpInterfaceMethods<PredicatedOpInterface>, TMALoadLikeOpInterface]> {
415415
let summary = "copy data based on descriptor from global memory to local memory asynchronously";
416416

417417
let description = [{
@@ -474,7 +474,8 @@ def TTNG_AsyncTMACopyGlobalToLocalOp : TTNG_Op<"async_tma_copy_global_to_local",
474474

475475
}
476476

477-
def TTNG_AsyncTMACopyLocalToGlobalOp : TTNG_Op<"async_tma_copy_local_to_global"> {
477+
def TTNG_AsyncTMACopyLocalToGlobalOp : TTNG_Op<"async_tma_copy_local_to_global", [
478+
TMAStoreLikeOpInterface]> {
478479
let summary = "copy data based on descriptor from local memory to global memory asynchronously";
479480

480481
let description = [{
@@ -498,7 +499,9 @@ def TTNG_AsyncTMACopyLocalToGlobalOp : TTNG_Op<"async_tma_copy_local_to_global">
498499
let hasVerifier = 1;
499500
}
500501

501-
def TTNG_AsyncTMAReduceOp : TTNG_Op<"async_tma_reduce", [MemoryEffects<[MemRead<GlobalMemory>, MemWrite<GlobalMemory>]>]> {
502+
def TTNG_AsyncTMAReduceOp : TTNG_Op<"async_tma_reduce", [
503+
MemoryEffects<[MemRead<GlobalMemory>, MemWrite<GlobalMemory>]>,
504+
TMAStoreLikeOpInterface]> {
502505
let summary = "reduce result in gmem based on a TMA descriptor";
503506

504507
let description = [{
@@ -524,7 +527,7 @@ def TTNG_AsyncTMAReduceOp : TTNG_Op<"async_tma_reduce", [MemoryEffects<[MemRead<
524527

525528
def TTNG_AsyncTMAGatherOp : TTNG_Op<"async_tma_gather", [
526529
DeclareOpInterfaceMethods<MBarrierOpInterface>,
527-
DeclareOpInterfaceMethods<PredicatedOpInterface>]> {
530+
DeclareOpInterfaceMethods<PredicatedOpInterface>, TMALoadLikeOpInterface]> {
528531
let summary = "gather data based on descriptor from global memory to local memory asynchronously";
529532

530533
let description = [{
@@ -550,7 +553,8 @@ def TTNG_AsyncTMAGatherOp : TTNG_Op<"async_tma_gather", [
550553
let hasVerifier = 1;
551554
}
552555

553-
def TTNG_AsyncTMAScatterOp : TTNG_Op<"async_tma_scatter"> {
556+
def TTNG_AsyncTMAScatterOp : TTNG_Op<"async_tma_scatter", [
557+
TMAStoreLikeOpInterface]> {
554558
let summary = "scatter data from local memory into global memory based on a descriptor asynchronously";
555559

556560
let description = [{

lib/Analysis/BufferRegion.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -312,7 +312,7 @@ void BufferRegionAnalysis::calculateUsedBufferRegions(Operation *op) {
312312
bool BufferRegionAnalysis::isMemoryAccessOperation(Operation *op) {
313313
if (isa<ttg::LocalLoadOp, ttg::LocalStoreOp, ttng::TMEMLoadOp,
314314
ttng::TMEMStoreOp, ttng::TMEMCopyOp, ttg::AsyncCopyGlobalToLocalOp,
315-
ttng::AsyncTMACopyLocalToGlobalOp, ttng::AsyncTMAScatterOp>(op)) {
315+
ttng::TMAOpInterface>(op)) {
316316
return true;
317317
}
318318
if (isa<ttg::MBarrierOpInterface>(op)) {

lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -264,8 +264,8 @@ getWarpsPerTile(DotOpInterface dotOp, const ArrayRef<int64_t> shape,
264264
static bool bwdFilter(Operation *op) {
265265
return (op->hasTrait<OpTrait::Elementwise>() && isMemoryEffectFree(op)) ||
266266
isView(op) ||
267-
isa<Fp4ToFpOp, LoadOp, DescriptorLoadOp, BroadcastOp, ConvertLayoutOp>(
268-
op);
267+
isa<Fp4ToFpOp, LoadOp, DescriptorLoadLikeOpInterface, BroadcastOp,
268+
ConvertLayoutOp>(op);
269269
}
270270

271271
// Finds the bitwidth with which the value x is loaded
@@ -284,7 +284,7 @@ static int computeOrigBitWidth(Value x) {
284284

285285
int origBitWidth = getElementTypeOrSelf(x).getIntOrFloatBitWidth();
286286
for (auto op : slice) {
287-
if (isa<LoadOp, DescriptorLoadOp>(op)) {
287+
if (isa<LoadOp, DescriptorLoadLikeOpInterface>(op)) {
288288
if (auto tensorTy =
289289
dyn_cast<RankedTensorType>(op->getResultTypes().front())) {
290290
origBitWidth =
@@ -473,8 +473,9 @@ static bool canUseTwoCTAs(triton::DotOp dotOp) {
473473
// Skip convert layouts.
474474
while (auto cvtOp = b.getDefiningOp<ConvertLayoutOp>())
475475
b = cvtOp.getSrc();
476-
return llvm::isa_and_nonnull<triton::LoadOp, triton::DescriptorLoadOp,
477-
triton::DescriptorGatherOp>(b.getDefiningOp());
476+
return llvm::isa_and_nonnull<triton::LoadOp,
477+
triton::DescriptorLoadLikeOpInterface>(
478+
b.getDefiningOp());
478479
}
479480

480481
static DistributedEncodingTrait
@@ -501,8 +502,7 @@ static Value splitBOperand(Value b, mlir::PatternRewriter &rewriter) {
501502
while (auto cvtOp = b.getDefiningOp<ConvertLayoutOp>())
502503
b = cvtOp.getSrc();
503504
auto loadOp = b.getDefiningOp();
504-
assert((isa<triton::LoadOp, triton::DescriptorLoadOp,
505-
triton::DescriptorGatherOp>(loadOp)) &&
505+
assert((isa<triton::LoadOp, triton::DescriptorLoadLikeOpInterface>(loadOp)) &&
506506
"expected LoadOp");
507507
RankedTensorType bType = cast<RankedTensorType>(b.getType());
508508
auto currentLayout = cast<DistributedEncodingTrait>(bType.getEncoding());
@@ -627,7 +627,7 @@ Value addSmemStageToScaleLoad(Value scale, mlir::PatternRewriter &rewriter) {
627627
if (!op)
628628
return scale;
629629

630-
while (!isa<LoadOp, DescriptorLoadOp>(op)) {
630+
while (!isa<LoadOp, DescriptorLoadLikeOpInterface>(op)) {
631631
if (auto reshape = dyn_cast<ReshapeOp>(op)) {
632632
op = reshape.getSrc().getDefiningOp();
633633
loadConsumer = reshape;

lib/Dialect/TritonGPU/Transforms/DescriptorMemoryLayouts.cpp

Lines changed: 5 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -276,25 +276,15 @@ std::optional<UseInfo>
276276
AssignDescriptorMemoryLayouts::getUseInfo(Operation *op) {
277277
UseInfo info;
278278
info.use = op;
279-
if (auto load = dyn_cast<DescriptorLoadOp>(op)) {
279+
if (auto load = dyn_cast<DescriptorLoadLikeOpInterface>(op)) {
280280
info.descriptor = load.getDesc();
281281
info.desiredSharedEncoding = findLoadEncodingFromUsers(op);
282+
auto resultTy = cast<RankedTensorType>(op->getResult(0).getType());
282283
auto encoding = info.desiredSharedEncoding ? info.desiredSharedEncoding
283-
: load.getType().getEncoding();
284+
: resultTy.getEncoding();
284285
info.cgaLayout = getCGALayout(encoding);
285-
auto shape = load.getResult().getType().getShape();
286-
auto rank = load.getDesc().getType().getShape().size();
287-
info.shape = expandToRank(shape, rank);
288-
return info;
289-
}
290-
if (auto gather = dyn_cast<DescriptorGatherOp>(op)) {
291-
info.descriptor = gather.getDesc();
292-
info.desiredSharedEncoding = findLoadEncodingFromUsers(op);
293-
auto encoding = info.desiredSharedEncoding ? info.desiredSharedEncoding
294-
: gather.getType().getEncoding();
295-
info.cgaLayout = getCGALayout(encoding);
296-
auto shape = gather.getResult().getType().getShape();
297-
auto rank = gather.getDesc().getType().getShape().size();
286+
auto shape = resultTy.getShape();
287+
auto rank = info.descriptor.getType().getShape().size();
298288
info.shape = expandToRank(shape, rank);
299289
return info;
300290
}

lib/Dialect/TritonGPU/Transforms/OptimizeDotOperands.cpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -255,8 +255,8 @@ class UseShmemForScales
255255
}
256256
auto localAlloc = getNextOp<LocalAllocOp>(localLoad.getSrc());
257257
bool usesTMAload =
258-
(localAlloc && localAlloc.getSrc() &&
259-
(getNextOp<DescriptorLoadOp>(localAlloc.getSrc()) != nullptr));
258+
localAlloc && localAlloc.getSrc() &&
259+
getNextOp<DescriptorLoadLikeOpInterface>(localAlloc.getSrc());
260260
if (!isTmemCopyCompatible(localLoad.getSrc().getType(), usesTMAload))
261261
return failure();
262262

lib/Dialect/TritonGPU/Transforms/Pipeliner/AssignLatencies.cpp

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -107,7 +107,7 @@ class AssignLoadLatencies {
107107
return false;
108108
}
109109
}
110-
if (isa<tt::DescriptorLoadOp, tt::DescriptorGatherOp>(op))
110+
if (isa<tt::DescriptorLoadLikeOpInterface>(op))
111111
return true;
112112
if (!canHaveSharedEncoding(cast<tt::LoadOp>(op))) {
113113
LDBG("Load " << *op << " cannot have shared encoding");
@@ -291,7 +291,7 @@ loadOpsToIndirectionLevel(scf::ForOp forOp, bool pipelineWithoutDot,
291291
[&](Operation *op, Operation *finalUser, int distance) {
292292
if (!seen.insert(op).second || excluded.count(op))
293293
return;
294-
if (isa<tt::LoadOp, tt::DescriptorLoadOp, tt::DescriptorGatherOp>(op)) {
294+
if (isa<tt::LoadOp, tt::DescriptorLoadLikeOpInterface>(op)) {
295295
if (!AssignLoadLatencies::isPipeliningBeneficial(
296296
op, finalUser, axisInfoAnalysis, filterSmall))
297297
return;
@@ -342,7 +342,7 @@ loadOpsToIndirectionLevel(scf::ForOp forOp, bool pipelineWithoutDot,
342342
// that are not directly used by dot ops.
343343
if (pipelineWithoutDot) {
344344
for (Operation &op : forOp.getBody()->without_terminator()) {
345-
if (!isa<tt::LoadOp, tt::DescriptorLoadOp, tt::DescriptorGatherOp>(op))
345+
if (!isa<tt::LoadOp, tt::DescriptorLoadLikeOpInterface>(op))
346346
dfs(&op, &op, 0);
347347
}
348348
}

0 commit comments

Comments (0)