Skip to content

Commit c155e4a

Browse files
authored
[BACKEND] Support generic multi-cta convert_layouts (#9317)
Stacked PRs: * #9327 * #9318 * __->__#9317 --- --- --- ### [BACKEND] Support generic multi-cta convert_layouts We generalise the swizzling algorithm to work with blocks and generalise most of the memory lowerings to support layouts with blocks. We remove the legacy lowering. The generic swizzling algorithm for blocks might be fine, but we didn't try to be super clever. There might be some perf left on the table. We can look into this at a later point if it becomes relevant. We also activate multi-cta reductions in the process and test both there. TODO: Add some funky tests that just test the `convert_layout`, not the `convert_layout` within the reduction. TODO: Check how to perform multiCTA barriers in AMD and perhaps merge cluster barriers into ttg.barrier, predicate broadcasting blocks, etc.
1 parent 2a41426 commit c155e4a

29 files changed

Lines changed: 426 additions & 439 deletions

File tree

include/triton/Analysis/Utility.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -419,8 +419,8 @@ template <typename T> class CallGraph {
419419
// Create a basic DataFlowSolver with constant and dead code analysis included.
420420
std::unique_ptr<DataFlowSolver> createDataFlowSolver();
421421

422-
bool isCvtWarpSync(const triton::LinearLayout &srcLayout,
423-
const triton::LinearLayout &dstLayout);
422+
bool isCvtDimSync(const triton::LinearLayout &srcLayout,
423+
const triton::LinearLayout &dstLayout, StringAttr dim);
424424

425425
} // namespace mlir
426426

include/triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ class TargetInfoBase {
2020
// target address space
2121
virtual void barrier(Location loc, RewriterBase &rewriter,
2222
triton::gpu::AddrSpace targets) const = 0;
23+
// Emit a cluster-level barrier when supported. Defaults to CTA barrier.
24+
virtual void clusterBarrier(Location loc, RewriterBase &rewriter) const = 0;
2325
// Insert a warp syncronization barrier that also guarantees local address
2426
// space visibility at warp level when supported by the backend.
2527
// Backends that do not support warp-level barriers should conservatively

include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 19 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
#include "triton/Tools/StrUtil.h"
1616
#include "llvm/ADT/STLExtras.h"
1717

18+
#include <optional>
19+
1820
#define DEBUG_TYPE "ttgpu_to_llvm"
1921
#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
2022
#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
@@ -331,7 +333,7 @@ namespace triton {
331333
namespace gpu {
332334

333335
std::pair<SmallVector<LocalMemOpTile>, SmallVector<LocalMemOpTile>>
334-
getSrcDstTiles(const TargetInfoBase &targetInfo, int bitwidth);
336+
getSrcDstTiles(const TargetInfoBase &targetInfo, int bitwidth, bool crossCTA);
335337

336338
Type getFunctionType(Type resultType, ValueRange operands);
337339

@@ -567,17 +569,18 @@ lowerLdStShared(Location loc, MLIRContext *ctx, LinearLayout cvt,
567569
// calcPaddedOffset is a lambda that takes a base offset (mlir::Value)
568570
// and computes a new offset (mlir::Value) by applying padding based on
569571
// shared memory layout.
570-
SmallVector<Value> lowerLdSt(
571-
Location loc, MLIRContext *ctx, LinearLayout cvt,
572-
ArrayRef<Value> valsArray, // Input for store, output for load
573-
Type llvmElemTy, Value smemBase,
574-
ArrayRef<std::pair<unsigned, unsigned>> paddingShifts, Value affineOffset,
575-
uint64_t maskSpanAffineOffset, Value laneId, Value warpId,
576-
RewriterBase &rewriter, const TargetInfoBase &targetInfo,
577-
std::optional<int> maybeMaxVecElems,
578-
std::function<SmallVector<Value>(RewriterBase &, Location, ArrayRef<Value>,
579-
Value, int, VectorType)>
580-
lowerInst);
572+
SmallVector<Value>
573+
lowerLdSt(Location loc, MLIRContext *ctx, LinearLayout cvt,
574+
ArrayRef<Value> valsArray, // Input for store, output for load
575+
Type llvmElemTy, Value smemBase,
576+
ArrayRef<std::pair<unsigned, unsigned>> paddingShifts,
577+
Value affineOffset, uint64_t maskSpanAffineOffset, Value laneId,
578+
Value warpId, RewriterBase &rewriter,
579+
const TargetInfoBase &targetInfo, std::optional<int> maybeMaxVecElems,
580+
std::function<SmallVector<Value>(RewriterBase &, Location,
581+
ArrayRef<Value>, Value, int,
582+
VectorType, std::optional<Value>)>
583+
lowerInst);
581584

582585
// Lower local_load/local_store via ld.shared/st.shared
583586
SmallVector<Value>
@@ -619,10 +622,10 @@ void makeAllWarpGroupsIsolatedFromAbove(Operation *op);
619622
// Set the correct loop annotation on LLVM branch ops.
620623
void fixUpLoopAnnotation(ModuleOp mod);
621624

622-
void transferWithinBlockSwizzling(triton::gpu::ConvertLayoutOp op, Value src,
623-
const TargetInfoBase &targetInfo,
624-
const LLVMTypeConverter *typeConverter,
625-
RewriterBase &rewriter);
625+
void transferSwizzlingLocalMem(triton::gpu::ConvertLayoutOp op, Value src,
626+
const TargetInfoBase &targetInfo,
627+
const LLVMTypeConverter *typeConverter,
628+
RewriterBase &rewriter);
626629

627630
SmallVector<Value> inlineRegionImpl(RewriterBase &rewriter, Region &region,
628631
ArrayRef<Value> args,

lib/Analysis/Allocation.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,11 @@ unsigned getNumScratchElemsSwizzledCvt(const LinearLayout &srcLayout,
4949
auto smem = gpu::optimalSwizzlingLdSt(srcLayoutNoBroadcast,
5050
dstLayoutNoBroadcast, bitwidth);
5151
auto reps = smem.getInDimSize(StringAttr::get(ctx, "reps"));
52-
return smem.getTotalOutDimSize() / reps;
52+
// The smem has the same cta layout as the srcLayout, so we use that instead
53+
// We remove the number of elements that are duplicated in the cta layout
54+
auto nBlocks = product(triton::gpu::getCTASplitNum(
55+
gpu::LinearEncodingAttr::get(ctx, srcLayout)));
56+
return smem.getTotalOutDimSize() / (reps * nBlocks);
5357
}
5458

5559
unsigned getNumScratchElemsSwizzledCvt(RankedTensorType srcTy,

lib/Analysis/Membar.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -324,7 +324,8 @@ void MembarAnalysis::update(Operation *op, BlockInfo *blockInfo,
324324
auto dstTy = cast<RankedTensorType>(cvt.getType());
325325
auto srcLayout = triton::gpu::toLinearLayout(srcTy);
326326
auto dstLayout = triton::gpu::toLinearLayout(dstTy);
327-
isWarpSync = mlir::isCvtWarpSync(srcLayout, dstLayout);
327+
auto kWarp = StringAttr::get(op->getContext(), "warp");
328+
isWarpSync = mlir::isCvtDimSync(srcLayout, dstLayout, kWarp);
328329
}
329330

330331
if (!curBlockInfo.syncReadSlices.empty() ||

lib/Analysis/Utility.cpp

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1421,17 +1421,23 @@ std::unique_ptr<DataFlowSolver> createDataFlowSolver() {
14211421
return solver;
14221422
}
14231423

1424-
bool isCvtWarpSync(const triton::LinearLayout &srcLayout,
1425-
const triton::LinearLayout &dstLayout) {
1426-
// We can use warp.sync when the warp dimension in the convert is trival
1427-
// and there is no broadcasting at a warp level (otherwise reads may be
1428-
// wrong)
1424+
bool isCvtDimSync(const triton::LinearLayout &srcLayout,
1425+
const triton::LinearLayout &dstLayout, StringAttr dim) {
1426+
// We can use a dimension-level sync when the conversion is trivial over that
1427+
// dimension and there is no broadcasting over it.
14291428
auto *ctx = srcLayout.getInDimNames().begin()->getContext();
1430-
auto comp = dstLayout.invertAndCompose(srcLayout);
14311429
auto kWarp = StringAttr::get(ctx, "warp");
1432-
return comp.isTrivialOver(kWarp) &&
1433-
srcLayout.getFreeVariableMasks()[kWarp] == 0 &&
1434-
dstLayout.getFreeVariableMasks()[kWarp] == 0;
1430+
auto kBlock = StringAttr::get(ctx, "block");
1431+
assert((dim == kWarp || dim == kBlock) && "expected dim to be warp or block");
1432+
assert(srcLayout.hasInDim(dim) && dstLayout.hasInDim(dim) &&
1433+
"expected dim to be present in both layouts");
1434+
auto parentTrivial = true;
1435+
if (dim == kWarp) {
1436+
parentTrivial = isCvtDimSync(srcLayout, dstLayout, kBlock);
1437+
}
1438+
auto comp = dstLayout.invertAndCompose(srcLayout);
1439+
return parentTrivial && comp.isTrivialOver(dim) &&
1440+
srcLayout.getFreeVariableMasks()[dim] == 0 &&
1441+
dstLayout.getFreeVariableMasks()[dim] == 0;
14351442
}
1436-
14371443
} // namespace mlir

lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp

Lines changed: 50 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
#include "triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h"
55
#include "triton/Conversion/TritonGPUToLLVM/Utility.h"
66

7+
#include <optional>
8+
79
#include "triton/Analysis/Allocation.h"
810
#include "triton/Dialect/Triton/IR/Types.h"
911
#include "triton/Dialect/Triton/IR/Utility.h"
@@ -45,26 +47,20 @@ struct ConvertLayoutOpConversion
4547
LinearLayout srcLayout = toLinearLayout(srcTy);
4648
LinearLayout dstLayout = toLinearLayout(dstTy);
4749

48-
StringAttr kBlock = str_attr("block");
49-
StringAttr kWarp = str_attr("warp");
50-
StringAttr kLane = str_attr("lane");
51-
StringAttr kRegister = str_attr("register");
50+
auto kBlock = str_attr("block");
51+
auto kWarp = str_attr("warp");
52+
auto kLane = str_attr("lane");
53+
auto kRegister = str_attr("register");
5254

5355
auto dims = conversion.getInDimNames();
5456
bool alwaysUseWarpShuffle = cvtAlwaysUseWarpShuffle(op);
55-
assert(!alwaysUseWarpShuffle || (!llvm::is_contained(dims, kBlock) &&
56-
!llvm::is_contained(dims, kWarp)));
5757
assert(to_vector(conversion.getInDimNames()) ==
5858
to_vector(conversion.getOutDimNames()));
59-
if (llvm::is_contained(dims, kBlock)) {
60-
// Case 1: Transfer between values in different CTAs.
61-
// This requires moving values through distributed shared memory.
62-
return rewriter.notifyMatchFailure(
63-
op, "NYI: Transfer between different CTAs");
64-
} else if (llvm::is_contained(dims, kWarp)) {
65-
// Case 2: Transfer between values in the same CTA, in which case we move
66-
// values through shared memory.
67-
transferWithinBlockSwizzling(op, adaptor.getSrc(), rewriter);
59+
if (llvm::is_contained(dims, kBlock) || llvm::is_contained(dims, kWarp)) {
60+
assert(!alwaysUseWarpShuffle);
61+
// Transfer between values in the same CTA, or across CTAs. We move values
62+
// through (distributed) shared memory.
63+
transferSwizzlingLocalMem(op, adaptor.getSrc(), rewriter);
6864
return success();
6965
} else if (llvm::is_contained(dims, kLane)) {
7066
// Case 3. Transfer between values in the same warp, in which case we try
@@ -73,7 +69,7 @@ struct ConvertLayoutOpConversion
7369
if (cvtNeedsWarpShuffle(srcTy, dstTy) || alwaysUseWarpShuffle)
7470
return transferWithinWarp(op, adaptor, rewriter);
7571

76-
transferWithinBlockSwizzling(op, adaptor.getSrc(), rewriter);
72+
transferSwizzlingLocalMem(op, adaptor.getSrc(), rewriter);
7773
return success();
7874
} else if (llvm::is_contained(dims, kRegister)) {
7975
// Case 4. Transfer between values in the same thread, in which case we
@@ -93,7 +89,7 @@ struct ConvertLayoutOpConversion
9389
ConversionPatternRewriter &rewriter) const {
9490
MLIRContext *ctx = op.getContext();
9591
auto loc = op.getLoc();
96-
StringAttr kRegister = str_attr("register");
92+
auto kRegister = str_attr("register");
9793
assert(!cvtNeedsSharedMemory(op.getSrc().getType(), op.getType()));
9894

9995
auto srcTy = op.getSrc().getType();
@@ -110,7 +106,7 @@ struct ConvertLayoutOpConversion
110106
return success();
111107
}
112108

113-
SmallVector<Value> transferWithinBlockSwizzlingImpl(
109+
SmallVector<Value> transferSwizzlingLocalMemImpl(
114110
Location loc, ConversionPatternRewriter &rewriter,
115111
const LinearLayout &srcLayout, const LinearLayout &dstLayout,
116112
ArrayRef<Value> inVals, Type llvmElemTy, Value smemBase) const {
@@ -126,8 +122,8 @@ struct ConvertLayoutOpConversion
126122
return b.ptrtoint(llvmElemTyPtr, v).getResult();
127123
}));
128124
auto outVals =
129-
transferWithinBlockSwizzlingImpl(loc, rewriter, srcLayout, dstLayout,
130-
newInVals, llvmElemTyPtr, smemBase);
125+
transferSwizzlingLocalMemImpl(loc, rewriter, srcLayout, dstLayout,
126+
newInVals, llvmElemTyPtr, smemBase);
131127
for (auto &v : outVals) {
132128
v = b.inttoptr(llvmElemTy, v);
133129
}
@@ -140,7 +136,7 @@ struct ConvertLayoutOpConversion
140136
auto i8ElemTy = i8_ty;
141137
auto newInVals = llvm::to_vector(llvm::map_range(
142138
inVals, [&](Value v) { return b.zext(i8ElemTy, v).getResult(); }));
143-
auto outVals = transferWithinBlockSwizzlingImpl(
139+
auto outVals = transferSwizzlingLocalMemImpl(
144140
loc, rewriter, srcLayout, dstLayout, newInVals, i8ElemTy, smemBase);
145141
for (auto &v : outVals) {
146142
v = b.trunc(llvmElemTy, v);
@@ -153,15 +149,15 @@ struct ConvertLayoutOpConversion
153149
if (!removeBroadcastSrc.isIdentity()) {
154150
auto prmtSrc = removeBroadcastSrc.apply(srcLayout);
155151
auto newInVals = removeBroadcastSrc.apply(inVals);
156-
return transferWithinBlockSwizzlingImpl(loc, rewriter, prmtSrc, dstLayout,
157-
newInVals, llvmElemTy, smemBase);
152+
return transferSwizzlingLocalMemImpl(loc, rewriter, prmtSrc, dstLayout,
153+
newInVals, llvmElemTy, smemBase);
158154
}
159155

160156
// Remove broadcasting in dst
161157
auto removeBroadcastDst = actionRemoveBroadcastedRegs(dstLayout);
162158
if (!removeBroadcastDst.isIdentity()) {
163159
auto prmtDst = removeBroadcastDst.apply(dstLayout);
164-
auto outVals = transferWithinBlockSwizzlingImpl(
160+
auto outVals = transferSwizzlingLocalMemImpl(
165161
loc, rewriter, srcLayout, prmtDst, inVals, llvmElemTy, smemBase);
166162
return broadcastAs(outVals, dstLayout);
167163
}
@@ -173,6 +169,8 @@ struct ConvertLayoutOpConversion
173169

174170
// Extract reps from smem
175171
auto kReg = str_attr("register");
172+
auto kWarp = str_attr("warp");
173+
auto kBlock = str_attr("block");
176174
auto kReps = str_attr("reps");
177175
auto nReps = smem.getInDimSize(kReps);
178176
auto reps = LinearLayout::identity1D(nReps, kReg, kReps);
@@ -194,8 +192,11 @@ struct ConvertLayoutOpConversion
194192
auto storeCvt = *divideRight(totalStoreCvt, reps);
195193
auto loadCvt = *divideRight(totalLoadCvt, reps);
196194
auto kOffset = str_attr("offset");
197-
storeCvt = storeCvt.reshapeOuts({{kOffset, storeCvt.getTotalOutDimSize()}});
198-
loadCvt = loadCvt.reshapeOuts({{kOffset, loadCvt.getTotalOutDimSize()}});
195+
auto nBlock = storeCvt.getInDimSize(kBlock);
196+
storeCvt = storeCvt.reshapeOuts(
197+
{{kOffset, storeCvt.getTotalOutDimSize() / nBlock}, {kBlock, nBlock}});
198+
loadCvt = loadCvt.reshapeOuts(
199+
{{kOffset, loadCvt.getTotalOutDimSize() / nBlock}, {kBlock, nBlock}});
199200

200201
auto tileSize = storeCvt.getInDimSize(kReg);
201202

@@ -204,28 +205,30 @@ struct ConvertLayoutOpConversion
204205
auto affineOffset = b.i32_val(0);
205206
auto maskSpanAffineOffset = 0;
206207

207-
bool isWarpSync = mlir::isCvtWarpSync(srcLayout, dstLayout);
208-
for (int i = 0; i < nReps; ++i) {
209-
if (i > 0) {
210-
if (isWarpSync) {
211-
targetInfo.warpSync(loc, rewriter);
212-
} else {
213-
targetInfo.barrier(loc, rewriter, triton::gpu::AddrSpace::Local);
214-
}
208+
bool isWarpSync = mlir::isCvtDimSync(srcLayout, dstLayout, kWarp);
209+
bool isBlockSync = mlir::isCvtDimSync(srcLayout, dstLayout, kBlock);
210+
auto emitBarrier = [&]() {
211+
if (isWarpSync) {
212+
targetInfo.warpSync(loc, rewriter);
213+
} else if (isBlockSync) {
214+
targetInfo.barrier(loc, rewriter, triton::gpu::AddrSpace::Local);
215+
} else {
216+
targetInfo.clusterBarrier(loc, rewriter);
215217
}
218+
};
219+
220+
for (int i = 0; i < nReps; ++i) {
221+
if (i > 0)
222+
emitBarrier();
216223
auto tileInVals =
217224
ArrayRef<Value>(permutedInVals).slice(i * tileSize, tileSize);
218225
// Store
219226
lowerLdStShared(loc, ctx, storeCvt, tileInVals, llvmElemTy, smemBase,
220227
/*paddingShifts=*/{}, affineOffset, maskSpanAffineOffset,
221228
rewriter, targetInfo);
222-
if (isWarpSync) {
223-
targetInfo.warpSync(loc, rewriter);
224-
} else {
225-
targetInfo.barrier(loc, rewriter, triton::gpu::AddrSpace::Local);
226-
}
229+
emitBarrier();
227230
// Load
228-
SmallVector<Value> tileOutVals = lowerLdStShared(
231+
auto tileOutVals = lowerLdStShared(
229232
loc, ctx, loadCvt, {}, llvmElemTy, smemBase, /*paddingShifts=*/{},
230233
affineOffset, maskSpanAffineOffset, rewriter, targetInfo);
231234
llvm::append_range(outVals, tileOutVals);
@@ -236,30 +239,21 @@ struct ConvertLayoutOpConversion
236239
return outVals;
237240
}
238241

239-
void transferWithinBlockSwizzling(ConvertLayoutOp op, Value src,
240-
ConversionPatternRewriter &rewriter) const {
242+
void transferSwizzlingLocalMem(ConvertLayoutOp op, Value src,
243+
ConversionPatternRewriter &rewriter) const {
241244
auto loc = op.getLoc();
242245
auto *ctx = op.getContext();
243246
auto srcTy = op.getSrc().getType();
244247
auto dstTy = op.getType();
245248

246-
// Remove the kBlock dimension from the layout as it's the identity in the
247-
// cvt
248249
auto srcLayout = toLinearLayout(srcTy);
249250
auto dstLayout = toLinearLayout(dstTy);
250-
auto kReg = str_attr("register");
251-
auto kLane = str_attr("lane");
252-
auto kWarp = str_attr("warp");
253-
srcLayout = srcLayout.sublayout({kReg, kLane, kWarp},
254-
to_vector(srcLayout.getOutDimNames()));
255-
dstLayout = dstLayout.sublayout({kReg, kLane, kWarp},
256-
to_vector(dstLayout.getOutDimNames()));
257251

258252
auto llvmElemTy = getTypeConverter()->convertType(srcTy.getElementType());
259253
auto smemBase =
260254
LLVM::getSharedMemoryBase(loc, rewriter, targetInfo, op.getOperation());
261255
auto inVals = unpackLLElements(loc, src, rewriter);
262-
auto outVals = transferWithinBlockSwizzlingImpl(
256+
auto outVals = transferSwizzlingLocalMemImpl(
263257
loc, rewriter, srcLayout, dstLayout, inVals, llvmElemTy, smemBase);
264258

265259
Value result =
@@ -276,8 +270,8 @@ struct ConvertLayoutOpConversion
276270
auto b = TritonLLVMOpBuilder(loc, rewriter);
277271
auto srcTy = op.getSrc().getType();
278272
auto dstTy = op.getType();
279-
StringAttr kReg = str_attr("register");
280-
StringAttr kLane = str_attr("lane");
273+
auto kReg = str_attr("register");
274+
auto kLane = str_attr("lane");
281275
auto elemTy = getTypeConverter()->convertType(srcTy.getElementType());
282276
int bitwidth = getIntOrFloatOrPtrBitWidth(elemTy);
283277

@@ -434,8 +428,8 @@ struct ConvertLayoutOpConversion
434428
ArrayRef<TranspositionInfo> mixedTranspositions) const {
435429
auto *ctx = rewriter.getContext();
436430
auto b = TritonLLVMOpBuilder(loc, rewriter);
437-
StringAttr kReg = str_attr("register");
438-
StringAttr kLane = str_attr("lane");
431+
auto kReg = str_attr("register");
432+
auto kLane = str_attr("lane");
439433

440434
SmallVector<Value> vals(inVals.begin(), inVals.end());
441435
int m = mixedTranspositions.size();

0 commit comments

Comments
 (0)