Skip to content

Commit 05fc47f

Browse files
authored
[Membar] Membar pass for clusters (#9318)
Stacked PRs: * #9327 * __->__#9318 --- --- --- ### [Membar] Membar pass for clusters The main invariant here is that: Membar for CTAs only synchronises CTAs when their buffers did not alias in the ttgir, but they alias after the Allocation pass In other words, in Gluon, the user is in charge of manually synchronising the buffers they declare. For now, we always emit a full cluster barrier. We can improve this in the future by emitting `mbarrier`s that just synchronise subsets of the CTAs. For that we would need to be a bit more clever, as we would need to allocate some `mbarrier`s but the Allocation pass has already run... We add a number of test cases with comments on which of them are expected and which can be improved.
1 parent c155e4a commit 05fc47f

15 files changed

Lines changed: 861 additions & 31 deletions

File tree

include/triton/Analysis/Allocation.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
#include "llvm/ADT/DenseMap.h"
66
#include "llvm/ADT/MapVector.h"
77
#include "llvm/ADT/SetVector.h"
8-
#include "llvm/Support/raw_ostream.h"
98

109
#include <limits>
1110

@@ -145,6 +144,11 @@ class Allocation {
145144
return bufferSet.at(bufferId).kind == BufferT::BufferKind::Virtual;
146145
}
147146

147+
/// Returns if the given buffer is an explicit buffer.
148+
bool isExplicitBuffer(BufferId bufferId) const {
149+
return bufferSet.at(bufferId).kind == BufferT::BufferKind::Explicit;
150+
}
151+
148152
/// Returns the size of total shared memory allocated
149153
size_t getSharedMemorySize() const { return sharedMemorySize; }
150154

include/triton/Analysis/Membar.h

Lines changed: 42 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -4,31 +4,41 @@
44
#include "Allocation.h"
55

66
#include "llvm/Support/raw_ostream.h"
7+
#include <functional>
78
#include <set>
89
#include <tuple>
910

1011
namespace mlir {
1112

1213
class OpBuilder;
14+
struct AllocationSlice;
1315

1416
/// Callback to allow backend to provide more information on whether a barrier
1517
/// is needed between two operations. Even though two operations access the same
1618
/// shared memory they may not require a barrier in between them.
1719
using MembarFilterFn =
18-
std::function<bool(Operation *, Operation *, Allocation *)>;
20+
std::function<bool(Operation *, Operation *, bool /*lhsIsRead*/,
21+
bool /*rhsIsRead*/, Allocation *)>;
22+
23+
/// Slice-level filter to allow backends to ignore specific aliasing cases.
24+
using MembarSliceFilterFn =
25+
std::function<bool(const AllocationSlice &, const AllocationSlice &,
26+
bool /*lhsIsRead*/, bool /*rhsIsRead*/, Allocation *)>;
1927

2028
// Represents the access to a slice of an allocation
2129
// It contains information both on physical memory (the interval) and a
2230
// logical view on it (layout, subslice offsets and shape for the access)
2331
struct AllocationSlice {
2432
public:
2533
// Create allocation slice from a value, collecting subslice offsets
26-
AllocationSlice(Value value, Interval<size_t> allocationInterval);
34+
AllocationSlice(Value value, Interval<size_t> allocationInterval,
35+
Allocation::BufferId bufferId);
2736

2837
// Builder for accesses that represent accesses to the whole
2938
// allocation (scratch buffers, ArriveBarrierOp, ..)
3039
AllocationSlice(Interval<size_t> interval)
31-
: allocationInterval(interval), accessTy(nullptr) {}
40+
: allocationInterval(interval), accessTy(nullptr),
41+
bufferId(Allocation::InvalidBufferId) {}
3242

3343
bool operator<(const AllocationSlice &other) const {
3444
return asTuple() < other.asTuple();
@@ -43,19 +53,25 @@ struct AllocationSlice {
4353
// Returns true if it can't prove the AllocationSlices are disjoint.
4454
bool intersects(const AllocationSlice &other) const;
4555

56+
Allocation::BufferId getBufferId() const { return bufferId; }
57+
4658
void print(raw_ostream &os) const;
4759

4860
private:
49-
std::tuple<Interval<size_t>, const void *, llvm::ArrayRef<int64_t>>
61+
std::tuple<Interval<size_t>, Allocation::BufferId, const void *,
62+
llvm::ArrayRef<int64_t>>
5063
asTuple() const {
51-
return {allocationInterval, accessTy.getAsOpaquePointer(), subsliceOffsets};
64+
return {allocationInterval, bufferId, accessTy.getAsOpaquePointer(),
65+
subsliceOffsets};
5266
}
5367
// Offsets from subslice. Empty when offsets are unknown
5468
SmallVector<int64_t> subsliceOffsets;
5569
// The allocated interval for this buffer
5670
Interval<size_t> allocationInterval;
5771
// Type of the memory descriptor for this access
5872
triton::gpu::MemDescType accessTy;
73+
// Buffer id for partial sync on wait_barrier deps.
74+
Allocation::BufferId bufferId;
5975
};
6076

6177
struct BlockInfo {
@@ -103,15 +119,19 @@ struct BlockInfo {
103119

104120
/// Returns true if Slices in two BlockInfo objects are intersected.
105121
bool isIntersected(const BlockInfo &other, MembarFilterFn filter,
106-
Allocation *allocation) const {
107-
return /*RAW*/ isIntersected(syncWriteSlices, other.syncReadSlices, filter,
108-
allocation) ||
122+
Allocation *allocation,
123+
MembarSliceFilterFn sliceFilter = nullptr) const {
124+
return /*RAW*/ isIntersected(syncWriteSlices, other.syncReadSlices,
125+
/*lhsIsRead=*/false, /*rhsIsRead=*/true,
126+
filter, sliceFilter, allocation) ||
109127
/*WAR*/
110-
isIntersected(syncReadSlices, other.syncWriteSlices, filter,
111-
allocation) ||
128+
isIntersected(syncReadSlices, other.syncWriteSlices,
129+
/*lhsIsRead=*/true, /*rhsIsRead=*/false, filter,
130+
sliceFilter, allocation) ||
112131
/*WAW*/
113-
isIntersected(syncWriteSlices, other.syncWriteSlices, filter,
114-
allocation);
132+
isIntersected(syncWriteSlices, other.syncWriteSlices,
133+
/*lhsIsRead=*/false, /*rhsIsRead=*/false, filter,
134+
sliceFilter, allocation);
115135
}
116136

117137
/// Clears the slices because a barrier is inserted.
@@ -130,14 +150,19 @@ struct BlockInfo {
130150

131151
private:
132152
bool isIntersected(const SliceMapT &lhsSlices, const SliceMapT &rhsSlices,
133-
MembarFilterFn filter, Allocation *allocation) const {
153+
bool lhsIsRead, bool rhsIsRead, MembarFilterFn filter,
154+
MembarSliceFilterFn sliceFilter,
155+
Allocation *allocation) const {
134156
for (auto &lhs : lhsSlices)
135157
for (auto &rhs : rhsSlices)
136158
if (lhs.first.intersects(rhs.first))
137-
for (auto lhsOp : lhs.second)
138-
for (auto rhsOp : rhs.second)
139-
if (!filter || !filter(lhsOp, rhsOp, allocation))
140-
return true;
159+
if (!sliceFilter || !sliceFilter(lhs.first, rhs.first, lhsIsRead,
160+
rhsIsRead, allocation))
161+
for (auto lhsOp : lhs.second)
162+
for (auto rhsOp : rhs.second)
163+
if (!filter ||
164+
!filter(lhsOp, rhsOp, lhsIsRead, rhsIsRead, allocation))
165+
return true;
141166
return false;
142167
}
143168
};
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
#ifndef TRITON_DIALECT_TRITONNVIDIAGPU_TRANSFORMS_CLUSTERBARRIERINSERTION_H_
2+
#define TRITON_DIALECT_TRITONNVIDIAGPU_TRANSFORMS_CLUSTERBARRIERINSERTION_H_
3+
4+
#include "triton/Analysis/Allocation.h"
5+
6+
namespace mlir {
7+
namespace triton {
8+
namespace nvidia_gpu {
9+
10+
/// Inserts cluster barriers (cluster_arrive + cluster_wait) using the provided
11+
/// shared-memory allocation analysis.
12+
void runClusterBarrierInsertion(ModuleAllocation &moduleAllocation,
13+
int computeCapability);
14+
15+
} // namespace nvidia_gpu
16+
} // namespace triton
17+
} // namespace mlir
18+
19+
#endif // TRITON_DIALECT_TRITONNVIDIAGPU_TRANSFORMS_CLUSTERBARRIERINSERTION_H_

lib/Analysis/Membar.cpp

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,9 @@
1010
namespace mlir {
1111

1212
AllocationSlice::AllocationSlice(Value value,
13-
Interval<size_t> allocationInterval)
14-
: allocationInterval(allocationInterval) {
13+
Interval<size_t> allocationInterval,
14+
Allocation::BufferId bufferId)
15+
: allocationInterval(allocationInterval), bufferId(bufferId) {
1516
auto accessTy = cast<triton::gpu::MemDescType>(value.getType());
1617
this->accessTy = accessTy;
1718

@@ -69,6 +70,9 @@ void AllocationSlice::print(raw_ostream &os) const {
6970
os << "interval=[" << allocationInterval.start() << ","
7071
<< allocationInterval.end() << ")";
7172

73+
if (bufferId != Allocation::InvalidBufferId)
74+
os << " buffer=" << bufferId;
75+
7276
os << " offsets=[";
7377
if (!subsliceOffsets.empty()) {
7478
llvm::interleaveComma(subsliceOffsets, os);
@@ -244,6 +248,8 @@ void MembarAnalysis::update(Operation *op, BlockInfo *blockInfo,
244248
auto containsLocalBarrier = [](Operation *op) {
245249
if (isa<gpu::BarrierOp>(op))
246250
return true;
251+
if (isa<triton::nvidia_gpu::ClusterWaitOp>(op))
252+
return true;
247253
if (isa<triton::gpu::WarpSpecializePartitionsOp>(op))
248254
return true;
249255
if (auto barrier = dyn_cast<triton::gpu::BarrierOp>(op))
@@ -287,7 +293,7 @@ void MembarAnalysis::update(Operation *op, BlockInfo *blockInfo,
287293
for (auto bufferId : allocation->getAllBufferIdsWithAliases(value)) {
288294
if (bufferId != Allocation::InvalidBufferId) {
289295
auto interval = allocation->getAllocatedInterval(bufferId);
290-
auto slice = AllocationSlice(value, interval);
296+
auto slice = AllocationSlice(value, interval, bufferId);
291297

292298
if (isa<MemoryEffects::Write>(effectInstance.getEffect()))
293299
curBlockInfo.syncWriteSlices[slice].insert(op);

lib/Dialect/TritonNvidiaGPU/Transforms/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
add_triton_library(TritonNvidiaGPUTransforms
2+
ClusterBarrierInsertion.cpp
23
CheckMatmulTwoCTAs.cpp
34
FenceInsertion.cpp
45
InterleaveTMem.cpp
Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
#include "triton/Dialect/TritonNvidiaGPU/Transforms/ClusterBarrierInsertion.h"
2+
#include "triton/Analysis/Allocation.h"
3+
#include "triton/Analysis/Membar.h"
4+
#include "triton/Analysis/Utility.h"
5+
#include "triton/Dialect/TritonGPU/IR/Dialect.h"
6+
#include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
7+
8+
#include "mlir/Interfaces/FunctionInterfaces.h"
9+
#include "mlir/Interfaces/SideEffectInterfaces.h"
10+
#include "llvm/ADT/SmallVector.h"
11+
#include "llvm/Support/ErrorHandling.h"
12+
13+
namespace mlir {
14+
namespace triton {
15+
namespace nvidia_gpu {
16+
17+
namespace {
18+
19+
namespace ttg = mlir::triton::gpu;
20+
namespace ttng = mlir::triton::nvidia_gpu;
21+
22+
/// Returns true if `op` performs a shared-memory access that is distributed
/// across CTAs of a cluster. `isRead` selects which side of the access is
/// being queried; for layout conversions and reductions only the read side
/// can cross CTAs.
static bool isDistributedMultiCTAOp(Operation *op, bool isRead) {
  // Layout conversions read remote CTAs' data when the minimal conversion
  // has a "block" input dimension; their writes stay CTA-local.
  if (auto layoutCvt = dyn_cast<ttg::ConvertLayoutOp>(op)) {
    if (!isRead)
      return false;
    auto minimal = minimalCvtLayout(layoutCvt.getSrc().getType(),
                                    layoutCvt.getType());
    auto blockDim = StringAttr::get(op->getContext(), "block");
    return minimal.hasInDim(blockDim);
  }
  // Reductions cross CTAs only when the reduced axis is split among them.
  if (auto red = dyn_cast<triton::ReduceOp>(op)) {
    if (!isRead)
      return false;
    auto inputTy = red.getInputTypes()[0];
    auto ctaSplit = ttg::getCTASplitNum(inputTy.getEncoding());
    return ctaSplit[red.getAxis()] > 1;
  }
  if (auto mmaOp = dyn_cast<ttng::TCGen5MMAOp>(op))
    return mmaOp.getTwoCtas();
  if (isa<ttng::TCGen5MMAScaledOp>(op)) {
    // TODO: Change when we support scaled MMA with 2CTAs
    assert(!ttng::getModuleTwoCTAs(op->getParentOfType<ModuleOp>()) &&
           "Scaled MMA with 2CTAs not supported");
    return false;
  }
  if (auto tmaCopy = dyn_cast<ttng::AsyncTMACopyGlobalToLocalOp>(op))
    return tmaCopy.getMulticast();
  return false;
}
51+
52+
static bool isPreAllocAliasSliceFilter(const AllocationSlice &lhsSlice,
53+
const AllocationSlice &rhsSlice,
54+
bool /*lhsIsRead*/, bool /*rhsIsRead*/,
55+
Allocation *allocation) {
56+
auto bufferId = lhsSlice.getBufferId();
57+
return bufferId != Allocation::InvalidBufferId &&
58+
bufferId == rhsSlice.getBufferId() &&
59+
allocation->isExplicitBuffer(bufferId);
60+
}
61+
62+
class ClusterBarrierAnalysis : public MembarOrFenceAnalysis {
63+
public:
64+
ClusterBarrierAnalysis() = default;
65+
explicit ClusterBarrierAnalysis(Allocation *allocation, MembarFilterFn filter)
66+
: MembarOrFenceAnalysis(allocation, filter) {}
67+
68+
private:
69+
void update(Operation *op, BlockInfo *blockInfo,
70+
FuncBlockInfoMapT *funcBlockInfoMap, OpBuilder *builder) override;
71+
72+
void insertClusterBarrier(Operation *op, OpBuilder *builder);
73+
};
74+
75+
void ClusterBarrierAnalysis::insertClusterBarrier(Operation *op,
                                                  OpBuilder *builder) {
  OpBuilder::InsertionGuard guard(*builder);
  Location loc = op->getLoc();
  // Full cluster barrier: every CTA arrives, then waits for all others.
  // For now this is always a whole-cluster sync; partial (mbarrier-based)
  // synchronization is a possible future refinement.
  ttng::ClusterArriveOp::create(*builder, loc, /*relaxed=*/false);
  ttng::ClusterWaitOp::create(*builder, loc);
}
81+
82+
void ClusterBarrierAnalysis::update(Operation *op, BlockInfo *blockInfo,
                                    FuncBlockInfoMapT *funcBlockInfoMap,
                                    OpBuilder *builder) {
  // An explicit cluster wait synchronizes the whole cluster, so all pending
  // read/write dependencies are resolved at this point.
  if (isa<ttng::ClusterWaitOp>(op)) {
    blockInfo->sync();
    return;
  }

  BlockInfo curBlockInfo;
  auto scratchBufferId = Allocation::InvalidBufferId;
  if (isa<triton::CallOp>(op)) {
    // Calls are summarized by the callee's accumulated block info.
    // cast<> (not dyn_cast<>) since triton::CallOp always implements
    // CallOpInterface; the previous dyn_cast result was dereferenced
    // unconditionally anyway.
    auto callOpInterface = cast<CallOpInterface>(op);
    if (auto callee =
            dyn_cast<FunctionOpInterface>(callOpInterface.resolveCallable()))
      curBlockInfo = funcBlockInfoMap->lookup(callee);
  } else {
    // Record this op's shared-memory reads and writes, one slice per aliased
    // buffer, so hazards can be detected against earlier accesses.
    if (auto memEffects = dyn_cast<MemoryEffectOpInterface>(op)) {
      SmallVector<SideEffects::EffectInstance<MemoryEffects::Effect>>
          effectInstances;
      memEffects.getEffects(effectInstances);
      // const& avoids copying each EffectInstance per iteration.
      for (const auto &effectInstance : effectInstances) {
        auto value = effectInstance.getValue();
        if (!value)
          continue;
        for (auto bufferId : allocation->getBufferIds(value)) {
          if (bufferId == Allocation::InvalidBufferId)
            continue;
          auto interval = allocation->getAllocatedInterval(bufferId);
          AllocationSlice slice(value, interval, bufferId);
          if (isa<MemoryEffects::Write>(effectInstance.getEffect()))
            curBlockInfo.syncWriteSlices[slice].insert(op);
          else if (isa<MemoryEffects::Read>(effectInstance.getEffect()))
            curBlockInfo.syncReadSlices[slice].insert(op);
        }
      }
    }
    scratchBufferId = allocation->getBufferId(op);
  }

  // Scratch buffer operations consist of a series of shared memory operations
  // starting from a shared memory write, followed by a series of shared memory
  // read/write operations, and ending with a shared memory read, i.e., shared
  // memory write -> ... -> shared memory read.
  if (scratchBufferId != Allocation::InvalidBufferId) {
    if (!curBlockInfo.syncReadSlices.empty() ||
        !curBlockInfo.syncWriteSlices.empty()) {
      llvm::report_fatal_error(
          "scratch buffer operations should not have any shared memory "
          "dependencies");
    }

    // Model the scratch op first as a write (its initial access) when
    // checking for hazards against prior accesses...
    auto interval = allocation->getAllocatedInterval(scratchBufferId);
    AllocationSlice scratchSlice(interval);
    curBlockInfo.syncWriteSlices[scratchSlice].insert(op);

    bool insertClusterBarrierNeeded = blockInfo->isIntersected(
        curBlockInfo, filter, allocation, isPreAllocAliasSliceFilter);
    if (insertClusterBarrierNeeded) {
      builder->setInsertionPoint(op);
      insertClusterBarrier(op, builder);
    }

    // Clear prior distributed dependencies if we have inserted a cluster
    // barrier, or if the scratch op itself performs a cluster-level sync.
    bool hasClusterSync = isDistributedMultiCTAOp(op, /*isRead=*/true);
    if (insertClusterBarrierNeeded || hasClusterSync)
      blockInfo->sync();

    // ...and then as a read (its final access) for subsequent hazard checks.
    curBlockInfo.syncReadSlices[scratchSlice].insert(op);
  } else if (blockInfo->isIntersected(curBlockInfo, filter, allocation,
                                      isPreAllocAliasSliceFilter)) {
    // Cross-CTA hazard against a prior access: emit a full cluster barrier
    // right before this op and forget the now-synchronized accesses.
    builder->setInsertionPoint(op);
    insertClusterBarrier(op, builder);
    blockInfo->sync();
  }

  // Propagate this op's accesses into the running block state.
  blockInfo->join(curBlockInfo);
}
159+
160+
} // namespace
161+
162+
/// Inserts cluster barriers (cluster_arrive + cluster_wait) using the
/// provided shared-memory allocation analysis.
///
/// \param moduleAllocation per-function shared-memory allocation results.
/// \param computeCapability target SM version; the pass is a no-op below
///        sm_90 and for single-CTA clusters.
void runClusterBarrierInsertion(ModuleAllocation &moduleAllocation,
                                int computeCapability) {
  ModuleOp mod = moduleAllocation.getModuleOp();
  // Clusters (and cluster barriers) require Hopper (sm_90) or newer.
  if (computeCapability < 90)
    return;
  // A single CTA per cluster means there is nothing to synchronize.
  if (ttg::TritonGPUDialect::getNumCTAs(mod) == 1)
    return;

  MembarFilterFn filterFn = [](Operation *lhs, Operation *rhs, bool lhsIsRead,
                               bool rhsIsRead, Allocation * /*allocation*/) {
    // Filter out (return true for) op pairs where neither side touches
    // distributed shared memory. Whether the aliasing was already present in
    // TTGIR is handled per-allocation slice.
    return !isDistributedMultiCTAOp(lhs, lhsIsRead) &&
           !isDistributedMultiCTAOp(rhs, rhsIsRead);
  };

  ModuleMembarOrFenceAnalysis<ClusterBarrierAnalysis> analysis(
      &moduleAllocation, filterFn);
  analysis.run();
}
185+
186+
} // namespace nvidia_gpu
187+
} // namespace triton
188+
} // namespace mlir

0 commit comments

Comments
 (0)