Skip to content

Commit 6a6cf6e

Browse files
committed
[Membar] Membar pass for clusters
The main invariant here is that Membar for CTAs only synchronises CTAs when their buffers did not alias in the ttgir but do alias after the Allocation pass. In other words, in Gluon, the user is in charge of manually synchronising the buffers they declare. For now, we always emit a full cluster barrier. We can improve this in the future by emitting `mbarrier`s that synchronise just subsets of the CTAs. For that we would need to be a bit more clever, as we would need to allocate some `mbarrier`s, but the Allocation pass has already run... We add a number of test cases with comments on which of them are expected and which can be improved. stack-info: PR: #9318, branch: lezcano/stack/10
1 parent a8d2f7c commit 6a6cf6e

15 files changed

Lines changed: 866 additions & 31 deletions

File tree

include/triton/Analysis/Allocation.h

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
#include "llvm/ADT/DenseMap.h"
66
#include "llvm/ADT/MapVector.h"
77
#include "llvm/ADT/SetVector.h"
8-
#include "llvm/Support/raw_ostream.h"
98

109
#include <limits>
1110

@@ -145,6 +144,16 @@ class Allocation {
145144
return bufferSet.at(bufferId).kind == BufferT::BufferKind::Virtual;
146145
}
147146

147+
/// Returns if the given buffer is a scratch buffer.
148+
bool isScratchBuffer(BufferId bufferId) const {
149+
return bufferSet.at(bufferId).kind == BufferT::BufferKind::Scratch;
150+
}
151+
152+
/// Returns if the given buffer is an explicit buffer.
153+
bool isExplicitBuffer(BufferId bufferId) const {
154+
return bufferSet.at(bufferId).kind == BufferT::BufferKind::Explicit;
155+
}
156+
148157
/// Returns the size of total shared memory allocated
149158
size_t getSharedMemorySize() const { return sharedMemorySize; }
150159

include/triton/Analysis/Membar.h

Lines changed: 42 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -4,31 +4,41 @@
44
#include "Allocation.h"
55

66
#include "llvm/Support/raw_ostream.h"
7+
#include <functional>
78
#include <set>
89
#include <tuple>
910

1011
namespace mlir {
1112

1213
class OpBuilder;
14+
struct AllocationSlice;
1315

1416
/// Callback to allow backend to provide more information on whether a barrier
1517
/// is needed between two operations. Even though two operations access the same
1618
/// shared memory they may not require a barrier in between them.
1719
using MembarFilterFn =
18-
std::function<bool(Operation *, Operation *, Allocation *)>;
20+
std::function<bool(Operation *, Operation *, bool /*lhsIsRead*/,
21+
bool /*rhsIsRead*/, Allocation *)>;
22+
23+
/// Slice-level filter to allow backends to ignore specific aliasing cases.
24+
using MembarSliceFilterFn =
25+
std::function<bool(const AllocationSlice &, const AllocationSlice &,
26+
bool /*lhsIsRead*/, bool /*rhsIsRead*/, Allocation *)>;
1927

2028
// Represents the access to a slice of an allocation
2129
// It contains information both on physical memory (the interval) and a
2230
// logical view on it (layout, subslice offsets and shape for the access)
2331
struct AllocationSlice {
2432
public:
2533
// Create allocation slice from a value, collecting subslice offsets
26-
AllocationSlice(Value value, Interval<size_t> allocationInterval);
34+
AllocationSlice(Value value, Interval<size_t> allocationInterval,
35+
Allocation::BufferId bufferId);
2736

2837
// Builder for accesses that represent accesses to the whole
2938
// allocation (scratch buffers, ArriveBarrierOp, ..)
3039
AllocationSlice(Interval<size_t> interval)
31-
: allocationInterval(interval), accessTy(nullptr) {}
40+
: allocationInterval(interval), accessTy(nullptr),
41+
bufferId(Allocation::InvalidBufferId) {}
3242

3343
bool operator<(const AllocationSlice &other) const {
3444
return asTuple() < other.asTuple();
@@ -43,19 +53,25 @@ struct AllocationSlice {
4353
// Returns true if it can't prove the AllocationSlices are disjoint.
4454
bool intersects(const AllocationSlice &other) const;
4555

56+
Allocation::BufferId getBufferId() const { return bufferId; }
57+
4658
void print(raw_ostream &os) const;
4759

4860
private:
49-
std::tuple<Interval<size_t>, const void *, llvm::ArrayRef<int64_t>>
61+
std::tuple<Interval<size_t>, Allocation::BufferId, const void *,
62+
llvm::ArrayRef<int64_t>>
5063
asTuple() const {
51-
return {allocationInterval, accessTy.getAsOpaquePointer(), subsliceOffsets};
64+
return {allocationInterval, bufferId, accessTy.getAsOpaquePointer(),
65+
subsliceOffsets};
5266
}
5367
// Offsets from subslice. Empty when offsets are unknown
5468
SmallVector<int64_t> subsliceOffsets;
5569
// The allocated interval for this buffer
5670
Interval<size_t> allocationInterval;
5771
// Type of the memory descriptor for this access
5872
triton::gpu::MemDescType accessTy;
73+
// Buffer id for partial sync on wait_barrier deps.
74+
Allocation::BufferId bufferId;
5975
};
6076

6177
struct BlockInfo {
@@ -103,15 +119,19 @@ struct BlockInfo {
103119

104120
/// Returns true if Slices in two BlockInfo objects are intersected.
105121
bool isIntersected(const BlockInfo &other, MembarFilterFn filter,
106-
Allocation *allocation) const {
107-
return /*RAW*/ isIntersected(syncWriteSlices, other.syncReadSlices, filter,
108-
allocation) ||
122+
Allocation *allocation,
123+
MembarSliceFilterFn sliceFilter = nullptr) const {
124+
return /*RAW*/ isIntersected(syncWriteSlices, other.syncReadSlices,
125+
/*lhsIsRead=*/false, /*rhsIsRead=*/true,
126+
filter, sliceFilter, allocation) ||
109127
/*WAR*/
110-
isIntersected(syncReadSlices, other.syncWriteSlices, filter,
111-
allocation) ||
128+
isIntersected(syncReadSlices, other.syncWriteSlices,
129+
/*lhsIsRead=*/true, /*rhsIsRead=*/false, filter,
130+
sliceFilter, allocation) ||
112131
/*WAW*/
113-
isIntersected(syncWriteSlices, other.syncWriteSlices, filter,
114-
allocation);
132+
isIntersected(syncWriteSlices, other.syncWriteSlices,
133+
/*lhsIsRead=*/false, /*rhsIsRead=*/false, filter,
134+
sliceFilter, allocation);
115135
}
116136

117137
/// Clears the slices because a barrier is inserted.
@@ -130,14 +150,19 @@ struct BlockInfo {
130150

131151
private:
132152
bool isIntersected(const SliceMapT &lhsSlices, const SliceMapT &rhsSlices,
133-
MembarFilterFn filter, Allocation *allocation) const {
153+
bool lhsIsRead, bool rhsIsRead, MembarFilterFn filter,
154+
MembarSliceFilterFn sliceFilter,
155+
Allocation *allocation) const {
134156
for (auto &lhs : lhsSlices)
135157
for (auto &rhs : rhsSlices)
136158
if (lhs.first.intersects(rhs.first))
137-
for (auto lhsOp : lhs.second)
138-
for (auto rhsOp : rhs.second)
139-
if (!filter || !filter(lhsOp, rhsOp, allocation))
140-
return true;
159+
if (!sliceFilter || !sliceFilter(lhs.first, rhs.first, lhsIsRead,
160+
rhsIsRead, allocation))
161+
for (auto lhsOp : lhs.second)
162+
for (auto rhsOp : rhs.second)
163+
if (!filter ||
164+
!filter(lhsOp, rhsOp, lhsIsRead, rhsIsRead, allocation))
165+
return true;
141166
return false;
142167
}
143168
};
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
#ifndef TRITON_DIALECT_TRITONNVIDIAGPU_TRANSFORMS_CLUSTERBARRIERINSERTION_H_
2+
#define TRITON_DIALECT_TRITONNVIDIAGPU_TRANSFORMS_CLUSTERBARRIERINSERTION_H_
3+
4+
#include "triton/Analysis/Allocation.h"
5+
6+
namespace mlir {
7+
namespace triton {
8+
namespace nvidia_gpu {
9+
10+
/// Inserts cluster barriers (cluster_arrive + cluster_wait) using the provided
11+
/// shared-memory allocation analysis.
12+
void runClusterBarrierInsertion(ModuleAllocation &moduleAllocation,
13+
int computeCapability);
14+
15+
} // namespace nvidia_gpu
16+
} // namespace triton
17+
} // namespace mlir
18+
19+
#endif // TRITON_DIALECT_TRITONNVIDIAGPU_TRANSFORMS_CLUSTERBARRIERINSERTION_H_

lib/Analysis/Membar.cpp

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,9 @@
1010
namespace mlir {
1111

1212
AllocationSlice::AllocationSlice(Value value,
13-
Interval<size_t> allocationInterval)
14-
: allocationInterval(allocationInterval) {
13+
Interval<size_t> allocationInterval,
14+
Allocation::BufferId bufferId)
15+
: allocationInterval(allocationInterval), bufferId(bufferId) {
1516
auto accessTy = cast<triton::gpu::MemDescType>(value.getType());
1617
this->accessTy = accessTy;
1718

@@ -69,6 +70,9 @@ void AllocationSlice::print(raw_ostream &os) const {
6970
os << "interval=[" << allocationInterval.start() << ","
7071
<< allocationInterval.end() << ")";
7172

73+
if (bufferId != Allocation::InvalidBufferId)
74+
os << " buffer=" << bufferId;
75+
7276
os << " offsets=[";
7377
if (!subsliceOffsets.empty()) {
7478
llvm::interleaveComma(subsliceOffsets, os);
@@ -244,6 +248,8 @@ void MembarAnalysis::update(Operation *op, BlockInfo *blockInfo,
244248
auto containsLocalBarrier = [](Operation *op) {
245249
if (isa<gpu::BarrierOp>(op))
246250
return true;
251+
if (isa<triton::nvidia_gpu::ClusterWaitOp>(op))
252+
return true;
247253
if (isa<triton::gpu::WarpSpecializePartitionsOp>(op))
248254
return true;
249255
if (auto barrier = dyn_cast<triton::gpu::BarrierOp>(op))
@@ -287,7 +293,7 @@ void MembarAnalysis::update(Operation *op, BlockInfo *blockInfo,
287293
for (auto bufferId : allocation->getAllBufferIdsWithAliases(value)) {
288294
if (bufferId != Allocation::InvalidBufferId) {
289295
auto interval = allocation->getAllocatedInterval(bufferId);
290-
auto slice = AllocationSlice(value, interval);
296+
auto slice = AllocationSlice(value, interval, bufferId);
291297

292298
if (isa<MemoryEffects::Write>(effectInstance.getEffect()))
293299
curBlockInfo.syncWriteSlices[slice].insert(op);

lib/Dialect/TritonNvidiaGPU/Transforms/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
add_triton_library(TritonNvidiaGPUTransforms
2+
ClusterBarrierInsertion.cpp
23
CheckMatmulTwoCTAs.cpp
34
FenceInsertion.cpp
45
InterleaveTMem.cpp

0 commit comments

Comments
 (0)