
Commit 4027076

Merge remote-tracking branch 'origin/main' into phil/cleanup-scale
2 parents 3697265 + df38505 commit 4027076

110 files changed

Lines changed: 6729 additions & 1572 deletions


.github/workflows/integration-tests-nvidia.yml

Lines changed: 7 additions & 6 deletions
@@ -9,15 +9,16 @@ on:
 
 jobs:
   integration-tests-nvidia:
-    runs-on: ${{ matrix.runner }}
+    name: integration-tests-nvidia (${{ matrix.config.name }})
+    runs-on: ${{ matrix.config.runs_on }}
     timeout-minutes: 60
     # Let A100 and H100 continue even if GB200 fails, as it's a bit flaky
-    continue-on-error: ${{ matrix.runner[0] == 'nvidia-gb200'}}
+    continue-on-error: ${{ startsWith(matrix.config.runner_type, 'nvidia-gb200') }}
     strategy:
       matrix:
-        runner: ${{ fromJson(inputs.matrix) }}
+        config: ${{ fromJson(inputs.matrix) }}
     env:
-      RUNNER_TYPE: ${{ matrix.runner[0] }}
+      RUNNER_TYPE: ${{ matrix.config.runner_type }}
       TRITON_BUILD_WITH_CCACHE: "true"
       TRITON_BUILD_WITH_CLANG_LLD: "TRUE"
       TRITON_USE_ASSERT_ENABLED_LLVM: "TRUE"
@@ -69,7 +70,7 @@ jobs:
         run: |
           echo "$HOME/.local/bin" >> $GITHUB_PATH
       - name: Setup Python environment for GB200
-        if: ${{ matrix.runner[0] == 'nvidia-gb200' }}
+        if: ${{ startsWith(matrix.config.runner_type, 'nvidia-gb200') }}
         run: |
           echo "/venv/bin" >> $GITHUB_PATH
           echo "VIRTUAL_ENV=/venv" >> $GITHUB_ENV
@@ -90,7 +91,7 @@ jobs:
       - name: Run python tests on CUDA
         run: make NUM_PROCS=24 test-unit
       - name: Run interpreter tests
-        if: ${{ matrix.runner[0] == 'nvidia-h100' }}
+        if: ${{ matrix.config.runner_type == 'nvidia-h100' }}
         run: make test-interpret
       - name: Run regression tests
         run: make test-regression

.github/workflows/runner-preparation.yml

Lines changed: 2 additions & 2 deletions
@@ -95,11 +95,11 @@ jobs:
         if: env.enable_integration == 'true'
         run: |
           if [ x"${{ github.repository }}" == x"triton-lang/triton" ]; then
-            echo '::set-output name=matrix-NVIDIA::[["nvidia-a100"], ["nvidia-h100"], ["nvidia-gb200"]]'
+            echo '::set-output name=matrix-NVIDIA::[{"name":"nvidia-a100","runner_type":"nvidia-a100","runs_on":["nvidia-a100"]},{"name":"nvidia-h100","runner_type":"nvidia-h100","runs_on":["nvidia-h100"]},{"name":"nvidia-gb200","runner_type":"nvidia-gb200","runs_on":{"group":"gb200-runner-set"}}]'
             echo '::set-output name=matrix-AMD::[["self-hosted", "gfx90a"], ["amd-gfx942"], ["amd-gfx950"]]'
             echo '::set-output name=matrix-MACOS::[["macos-latest"]]'
           else
-            echo '::set-output name=matrix-NVIDIA::["ubuntu-latest"]'
+            echo '::set-output name=matrix-NVIDIA::[{"name":"ubuntu-latest","runner_type":"ubuntu-latest","runs_on":"ubuntu-latest"}]'
             echo '::set-output name=matrix-AMD::["ubuntu-latest"]'
             echo '::set-output name=matrix-MACOS::[["macos-latest"]]'
           fi

bin/RegisterTritonDialects.h

Lines changed: 1 addition & 0 deletions
@@ -86,6 +86,7 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
   mlir::triton::gpu::registerAllocateSharedMemoryPass();
   mlir::triton::gpu::registerTritonGPUAllocateWarpGroups();
   mlir::triton::gpu::registerTritonGPUGlobalScratchAllocationPass();
+  mlir::triton::gpu::registerCanonicalizeLLVMIR();
   mlir::triton::registerConvertWarpSpecializeToLLVM();
   mlir::triton::registerConvertTritonGPUToLLVMPass();
   mlir::triton::registerConvertNVGPUToLLVMPass();

include/triton/Analysis/Allocation.h

Lines changed: 51 additions & 21 deletions
@@ -20,6 +20,10 @@ using AllocationAnalysisScratchSizeFn = std::function<unsigned(Operation *)>;
 
 unsigned defaultAllocationAnalysisScratchSizeFn(Operation *op);
 
+unsigned getNumScratchElemsSwizzledCvt(const LinearLayout &srcLayout,
+                                       const LinearLayout &dstLayout,
+                                       int bitwidth);
+
 unsigned getNumScratchElemsSwizzledCvt(RankedTensorType srcTy,
                                        RankedTensorType dstTy);
 
@@ -70,8 +74,11 @@ class Allocation {
   explicit Allocation(Operation *operation) : operation(operation) {}
 
   /// Runs allocation analysis on the given top-level operation.
+  /// \param sharedMemoryPartitionSize The size of each shared memory partition
+  /// in bytes. A value of 0 means shared memory is not partitioned.
   void run(FuncAllocMapT &funcAllocMap,
-           triton::AllocationAnalysisScratchSizeFn scratchSizeGetter);
+           triton::AllocationAnalysisScratchSizeFn scratchSizeGetter,
+           size_t sharedMemoryPartitionSize = 0);
 
   /// Returns the operation this analysis was constructed from.
   Operation *getOperation() const { return operation; }
@@ -92,24 +99,29 @@ class Allocation {
     return Interval<size_t>(buffer.offset, buffer.offset + buffer.size);
   }
 
-  /// Returns the buffer id of the given value.
-  /// This interface only returns the allocated buffer id.
-  /// If you want to get all the buffer ids that are associated with the given
-  /// value, including alias buffers, use getBufferIds.
-  BufferId getBufferId(Value value) const {
-    if (valueBuffer.count(value)) {
-      return valueBuffer.lookup(value)->id;
-    } else {
-      return InvalidBufferId;
+  /// Returns all buffer ids for a value.
+  /// For partitioned tensors, returns all logical piece buffer ids.
+  /// For non-partitioned values, returns a single-element vector.
+  /// Returns empty vector if value has no associated buffer.
+  SmallVector<BufferId> getBufferIds(Value value) const {
+    SmallVector<BufferId> bufferIds;
+    auto it = valueBuffer.find(value);
+    if (it == valueBuffer.end())
+      return bufferIds;
+
+    for (auto *buffer : it->second) {
+      bufferIds.push_back(buffer->id);
     }
+    return bufferIds;
   }
 
-  /// Returns all the buffer ids of the given value, including alias buffers.
-  BufferIdSetT getBufferIds(Value value) const {
+  /// Returns all buffer ids of the given value, including alias buffers.
+  /// This is a superset of getBufferIds that also includes aliased buffers.
+  BufferIdSetT getAllBufferIdsWithAliases(Value value) const {
     BufferIdSetT bufferIds;
-    auto allocBufferId = getBufferId(value);
-    if (allocBufferId != InvalidBufferId)
-      bufferIds.insert(allocBufferId);
+    for (auto bufferId : getBufferIds(value)) {
+      bufferIds.insert(bufferId);
+    }
     for (auto *buffer : aliasBuffer.lookup(value)) {
       if (buffer->id != InvalidBufferId)
         bufferIds.insert(buffer->id);
@@ -154,6 +166,10 @@ class Allocation {
     size_t alignment;
     size_t offset;
 
+    /// For partitioned tensors: buffers that reside in different physical
+    /// partitions.
+    SmallVector<BufferT *> neighbors;
+
     bool operator==(const BufferT &other) const { return id == other.id; }
     bool operator<(const BufferT &other) const { return id < other.id; }
 
@@ -169,8 +185,8 @@ class Allocation {
 
   /// Op -> Scratch Buffer
   using OpScratchMapT = llvm::MapVector<Operation *, BufferT *>;
-  /// Value -> Explicit Buffer
-  using ValueBufferMapT = llvm::MapVector<Value, BufferT *>;
+  /// Value -> Explicit Buffers (vector for partitioned tensors)
+  using ValueBufferMapT = llvm::MapVector<Value, SmallVector<BufferT *>>;
   /// Value -> Alias Buffer
   using AliasBufferMapT = llvm::MapVector<Value, llvm::SetVector<BufferT *>>;
   /// BufferId -> Buffer
@@ -184,16 +200,28 @@ class Allocation {
         nextId, BufferT(Kind, nextId, key, std::forward<Args>(args)...));
     BufferT *buffer = &it->second;
     if constexpr (Kind == BufferT::BufferKind::Explicit) {
-      valueBuffer[key] = buffer;
+      valueBuffer[key].push_back(buffer);
     } else if constexpr (Kind == BufferT::BufferKind::Virtual) {
       opVirtual[key] = buffer;
     } else {
       opScratch[key] = buffer;
    }
   }
 
+  /// Create multiple buffers for partitions where all different partitions
+  /// are neighbors (must be placed in different physical shared memory slots).
+  ///
+  /// \param key The value that owns these buffers
+  /// \param numPartitions Number of partition buffers to create
+  /// \param partitionSize Size of each partition buffer in bytes
+  /// \param alignment Required alignment for each buffer
+  void addPartitionBuffers(Value key, unsigned numPartitions,
+                           size_t partitionSize, size_t alignment);
+
   void addAlias(Value value, Value alloc) {
-    aliasBuffer[value].insert(valueBuffer[alloc]);
+    for (auto *buffer : valueBuffer[alloc]) {
+      aliasBuffer[value].insert(buffer);
+    }
   }
 
 private:
@@ -222,7 +250,8 @@ class ModuleAllocation : public triton::CallGraph<Allocation> {
 
   ModuleAllocation(ModuleOp moduleOp,
                    triton::AllocationAnalysisScratchSizeFn scratchSizeGetter =
-                       triton::defaultAllocationAnalysisScratchSizeFn)
+                       triton::defaultAllocationAnalysisScratchSizeFn,
+                   size_t sharedMemoryPartitionSize = 0)
       : triton::CallGraph<Allocation>(moduleOp) {
     walk<WalkOrder::PreOrder, WalkOrder::PostOrder>(
         // Pre-order edge walk callback
@@ -231,7 +260,8 @@ class ModuleAllocation : public triton::CallGraph<Allocation> {
         [&](FunctionOpInterface funcOp) {
          auto [iter, inserted] = funcMap.try_emplace(funcOp, funcOp);
           if (inserted)
-            iter->second.run(funcMap, scratchSizeGetter);
+            iter->second.run(funcMap, scratchSizeGetter,
+                             sharedMemoryPartitionSize);
         });
   }
 
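
Since getBufferId(Value) is gone and getBufferIds(Value) now returns one id per logical piece (with the alias-aware query renamed to getAllBufferIdsWithAliases), callers iterate over a vector instead of handling a single id. A minimal sketch of the updated call pattern, assuming an Allocation and a Value obtained elsewhere in the analysis; the helper itself is hypothetical and not part of this commit:

#include "triton/Analysis/Allocation.h"
#include "llvm/ADT/STLExtras.h"

using namespace mlir;

// Hypothetical helper: gather every buffer id reachable from `v`.
// getBufferIds yields one id per partitioned piece (a single entry for
// non-partitioned values, nothing if `v` owns no shared-memory buffer);
// getAllBufferIdsWithAliases additionally covers alias buffers.
static SmallVector<Allocation::BufferId>
collectBufferIds(const Allocation &alloc, Value v) {
  SmallVector<Allocation::BufferId> ids;
  for (Allocation::BufferId id : alloc.getBufferIds(v))
    ids.push_back(id);
  for (Allocation::BufferId id : alloc.getAllBufferIdsWithAliases(v))
    if (!llvm::is_contained(ids, id))
      ids.push_back(id);
  return ids;
}

The new neighbors field and addPartitionBuffers work together with this: each partitioned piece gets its own BufferT, and pieces of the same value are recorded as neighbors so that offset assignment places them in distinct physical partitions.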

include/triton/Analysis/Utility.h

Lines changed: 41 additions & 13 deletions
@@ -3,6 +3,7 @@
 
 #include "mlir/Analysis/DataFlowFramework.h"
 #include "mlir/Analysis/SliceAnalysis.h"
+#include "mlir/IR/Builders.h"
 #include "mlir/Support/LLVM.h"
 #include "triton/Dialect/Triton/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
@@ -24,6 +25,22 @@ inline bool isZeroConst(Value v) {
 
 class ReduceOpHelper {
 public:
+  enum class InThreadVectorizeOpKind {
+    None,
+    AddF,
+    MulF,
+    MinNumF,
+    MaxNumF,
+    MinimumF,
+    MaximumF,
+    AddI,
+    MulI,
+    MinSI,
+    MaxSI,
+    MinUI,
+    MaxUI,
+  };
+
   explicit ReduceOpHelper(triton::ReduceOp op)
       : op(op.getOperation()), axis(op.getAxis()) {
     auto firstTy = cast<RankedTensorType>(op.getOperands()[0].getType());
@@ -42,30 +59,41 @@ class ReduceOpHelper {
     }
   }
 
-  ArrayRef<int64_t> getSrcShape() { return srcShape; }
+  RankedTensorType getSrcTy() { return srcTy; }
 
-  Attribute getSrcLayout() { return srcEncoding; }
+  unsigned getInterWarpSizeWithUniqueData();
 
-  triton::ReduceOp getOperation() { return op; }
+  unsigned getIntraWarpSizeWithUniqueData();
 
-  unsigned getThreadOffsetOnReductionAxis();
+  bool isReduceWithinCTA();
 
-  bool isWarpSynchronous();
+  bool isAssociative();
 
-  unsigned getInterWarpSizeWithUniqueData();
+  unsigned getScratchSizeInBytes();
 
-  unsigned getIntraWarpSizeWithUniqueData();
+  InThreadVectorizeOpKind
+  getInThreadVectorizeOpKind(unsigned axisPack,
+                             bool supportBitwidth16Elementwise,
+                             bool supportBitwidth32Elementwise);
 
-  // The shape of the shared memory space needed for the reduction.
-  SmallVector<unsigned> getScratchRepShape();
+  static triton::ColumnAction
+  moveAxisBasesToFront(const triton::LinearLayout &layout, int axis,
+                       bool isVectorized = false);
 
-  SmallVector<unsigned> getOrderWithAxisAtBeginning();
+  static triton::LinearLayout
+  zeroBasesAlongDimAndReorder(const triton::LinearLayout &layout, unsigned axis,
+                              mlir::StringAttr dim);
 
-  unsigned getScratchSizeInBytes();
+  static triton::LinearLayout getInterLayout(const triton::LinearLayout &layout,
+                                             unsigned axis);
 
-  bool isReduceWithinCTA();
+  static triton::LinearLayout reducedRegLaneLayout(RankedTensorType srcTy,
+                                                   unsigned axis);
 
-  bool isAssociative();
+  static Value createInThreadVectorizedCombineOp(OpBuilder &builder,
+                                                 Location loc,
+                                                 InThreadVectorizeOpKind kind,
+                                                 Value lhs, Value rhs);
 
 private:
   triton::ReduceOp op;
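
The InThreadVectorizeOpKind enum classifies a reduce's combine region so a lowering can emit one packed combine instead of per-element scalar ops. A hedged sketch of how the two new entry points might be consumed together, assuming ReduceOpHelper is visible from namespace mlir as in this header, and that the helper, builder, packed operands, and TargetInfoBase are supplied by the caller; the fallback logic is illustrative, not this commit's actual lowering:

#include "triton/Analysis/Utility.h"
#include "triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h"

using namespace mlir;
using Kind = ReduceOpHelper::InThreadVectorizeOpKind;

// Illustrative only: classify the combine op, then either emit a single
// vectorized combine or signal that the scalar path must be used.
static Value tryPackedCombine(ReduceOpHelper &helper, OpBuilder &builder,
                              Location loc, Value lhs, Value rhs,
                              unsigned axisPack,
                              const triton::TargetInfoBase &target) {
  Kind kind = helper.getInThreadVectorizeOpKind(
      axisPack, target.supportBitwidth16Elementwise(),
      target.supportBitwidth32Elementwise());
  if (kind == Kind::None)
    return Value(); // not a recognized associative op; fall back to scalars
  return ReduceOpHelper::createInThreadVectorizedCombineOp(builder, loc, kind,
                                                           lhs, rhs);
}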

include/triton/Conversion/TritonGPUToLLVM/Passes.td

Lines changed: 4 additions & 0 deletions
@@ -42,4 +42,8 @@ def TritonGPUAllocateWarpGroups : Pass<"tritongpu-allocate-warp-groups", "mlir::
   }];
 }
 
+def CanonicalizeLLVMIR : Pass<"canonicalize-llvm-ir", "mlir::LLVM::LLVMFuncOp"> {
+  let summary = "Canonicalize LLVM IR";
+}
+
 #endif
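
The pass is anchored on mlir::LLVM::LLVMFuncOp, so it would normally be added as a nested pass late in the GPU-to-LLVM pipeline. A sketch under the assumption that TableGen emits the usual createCanonicalizeLLVMIR() factory next to the registerCanonicalizeLLVMIR() hook seen in bin/RegisterTritonDialects.h above; neither the factory name nor the declaring header path is shown in this commit:

#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Pass/PassManager.h"
#include "triton/Conversion/TritonGPUToLLVM/Passes.h" // assumed declaring header

// Hypothetical pipeline wiring; only the pass name and its anchor op come
// from this commit, the factory function name is assumed.
void addCanonicalizeLLVMIR(mlir::PassManager &pm) {
  pm.addNestedPass<mlir::LLVM::LLVMFuncOp>(
      mlir::triton::gpu::createCanonicalizeLLVMIR());
}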

include/triton/Conversion/TritonGPUToLLVM/PatternTritonGPUOpToLLVM.h

Lines changed: 1 addition & 3 deletions
@@ -103,9 +103,7 @@ void populatePrintOpToLLVMPattern(LLVMTypeConverter &typeConverter,
                                   PatternBenefit benefit);
 
 void populateInstrumentationToLLVMPatterns(LLVMTypeConverter &typeConverter,
-                                           const TargetInfoBase &targetInfo,
-                                           RewritePatternSet &patterns,
-                                           PatternBenefit benefit);
+                                           RewritePatternSet &patterns);
 
 } // namespace triton
 } // namespace mlir

include/triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h

Lines changed: 8 additions & 2 deletions
@@ -2,6 +2,7 @@
 #define TRITON_CONVERSION_TRITONGPU_TO_LLVM_TARGETINFOBASE_H
 
 #include "triton/Conversion/MLIRTypes.h"
+#include "llvm/ADT/ArrayRef.h"
 
 namespace mlir::triton {
 enum class ProgramIDDim : uint32_t;
@@ -66,8 +67,7 @@ class TargetInfoBase {
 
   virtual bool warpReduce(RewriterBase &rewriter, Location loc,
                           SmallVector<Value> &acc, triton::ReduceOp op,
-                          unsigned numLaneToReduce,
-                          unsigned interleave) const = 0;
+                          unsigned reduceLaneIdMask) const = 0;
 
   virtual std::string getMulhiFuncName(Type resultElementTy) const = 0;
   // Emits LLVM code with |rewriter| to print a message following the given
@@ -102,8 +102,14 @@ class TargetInfoBase {
   virtual bool supportLdMatrix() const { return false; }
   virtual bool supportStMatrix() const { return false; }
   virtual bool supportLdStMatrixB8() const { return false; }
+  virtual bool supportBitwidth16Elementwise() const { return false; }
+  virtual bool supportBitwidth32Elementwise() const { return false; }
   virtual bool isCuda() const { return false; }
 
+  // Returns the shared memory partition size in bytes. A value of 0 means
+  // shared memory is not partitioned.
+  virtual size_t getSharedMemoryPartitionSize() const { return 0; }
+
   // Annotate target specific information to local load operations during
   // lowering to LLVM. `llLoadOp` is the generated LLVM load op.
   virtual void localLoadOpAnnotation(triton::gpu::LocalLoadOp localLoadOp,
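
All three new hooks are virtual with conservative defaults, so existing targets keep compiling unchanged and a backend opts in by overriding them. A hypothetical subclass sketch; the class name and the 32 KB figure are illustrative, not from this commit:

#include "triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h"

// Hypothetical target that opts into the new capabilities.
class MyTargetInfo : public mlir::triton::TargetInfoBase {
public:
  // Allow in-thread vectorized reduce combines on 16- and 32-bit elements.
  bool supportBitwidth16Elementwise() const override { return true; }
  bool supportBitwidth32Elementwise() const override { return true; }
  // Shared memory is split into 32 KB partitions; 0 (the base-class default)
  // means "not partitioned". ModuleAllocation's new sharedMemoryPartitionSize
  // parameter is the intended consumer of this value.
  size_t getSharedMemoryPartitionSize() const override { return 32 * 1024; }
  // The remaining pure-virtual methods (warpReduce with its new
  // reduceLaneIdMask parameter, getMulhiFuncName, ...) still need overrides
  // and are omitted here.
};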

include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 14 additions & 0 deletions
@@ -342,6 +342,9 @@ LLVM::LLVMFuncOp appendOrGetExternFuncOp(RewriterBase &rewriter, Operation *op,
 
 // Multiply a square layout with 1 input and output dimension with a vector
 Value matrixVectorProd(TritonLLVMOpBuilder &b, const LinearLayout &A, Value x);
+
+// Whether the convert layout should be forced to use warp shuffles.
+bool cvtAlwaysUseWarpShuffle(triton::gpu::ConvertLayoutOp cvt);
 } // namespace gpu
 
 } // namespace triton
@@ -442,6 +445,9 @@ Value linearize(RewriterBase &rewriter, Location loc, ArrayRef<Value> multiDim,
 size_t linearize(ArrayRef<unsigned> multiDim, ArrayRef<unsigned> shape,
                  ArrayRef<unsigned> order);
 
+GlobalOp getOrInsertGlobalConstant(RewriterBase &rewriter, ModuleOp module,
+                                   Type type, Attribute content, StringRef key);
+
 Value addStringToModule(Location loc, RewriterBase &rewriter, StringRef key,
                         StringRef content);
 
@@ -630,6 +636,14 @@ SmallVector<Value> inlineRegion(RewriterBase &rewriter, Region &region,
       mlir::TypeID::get<TerminatorOp>(), loc);
 }
 
+// #prevBlock
+// if (condition) {
+//   #ifBlock
+// }
+// #thenBlock
+std::tuple</*prevBlock=*/Block *, /*ifBlock=*/Block *, /*thenBlock=*/Block *>
+createIfBlock(ConversionPatternRewriter &b, Location loc, Value cnd);
+
 void finalizeTensorAtomicResults(Operation *op, RankedTensorType tensorTy,
                                  ConversionPatternRewriter &rewriter,
                                  SmallVector<Value> &resultVals,
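
The block diagram in the comment above createIfBlock corresponds to a usage pattern like the one below: ops emitted into ifBlock run only when the condition holds, and lowering resumes in thenBlock. A hedged sketch; the wrapper function is hypothetical, and the namespace qualification of createIfBlock (elided here) follows this header:

#include "triton/Conversion/TritonGPUToLLVM/Utility.h"

using namespace mlir;

// Hypothetical helper for a conversion pattern: build `buildBody` under
// `pred`, then leave the insertion point in the fall-through block.
static void emitGuarded(ConversionPatternRewriter &rewriter, Location loc,
                        Value pred, llvm::function_ref<void()> buildBody) {
  auto [prevBlock, ifBlock, thenBlock] = createIfBlock(rewriter, loc, pred);
  (void)prevBlock; // block that now ends with the conditional branch
  rewriter.setInsertionPointToStart(ifBlock);
  buildBody(); // ops created here execute only when `pred` is true
  rewriter.setInsertionPointToStart(thenBlock);
  // lowering continues unconditionally from here
}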
