triton-lang
diff --git a/‎.github/workflows/integration-tests-nvidia.yml‎
Lines changed: 7 additions & 6 deletions b/‎.github/workflows/integration-tests-nvidia.yml‎
Lines changed: 7 additions & 6 deletions
diff --git a/‎.github/workflows/llvm-build.yml‎
Lines changed: 3 additions & 0 deletions b/‎.github/workflows/llvm-build.yml‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎.github/workflows/runner-preparation.yml‎
Lines changed: 2 additions & 2 deletions b/‎.github/workflows/runner-preparation.yml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎bin/RegisterTritonDialects.h‎
Lines changed: 2 additions & 1 deletion b/‎bin/RegisterTritonDialects.h‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎cmake/llvm-hash.txt‎
Lines changed: 1 addition & 1 deletion b/‎cmake/llvm-hash.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎cmake/nvidia-toolchain-version.json‎
Lines changed: 1 addition & 1 deletion b/‎cmake/nvidia-toolchain-version.json‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎docs/python-api/triton.language.rst‎
Lines changed: 1 addition & 0 deletions b/‎docs/python-api/triton.language.rst‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎examples/plugins/README.md‎
Lines changed: 5 additions & 5 deletions b/‎examples/plugins/README.md‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎examples/plugins/TritonPlugin.cpp‎
Lines changed: 1 addition & 1 deletion b/‎examples/plugins/TritonPlugin.cpp‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎include/triton/Analysis/Allocation.h‎
Lines changed: 56 additions & 22 deletions b/‎include/triton/Analysis/Allocation.h‎
Lines changed: 56 additions & 22 deletions
@@ -9,15 +9,16 @@ on:
 
 jobs:
   integration-tests-nvidia:
-    runs-on: ${{ matrix.runner }}
+    name: integration-tests-nvidia (${{ matrix.config.name }})
+    runs-on: ${{ matrix.config.runs_on }}
     timeout-minutes: 60
     # Let A100 and H100 continue even if GB200 fails, as it's a bit flaky
-    continue-on-error: ${{ matrix.runner[0] == 'nvidia-gb200'}}
+    continue-on-error: ${{ startsWith(matrix.config.runner_type, 'nvidia-gb200') }}
     strategy:
       matrix:
-        runner: ${{ fromJson(inputs.matrix) }}
+        config: ${{ fromJson(inputs.matrix) }}
     env:
-      RUNNER_TYPE: ${{ matrix.runner[0] }}
+      RUNNER_TYPE: ${{ matrix.config.runner_type }}
       TRITON_BUILD_WITH_CCACHE: "true"
       TRITON_BUILD_WITH_CLANG_LLD: "TRUE"
       TRITON_USE_ASSERT_ENABLED_LLVM: "TRUE"
@@ -69,7 +70,7 @@ jobs:
         run: |
           echo "$HOME/.local/bin" >> $GITHUB_PATH
       - name: Setup Python environment for GB200
-        if: ${{ matrix.runner[0] == 'nvidia-gb200' }}
+        if: ${{ startsWith(matrix.config.runner_type, 'nvidia-gb200') }}
         run: |
           echo "/venv/bin" >> $GITHUB_PATH
           echo "VIRTUAL_ENV=/venv" >> $GITHUB_ENV
@@ -90,7 +91,7 @@ jobs:
       - name: Run python tests on CUDA
         run: make NUM_PROCS=24 test-unit
       - name: Run interpreter tests
-        if: ${{ matrix.runner[0] == 'nvidia-h100' }}
+        if: ${{ matrix.config.runner_type == 'nvidia-h100' }}
         run: make test-interpret
       - name: Run regression tests
         run: make test-regression
 
@@ -104,6 +104,7 @@ jobs:
         sudo apt-get clean
         df -h
         echo "Removing large directories"
+        # deleting 15GB
         df -h
 
     - name: Configure, Build, Test, and Install LLVM (Ubuntu and macOS x64)
@@ -214,6 +215,8 @@ jobs:
         -DCMAKE_RANLIB="/usr/bin/aarch64-linux-gnu-ranlib" \
         -DCMAKE_STRIP="/usr/bin/aarch64-linux-gnu-strip" \
         -DCMAKE_SYSROOT=$SYSROOT \
+        -DLLVM_INCLUDE_TESTS=OFF \
+        -DMLIR_INCLUDE_TESTS=OFF \
         -DLLVM_ENABLE_TERMINFO=OFF \
         llvm-project/llvm
         ninja -C llvm-project/build install
 
@@ -95,11 +95,11 @@ jobs:
         if: env.enable_integration == 'true'
         run: |
           if [ x"${{ github.repository }}" == x"triton-lang/triton" ]; then
-            echo '::set-output name=matrix-NVIDIA::[["nvidia-a100"], ["nvidia-h100"], ["nvidia-gb200"]]'
+            echo '::set-output name=matrix-NVIDIA::[{"name":"nvidia-a100","runner_type":"nvidia-a100","runs_on":["nvidia-a100"]},{"name":"nvidia-h100","runner_type":"nvidia-h100","runs_on":["nvidia-h100"]},{"name":"nvidia-gb200","runner_type":"nvidia-gb200","runs_on":{"group":"gb200-runner-set"}}]'
             echo '::set-output name=matrix-AMD::[["self-hosted", "gfx90a"], ["amd-gfx942"], ["amd-gfx950"]]'
             echo '::set-output name=matrix-MACOS::[["macos-latest"]]'
           else
-            echo '::set-output name=matrix-NVIDIA::["ubuntu-latest"]'
+            echo '::set-output name=matrix-NVIDIA::[{"name":"ubuntu-latest","runner_type":"ubuntu-latest","runs_on":"ubuntu-latest"}]'
             echo '::set-output name=matrix-AMD::["ubuntu-latest"]'
             echo '::set-output name=matrix-MACOS::[["macos-latest"]]'
           fi
@@ -86,6 +86,7 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
   mlir::triton::gpu::registerAllocateSharedMemoryPass();
   mlir::triton::gpu::registerTritonGPUAllocateWarpGroups();
   mlir::triton::gpu::registerTritonGPUGlobalScratchAllocationPass();
+  mlir::triton::gpu::registerCanonicalizeLLVMIR();
   mlir::triton::registerConvertWarpSpecializeToLLVM();
   mlir::triton::registerConvertTritonGPUToLLVMPass();
   mlir::triton::registerConvertNVGPUToLLVMPass();
@@ -114,7 +115,7 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
   mlir::registerTritonAMDGPUHoistLayoutConversions();
   mlir::registerTritonAMDGPUSinkLayoutConversions();
   mlir::registerTritonAMDGPUPrepareIfCombining();
-  mlir::registerTritonAMDGPUReorderInstructions();
+  mlir::registerTritonAMDGPUMoveUpPrologueLoads();
   mlir::registerTritonAMDGPUBlockPingpong();
   mlir::registerTritonAMDGPUPipeline();
   mlir::registerTritonAMDGPUScheduleLoops();
 
@@ -1 +1 @@
-0729a74e66aeeb7a9839d80bfd64fc49b2e69f52
+ac5dc54d509169d387fcfd495d71853d81c46484
@@ -1,5 +1,5 @@
 {
-  "ptxas-blackwell": "12.9.86",
+  "ptxas-blackwell": "13.1.80",
   "ptxas": "12.9.86",
   "cuobjdump": "13.1.80",
   "nvdisasm": "13.1.80",
 
@@ -150,6 +150,7 @@ Scan/Sort Ops
     cumsum
     histogram
     sort
+    topk
     gather
 
 Atomic Ops
 
@@ -194,14 +194,14 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
 #loc1 = loc("/home/triton/test.py":14:4)
 ```
 
-The hook, as defined, in the example will insert the pass at the end of the make_ttir pipeline but it's placement in the Triton pipeline is abritary.
+The hook, as defined in the example, will insert the pass at the end of the make_ttir pipeline, but its placement in the Triton pipeline is arbitrary.
 This functionality can be toggled on and off by just commenting out this line in kernel code (or setting to None):
 knobs.runtime.add_stages_inspection_hook = inspect_stages_hook
 without needing any core compiler changes or rebuilding Triton.
 
-## Example 3: Inserting a new pass into the compiler pipeline at an arbitary point.
+## Example 3: Inserting a new pass into the compiler pipeline at an arbitrary point.
 
-Example 2 added a new pass to the end of the ttgir "stage". However the plugin pass's location is arbitary and can be dynamically inserted anywhere in the pipeline. Replacing the inspect_stages_hook function from example 2 instead with:
+Example 2 added a new pass to the end of the ttgir "stage". However, the plugin pass's location is arbitrary and can be dynamically inserted anywhere in the pipeline. Replacing the inspect_stages_hook function from example 2 instead with:
 
 ```python
 def inspect_stages_hook(self=None, stages=None, options=None, language=None, capability=None):
@@ -223,9 +223,9 @@ def inspect_stages_hook(self=None, stages=None, options=None, language=None, cap
     stages["ttir"] = make_lambda(module.make_ttir)
     return get_key(), get_hash()
 ```
-directs the new pass's placement based on other surrounding passes. Knowing which passes are in the pipeline a priori can challenging, therefore in the next example we show how to dump and inspect the entire pipeline that is run for a particlar kernel to allow for precise placement of specialized out of tree passes even if the upstream pass pipeline structure changes.
+directs the new pass's placement based on other surrounding passes. Knowing which passes are in the pipeline a priori can be challenging; therefore, in the next example we show how to dump and inspect the entire pipeline that is run for a particular kernel to allow for precise placement of specialized out-of-tree passes even if the upstream pass pipeline structure changes.
 
-## Example 4: Fully customizing the compiler pipeline with pass and op insertions at abitrary locations
+## Example 4: Fully customizing the compiler pipeline with pass and op insertions at arbitrary locations
 
 Here we now run two kernels one with the full standard Triton pipeline and one with fully customized pipeline entirely from within
 kernel code with modifying any core Triton compiler code or recompiling. We run the kernel with a hook to output the standard pipeline, modify
 
@@ -78,7 +78,7 @@ tritonEnumeratePluginPasses(uint32_t *passCount, const char **passNames) {
     return TP_SUCCESS;
   unsigned i = 0;
   for (auto passName : passNamesTable) {
-    passNames[i] = passName;
+    passNames[i++] = passName;
   }
   return TP_SUCCESS;
 }
@@ -5,7 +5,6 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SetVector.h"
-#include "llvm/Support/raw_ostream.h"
 
 #include <limits>
 
@@ -20,6 +19,10 @@ using AllocationAnalysisScratchSizeFn = std::function<unsigned(Operation *)>;
 
 unsigned defaultAllocationAnalysisScratchSizeFn(Operation *op);
 
+unsigned getNumScratchElemsSwizzledCvt(const LinearLayout &srcLayout,
+                                       const LinearLayout &dstLayout,
+                                       int bitwidth);
+
 unsigned getNumScratchElemsSwizzledCvt(RankedTensorType srcTy,
                                        RankedTensorType dstTy);
 
@@ -70,8 +73,11 @@ class Allocation {
   explicit Allocation(Operation *operation) : operation(operation) {}
 
   /// Runs allocation analysis on the given top-level operation.
+  /// \param sharedMemoryPartitionSize The size of each shared memory partition
+  ///        in bytes. A value of 0 means shared memory is not partitioned.
   void run(FuncAllocMapT &funcAllocMap,
-           triton::AllocationAnalysisScratchSizeFn scratchSizeGetter);
+           triton::AllocationAnalysisScratchSizeFn scratchSizeGetter,
+           size_t sharedMemoryPartitionSize = 0);
 
   /// Returns the operation this analysis was constructed from.
   Operation *getOperation() const { return operation; }
@@ -92,24 +98,29 @@ class Allocation {
     return Interval<size_t>(buffer.offset, buffer.offset + buffer.size);
   }
 
-  /// Returns the buffer id of the given value.
-  /// This interface only returns the allocated buffer id.
-  /// If you want to get all the buffer ids that are associated with the given
-  /// value, including alias buffers, use getBufferIds.
-  BufferId getBufferId(Value value) const {
-    if (valueBuffer.count(value)) {
-      return valueBuffer.lookup(value)->id;
-    } else {
-      return InvalidBufferId;
+  /// Returns all buffer ids for a value.
+  /// For partitioned tensors, returns all logical piece buffer ids.
+  /// For non-partitioned values, returns a single-element vector.
+  /// Returns empty vector if value has no associated buffer.
+  SmallVector<BufferId> getBufferIds(Value value) const {
+    SmallVector<BufferId> bufferIds;
+    auto it = valueBuffer.find(value);
+    if (it == valueBuffer.end())
+      return bufferIds;
+
+    for (auto *buffer : it->second) {
+      bufferIds.push_back(buffer->id);
     }
+    return bufferIds;
   }
 
-  /// Returns all the buffer ids of the given value, including alias buffers.
-  BufferIdSetT getBufferIds(Value value) const {
+  /// Returns all buffer ids of the given value, including alias buffers.
+  /// This is a superset of getBufferIds that also includes aliased buffers.
+  BufferIdSetT getAllBufferIdsWithAliases(Value value) const {
     BufferIdSetT bufferIds;
-    auto allocBufferId = getBufferId(value);
-    if (allocBufferId != InvalidBufferId)
-      bufferIds.insert(allocBufferId);
+    for (auto bufferId : getBufferIds(value)) {
+      bufferIds.insert(bufferId);
+    }
     for (auto *buffer : aliasBuffer.lookup(value)) {
       if (buffer->id != InvalidBufferId)
         bufferIds.insert(buffer->id);
@@ -133,6 +144,11 @@ class Allocation {
     return bufferSet.at(bufferId).kind == BufferT::BufferKind::Virtual;
   }
 
+  /// Returns if the given buffer is an explicit buffer.
+  bool isExplicitBuffer(BufferId bufferId) const {
+    return bufferSet.at(bufferId).kind == BufferT::BufferKind::Explicit;
+  }
+
   /// Returns the size of total shared memory allocated
   size_t getSharedMemorySize() const { return sharedMemorySize; }
 
@@ -154,6 +170,10 @@ class Allocation {
     size_t alignment;
     size_t offset;
 
+    /// For partitioned tensors: buffers that reside in different physical
+    /// partitions.
+    SmallVector<BufferT *> neighbors;
+
     bool operator==(const BufferT &other) const { return id == other.id; }
     bool operator<(const BufferT &other) const { return id < other.id; }
 
@@ -169,8 +189,8 @@ class Allocation {
 
   /// Op -> Scratch Buffer
   using OpScratchMapT = llvm::MapVector<Operation *, BufferT *>;
-  /// Value -> Explicit Buffer
-  using ValueBufferMapT = llvm::MapVector<Value, BufferT *>;
+  /// Value -> Explicit Buffers (vector for partitioned tensors)
+  using ValueBufferMapT = llvm::MapVector<Value, SmallVector<BufferT *>>;
   /// Value -> Alias Buffer
   using AliasBufferMapT = llvm::MapVector<Value, llvm::SetVector<BufferT *>>;
   /// BufferId -> Buffer
@@ -184,16 +204,28 @@ class Allocation {
         nextId, BufferT(Kind, nextId, key, std::forward<Args>(args)...));
     BufferT *buffer = &it->second;
     if constexpr (Kind == BufferT::BufferKind::Explicit) {
-      valueBuffer[key] = buffer;
+      valueBuffer[key].push_back(buffer);
     } else if constexpr (Kind == BufferT::BufferKind::Virtual) {
       opVirtual[key] = buffer;
     } else {
       opScratch[key] = buffer;
     }
   }
 
+  /// Create multiple buffers for partitions where all different partitions
+  /// are neighbors (must be placed in different physical shared memory slots).
+  ///
+  /// \param key The value that owns these buffers
+  /// \param numPartitions Number of partition buffers to create
+  /// \param partitionSize Size of each partition buffer in bytes
+  /// \param alignment Required alignment for each buffer
+  void addPartitionBuffers(Value key, unsigned numPartitions,
+                           size_t partitionSize, size_t alignment);
+
   void addAlias(Value value, Value alloc) {
-    aliasBuffer[value].insert(valueBuffer[alloc]);
+    for (auto *buffer : valueBuffer[alloc]) {
+      aliasBuffer[value].insert(buffer);
+    }
   }
 
 private:
@@ -222,7 +254,8 @@ class ModuleAllocation : public triton::CallGraph<Allocation> {
 
   ModuleAllocation(ModuleOp moduleOp,
                    triton::AllocationAnalysisScratchSizeFn scratchSizeGetter =
-                       triton::defaultAllocationAnalysisScratchSizeFn)
+                       triton::defaultAllocationAnalysisScratchSizeFn,
+                   size_t sharedMemoryPartitionSize = 0)
       : triton::CallGraph<Allocation>(moduleOp) {
     walk<WalkOrder::PreOrder, WalkOrder::PostOrder>(
         // Pre-order edge walk callback
@@ -231,7 +264,8 @@ class ModuleAllocation : public triton::CallGraph<Allocation> {
         [&](FunctionOpInterface funcOp) {
           auto [iter, inserted] = funcMap.try_emplace(funcOp, funcOp);
           if (inserted)
-            iter->second.run(funcMap, scratchSizeGetter);
+            iter->second.run(funcMap, scratchSizeGetter,
+                             sharedMemoryPartitionSize);
         });
   }
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-0729a74e66aeeb7a9839d80bfd64fc49b2e69f52`
	`1`	`+ac5dc54d509169d387fcfd495d71853d81c46484`
Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,5 @@`
`1`	`1`	`{`
`2`		`- "ptxas-blackwell": "12.9.86",`
	`2`	`+ "ptxas-blackwell": "13.1.80",`
`3`	`3`	`"ptxas": "12.9.86",`
`4`	`4`	`"cuobjdump": "13.1.80",`
`5`	`5`	`"nvdisasm": "13.1.80",`
Original file line number	Diff line number	Diff line change
`@@ -78,7 +78,7 @@ tritonEnumeratePluginPasses(uint32_t *passCount, const char **passNames) {`
`78`	`78`	`return TP_SUCCESS;`
`79`	`79`	`unsigned i = 0;`
`80`	`80`	`for (auto passName : passNamesTable) {`
`81`		`- passNames[i] = passName;`
	`81`	`+ passNames[i++] = passName;`
`82`	`82`	`}`
`83`	`83`	`return TP_SUCCESS;`
`84`	`84`	`}`