Skip to content

Commit 6b9ab39

Browse files
authored
Merge branch 'main' into sjw/codegen-names
2 parents cff5e5a + a6e9634 commit 6b9ab39

245 files changed

Lines changed: 16636 additions & 4180 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/integration-tests-nvidia.yml

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,15 +9,16 @@ on:
99

1010
jobs:
1111
integration-tests-nvidia:
12-
runs-on: ${{ matrix.runner }}
12+
name: integration-tests-nvidia (${{ matrix.config.name }})
13+
runs-on: ${{ matrix.config.runs_on }}
1314
timeout-minutes: 60
1415
# Let A100 and H100 continue even if GB200 fails, as it's a bit flaky
15-
continue-on-error: ${{ matrix.runner[0] == 'nvidia-gb200'}}
16+
continue-on-error: ${{ startsWith(matrix.config.runner_type, 'nvidia-gb200') }}
1617
strategy:
1718
matrix:
18-
runner: ${{ fromJson(inputs.matrix) }}
19+
config: ${{ fromJson(inputs.matrix) }}
1920
env:
20-
RUNNER_TYPE: ${{ matrix.runner[0] }}
21+
RUNNER_TYPE: ${{ matrix.config.runner_type }}
2122
TRITON_BUILD_WITH_CCACHE: "true"
2223
TRITON_BUILD_WITH_CLANG_LLD: "TRUE"
2324
TRITON_USE_ASSERT_ENABLED_LLVM: "TRUE"
@@ -69,7 +70,7 @@ jobs:
6970
run: |
7071
echo "$HOME/.local/bin" >> $GITHUB_PATH
7172
- name: Setup Python environment for GB200
72-
if: ${{ matrix.runner[0] == 'nvidia-gb200' }}
73+
if: ${{ startsWith(matrix.config.runner_type, 'nvidia-gb200') }}
7374
run: |
7475
echo "/venv/bin" >> $GITHUB_PATH
7576
echo "VIRTUAL_ENV=/venv" >> $GITHUB_ENV
@@ -90,7 +91,7 @@ jobs:
9091
- name: Run python tests on CUDA
9192
run: make NUM_PROCS=24 test-unit
9293
- name: Run interpreter tests
93-
if: ${{ matrix.runner[0] == 'nvidia-h100' }}
94+
if: ${{ matrix.config.runner_type == 'nvidia-h100' }}
9495
run: make test-interpret
9596
- name: Run regression tests
9697
run: make test-regression

.github/workflows/llvm-build.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ jobs:
104104
sudo apt-get clean
105105
df -h
106106
echo "Removing large directories"
107+
# deleting 15GB
107108
df -h
108109
109110
- name: Configure, Build, Test, and Install LLVM (Ubuntu and macOS x64)
@@ -214,6 +215,8 @@ jobs:
214215
-DCMAKE_RANLIB="/usr/bin/aarch64-linux-gnu-ranlib" \
215216
-DCMAKE_STRIP="/usr/bin/aarch64-linux-gnu-strip" \
216217
-DCMAKE_SYSROOT=$SYSROOT \
218+
-DLLVM_INCLUDE_TESTS=OFF \
219+
-DMLIR_INCLUDE_TESTS=OFF \
217220
-DLLVM_ENABLE_TERMINFO=OFF \
218221
llvm-project/llvm
219222
ninja -C llvm-project/build install

.github/workflows/runner-preparation.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -95,11 +95,11 @@ jobs:
9595
if: env.enable_integration == 'true'
9696
run: |
9797
if [ x"${{ github.repository }}" == x"triton-lang/triton" ]; then
98-
echo '::set-output name=matrix-NVIDIA::[["nvidia-a100"], ["nvidia-h100"], ["nvidia-gb200"]]'
98+
echo '::set-output name=matrix-NVIDIA::[{"name":"nvidia-a100","runner_type":"nvidia-a100","runs_on":["nvidia-a100"]},{"name":"nvidia-h100","runner_type":"nvidia-h100","runs_on":["nvidia-h100"]},{"name":"nvidia-gb200","runner_type":"nvidia-gb200","runs_on":{"group":"gb200-runner-set"}}]'
9999
echo '::set-output name=matrix-AMD::[["self-hosted", "gfx90a"], ["amd-gfx942"], ["amd-gfx950"]]'
100100
echo '::set-output name=matrix-MACOS::[["macos-latest"]]'
101101
else
102-
echo '::set-output name=matrix-NVIDIA::["ubuntu-latest"]'
102+
echo '::set-output name=matrix-NVIDIA::[{"name":"ubuntu-latest","runner_type":"ubuntu-latest","runs_on":"ubuntu-latest"}]'
103103
echo '::set-output name=matrix-AMD::["ubuntu-latest"]'
104104
echo '::set-output name=matrix-MACOS::[["macos-latest"]]'
105105
fi

bin/RegisterTritonDialects.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
8686
mlir::triton::gpu::registerAllocateSharedMemoryPass();
8787
mlir::triton::gpu::registerTritonGPUAllocateWarpGroups();
8888
mlir::triton::gpu::registerTritonGPUGlobalScratchAllocationPass();
89+
mlir::triton::gpu::registerCanonicalizeLLVMIR();
8990
mlir::triton::registerConvertWarpSpecializeToLLVM();
9091
mlir::triton::registerConvertTritonGPUToLLVMPass();
9192
mlir::triton::registerConvertNVGPUToLLVMPass();
@@ -114,7 +115,7 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
114115
mlir::registerTritonAMDGPUHoistLayoutConversions();
115116
mlir::registerTritonAMDGPUSinkLayoutConversions();
116117
mlir::registerTritonAMDGPUPrepareIfCombining();
117-
mlir::registerTritonAMDGPUReorderInstructions();
118+
mlir::registerTritonAMDGPUMoveUpPrologueLoads();
118119
mlir::registerTritonAMDGPUBlockPingpong();
119120
mlir::registerTritonAMDGPUPipeline();
120121
mlir::registerTritonAMDGPUScheduleLoops();

cmake/llvm-hash.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
0729a74e66aeeb7a9839d80bfd64fc49b2e69f52
1+
ac5dc54d509169d387fcfd495d71853d81c46484

cmake/nvidia-toolchain-version.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
2-
"ptxas-blackwell": "12.9.86",
2+
"ptxas-blackwell": "13.1.80",
33
"ptxas": "12.9.86",
44
"cuobjdump": "13.1.80",
55
"nvdisasm": "13.1.80",

docs/python-api/triton.language.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,7 @@ Scan/Sort Ops
150150
cumsum
151151
histogram
152152
sort
153+
topk
153154
gather
154155

155156
Atomic Ops

examples/plugins/README.md

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -194,14 +194,14 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
194194
#loc1 = loc("/home/triton/test.py":14:4)
195195
```
196196

197-
The hook, as defined, in the example will insert the pass at the end of the make_ttir pipeline but it's placement in the Triton pipeline is abritary.
197+
The hook, as defined, in the example will insert the pass at the end of the make_ttir pipeline but it's placement in the Triton pipeline is arbitrary.
198198
This functionality can be toggled on and off by just commenting out this line in kernel code (or setting to None):
199199
knobs.runtime.add_stages_inspection_hook = inspect_stages_hook
200200
without needing any core compiler changes or rebuilding Triton.
201201

202-
## Example 3: Inserting a new pass into the compiler pipeline at an arbitary point.
202+
## Example 3: Inserting a new pass into the compiler pipeline at an arbitrary point.
203203

204-
Example 2 added a new pass to the end of the ttgir "stage". However the plugin pass's location is arbitary and can be dynamically inserted anywhere in the pipeline. Replacing the inspect_stages_hook function from example 2 instead with:
204+
Example 2 added a new pass to the end of the ttgir "stage". However the plugin pass's location is arbitrary and can be dynamically inserted anywhere in the pipeline. Replacing the inspect_stages_hook function from example 2 instead with:
205205

206206
```python
207207
def inspect_stages_hook(self=None, stages=None, options=None, language=None, capability=None):
@@ -223,9 +223,9 @@ def inspect_stages_hook(self=None, stages=None, options=None, language=None, cap
223223
stages["ttir"] = make_lambda(module.make_ttir)
224224
return get_key(), get_hash()
225225
```
226-
directs the new pass's placement based on other surrounding passes. Knowing which passes are in the pipeline a priori can challenging, therefore in the next example we show how to dump and inspect the entire pipeline that is run for a particlar kernel to allow for precise placement of specialized out of tree passes even if the upstream pass pipeline structure changes.
226+
directs the new pass's placement based on other surrounding passes. Knowing which passes are in the pipeline a priori can be challenging, therefore in the next example we show how to dump and inspect the entire pipeline that is run for a particular kernel to allow for precise placement of specialized out of tree passes even if the upstream pass pipeline structure changes.
227227

228-
## Example 4: Fully customizing the compiler pipeline with pass and op insertions at abitrary locations
228+
## Example 4: Fully customizing the compiler pipeline with pass and op insertions at arbitrary locations
229229

230230
Here we now run two kernels one with the full standard Triton pipeline and one with fully customized pipeline entirely from within
231231
kernel code with modifying any core Triton compiler code or recompiling. We run the kernel with a hook to output the standard pipeline, modify

examples/plugins/TritonPlugin.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ tritonEnumeratePluginPasses(uint32_t *passCount, const char **passNames) {
7878
return TP_SUCCESS;
7979
unsigned i = 0;
8080
for (auto passName : passNamesTable) {
81-
passNames[i] = passName;
81+
passNames[i++] = passName;
8282
}
8383
return TP_SUCCESS;
8484
}

include/triton/Analysis/Allocation.h

Lines changed: 56 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
#include "llvm/ADT/DenseMap.h"
66
#include "llvm/ADT/MapVector.h"
77
#include "llvm/ADT/SetVector.h"
8-
#include "llvm/Support/raw_ostream.h"
98

109
#include <limits>
1110

@@ -20,6 +19,10 @@ using AllocationAnalysisScratchSizeFn = std::function<unsigned(Operation *)>;
2019

2120
unsigned defaultAllocationAnalysisScratchSizeFn(Operation *op);
2221

22+
unsigned getNumScratchElemsSwizzledCvt(const LinearLayout &srcLayout,
23+
const LinearLayout &dstLayout,
24+
int bitwidth);
25+
2326
unsigned getNumScratchElemsSwizzledCvt(RankedTensorType srcTy,
2427
RankedTensorType dstTy);
2528

@@ -70,8 +73,11 @@ class Allocation {
7073
explicit Allocation(Operation *operation) : operation(operation) {}
7174

7275
/// Runs allocation analysis on the given top-level operation.
76+
/// \param sharedMemoryPartitionSize The size of each shared memory partition
77+
/// in bytes. A value of 0 means shared memory is not partitioned.
7378
void run(FuncAllocMapT &funcAllocMap,
74-
triton::AllocationAnalysisScratchSizeFn scratchSizeGetter);
79+
triton::AllocationAnalysisScratchSizeFn scratchSizeGetter,
80+
size_t sharedMemoryPartitionSize = 0);
7581

7682
/// Returns the operation this analysis was constructed from.
7783
Operation *getOperation() const { return operation; }
@@ -92,24 +98,29 @@ class Allocation {
9298
return Interval<size_t>(buffer.offset, buffer.offset + buffer.size);
9399
}
94100

95-
/// Returns the buffer id of the given value.
96-
/// This interface only returns the allocated buffer id.
97-
/// If you want to get all the buffer ids that are associated with the given
98-
/// value, including alias buffers, use getBufferIds.
99-
BufferId getBufferId(Value value) const {
100-
if (valueBuffer.count(value)) {
101-
return valueBuffer.lookup(value)->id;
102-
} else {
103-
return InvalidBufferId;
101+
/// Returns all buffer ids for a value.
102+
/// For partitioned tensors, returns all logical piece buffer ids.
103+
/// For non-partitioned values, returns a single-element vector.
104+
/// Returns empty vector if value has no associated buffer.
105+
SmallVector<BufferId> getBufferIds(Value value) const {
106+
SmallVector<BufferId> bufferIds;
107+
auto it = valueBuffer.find(value);
108+
if (it == valueBuffer.end())
109+
return bufferIds;
110+
111+
for (auto *buffer : it->second) {
112+
bufferIds.push_back(buffer->id);
104113
}
114+
return bufferIds;
105115
}
106116

107-
/// Returns all the buffer ids of the given value, including alias buffers.
108-
BufferIdSetT getBufferIds(Value value) const {
117+
/// Returns all buffer ids of the given value, including alias buffers.
118+
/// This is a superset of getBufferIds that also includes aliased buffers.
119+
BufferIdSetT getAllBufferIdsWithAliases(Value value) const {
109120
BufferIdSetT bufferIds;
110-
auto allocBufferId = getBufferId(value);
111-
if (allocBufferId != InvalidBufferId)
112-
bufferIds.insert(allocBufferId);
121+
for (auto bufferId : getBufferIds(value)) {
122+
bufferIds.insert(bufferId);
123+
}
113124
for (auto *buffer : aliasBuffer.lookup(value)) {
114125
if (buffer->id != InvalidBufferId)
115126
bufferIds.insert(buffer->id);
@@ -133,6 +144,11 @@ class Allocation {
133144
return bufferSet.at(bufferId).kind == BufferT::BufferKind::Virtual;
134145
}
135146

147+
/// Returns if the given buffer is an explicit buffer.
148+
bool isExplicitBuffer(BufferId bufferId) const {
149+
return bufferSet.at(bufferId).kind == BufferT::BufferKind::Explicit;
150+
}
151+
136152
/// Returns the size of total shared memory allocated
137153
size_t getSharedMemorySize() const { return sharedMemorySize; }
138154

@@ -154,6 +170,10 @@ class Allocation {
154170
size_t alignment;
155171
size_t offset;
156172

173+
/// For partitioned tensors: buffers that reside in different physical
174+
/// partitions.
175+
SmallVector<BufferT *> neighbors;
176+
157177
bool operator==(const BufferT &other) const { return id == other.id; }
158178
bool operator<(const BufferT &other) const { return id < other.id; }
159179

@@ -169,8 +189,8 @@ class Allocation {
169189

170190
/// Op -> Scratch Buffer
171191
using OpScratchMapT = llvm::MapVector<Operation *, BufferT *>;
172-
/// Value -> Explicit Buffer
173-
using ValueBufferMapT = llvm::MapVector<Value, BufferT *>;
192+
/// Value -> Explicit Buffers (vector for partitioned tensors)
193+
using ValueBufferMapT = llvm::MapVector<Value, SmallVector<BufferT *>>;
174194
/// Value -> Alias Buffer
175195
using AliasBufferMapT = llvm::MapVector<Value, llvm::SetVector<BufferT *>>;
176196
/// BufferId -> Buffer
@@ -184,16 +204,28 @@ class Allocation {
184204
nextId, BufferT(Kind, nextId, key, std::forward<Args>(args)...));
185205
BufferT *buffer = &it->second;
186206
if constexpr (Kind == BufferT::BufferKind::Explicit) {
187-
valueBuffer[key] = buffer;
207+
valueBuffer[key].push_back(buffer);
188208
} else if constexpr (Kind == BufferT::BufferKind::Virtual) {
189209
opVirtual[key] = buffer;
190210
} else {
191211
opScratch[key] = buffer;
192212
}
193213
}
194214

215+
/// Create multiple buffers for partitions where all different partitions
216+
/// are neighbors (must be placed in different physical shared memory slots).
217+
///
218+
/// \param key The value that owns these buffers
219+
/// \param numPartitions Number of partition buffers to create
220+
/// \param partitionSize Size of each partition buffer in bytes
221+
/// \param alignment Required alignment for each buffer
222+
void addPartitionBuffers(Value key, unsigned numPartitions,
223+
size_t partitionSize, size_t alignment);
224+
195225
void addAlias(Value value, Value alloc) {
196-
aliasBuffer[value].insert(valueBuffer[alloc]);
226+
for (auto *buffer : valueBuffer[alloc]) {
227+
aliasBuffer[value].insert(buffer);
228+
}
197229
}
198230

199231
private:
@@ -222,7 +254,8 @@ class ModuleAllocation : public triton::CallGraph<Allocation> {
222254

223255
ModuleAllocation(ModuleOp moduleOp,
224256
triton::AllocationAnalysisScratchSizeFn scratchSizeGetter =
225-
triton::defaultAllocationAnalysisScratchSizeFn)
257+
triton::defaultAllocationAnalysisScratchSizeFn,
258+
size_t sharedMemoryPartitionSize = 0)
226259
: triton::CallGraph<Allocation>(moduleOp) {
227260
walk<WalkOrder::PreOrder, WalkOrder::PostOrder>(
228261
// Pre-order edge walk callback
@@ -231,7 +264,8 @@ class ModuleAllocation : public triton::CallGraph<Allocation> {
231264
[&](FunctionOpInterface funcOp) {
232265
auto [iter, inserted] = funcMap.try_emplace(funcOp, funcOp);
233266
if (inserted)
234-
iter->second.run(funcMap, scratchSizeGetter);
267+
iter->second.run(funcMap, scratchSizeGetter,
268+
sharedMemoryPartitionSize);
235269
});
236270
}
237271

0 commit comments

Comments
 (0)