Skip to content

Commit c287ebb

Browse files
[AMD] Optimize address increments for buffer loads in loops (#8464)
This PR moves address computation in buffer loads from the offsets to the base pointers, which reduces the amount of computation required and lowers register pressure. --------- Co-authored-by: Alexander Efimov <efimov.alexander@gmail.com>
1 parent 3254b53 commit c287ebb

9 files changed

Lines changed: 1256 additions & 0 deletions

File tree

bin/RegisterTritonDialects.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,7 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
122122
mlir::registerTritonAMDGPUCanonicalizePointers();
123123
mlir::registerTritonAMDGPUConvertToBufferOps();
124124
mlir::registerTritonAMDGPUConvertToTensorOps();
125+
mlir::registerTritonAMDGPUOptimizeBufferOpPtr();
125126
mlir::registerTritonAMDGPUInThreadTranspose();
126127
mlir::registerTritonAMDGPUCoalesceAsyncCopy();
127128
mlir::registerTritonAMDGPUUpdateAsyncWaitCount();

python/src/passes.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
11
#define ADD_PASS_WRAPPER_0(name, builder) \
22
m.def(name, [](mlir::PassManager &pm) { pm.addPass(builder()); })
33

4+
#define ADD_FUNC_PASS_WRAPPER_0(name, builder) \
5+
m.def(name, [](mlir::PassManager &pm) { \
6+
pm.addNestedPass<mlir::triton::FuncOp>(builder()); \
7+
});
8+
49
#define ADD_PASS_WRAPPER_1(name, builder, ty0) \
510
m.def(name, \
611
[](mlir::PassManager &pm, ty0 val0) { pm.addPass(builder(val0)); })

test/TritonGPU/amd/amd-optimize-buffer-ops-base-ptr-increment.mlir

Lines changed: 589 additions & 0 deletions
Large diffs are not rendered by default.

third_party/amd/backend/compiler.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -281,6 +281,7 @@ def make_ttgir(mod, metadata, options):
281281
knobs.amd.use_buffer_atomics,
282282
knobs.amd.buffer_ops_analyze_small_tensor_range,
283283
)
284+
amd.passes.ttgpuir.add_optimize_buffer_op_ptr(pm)
284285

285286
amd.passes.ttgpuir.add_fold_true_cmpi(pm)
286287
amd.passes.ttgpuir.add_prepare_if_combining(pm)

third_party/amd/include/TritonAMDGPUTransforms/Passes.td

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,14 @@ def TritonAMDGPUConvertToBufferOps : Pass<"tritonamdgpu-convert-buffer-ops", "ml
186186
];
187187
}
188188

189+
def TritonAMDGPUOptimizeBufferOpPtr : Pass<"tritonamdgpu-optimize-buffer-op-ptr", "mlir::triton::FuncOp"> {
190+
let summary = "Optimize address operands of buffer operations";
191+
192+
let description = "This pass optimizes address computation for buffer operations";
193+
194+
let dependentDialects = ["mlir::triton::amdgpu::TritonAMDGPUDialect"];
195+
}
196+
189197
def TritonAMDGPUBlockPingpong: Pass<"tritonamdgpu-block-pingpong", "mlir::ModuleOp"> {
190198
let summary = "Interleaving instructions from two warps on the same SIMD to better utilize matrix core";
191199

third_party/amd/lib/TritonAMDGPUTransforms/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ add_triton_library(TritonAMDGPUTransforms
55
CoalesceAsyncCopy.cpp
66
ConvertToBufferOps.cpp
77
ConvertToTensorOps.cpp
8+
OptimizeBufferOpPtr.cpp
89
OptimizeEpilogue.cpp
910
OptimizeDotOperands.cpp
1011
HoistLayoutConversions.cpp

0 commit comments

Comments
 (0)