
Commit 5b856e6

[AMD] Replace ReorderInstructions with MoveUpPrologueLoads (#9328)
After the recent changes, the ReorderInstructions pass had only one optimization left: moving prologue loads early for prefetching. Add a new pass for that optimization, refactor the implementation, and add more tests.
1 parent 72d0d90 commit 5b856e6

9 files changed

Lines changed: 262 additions & 321 deletions
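
The single optimization the new pass keeps is the one exercised by the new test file below: a tt.load tagged with the amd.pipeliner_part = "prologue" attribute is hoisted above unrelated ops such as ttg.local_alloc, together with the ops that compute its mask, so the global load is issued as early as possible. A condensed before/after sketch, distilled from the move_up_slice test case below (value names renamed for readability; this sketch is not itself part of the commit):

// Before the pass: the prologue load is issued after the shared-memory allocation.
%alloc = ttg.local_alloc : () -> !ttg.memdesc<1x32x128xf16, #shared, #smem, mutable>
%cond = arith.cmpi sgt, %arg1, %c0_i32 : i32
%mask = tt.splat %cond : i1 -> tensor<32x128xi1, #blocked>
%val = tt.load %arg0, %mask {amd.pipeliner_part = "prologue"} : tensor<32x128x!tt.ptr<f16>, #blocked>

// After the pass: the load and the ops computing its mask move above the allocation,
// so the global load starts earlier; everything the load depends on stays above it.
%cond = arith.cmpi sgt, %arg1, %c0_i32 : i32
%mask = tt.splat %cond : i1 -> tensor<32x128xi1, #blocked>
%val = tt.load %arg0, %mask {amd.pipeliner_part = "prologue"} : tensor<32x128x!tt.ptr<f16>, #blocked>
%alloc = ttg.local_alloc : () -> !ttg.memdesc<1x32x128xf16, #shared, #smem, mutable>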

bin/RegisterTritonDialects.h

Lines changed: 1 addition & 1 deletion
@@ -114,7 +114,7 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
   mlir::registerTritonAMDGPUHoistLayoutConversions();
   mlir::registerTritonAMDGPUSinkLayoutConversions();
   mlir::registerTritonAMDGPUPrepareIfCombining();
-  mlir::registerTritonAMDGPUReorderInstructions();
+  mlir::registerTritonAMDGPUMoveUpPrologueLoads();
   mlir::registerTritonAMDGPUBlockPingpong();
   mlir::registerTritonAMDGPUPipeline();
   mlir::registerTritonAMDGPUScheduleLoops();
Lines changed: 140 additions & 0 deletions
@@ -0,0 +1,140 @@
// RUN: triton-opt %s -split-input-file -tritonamdgpu-move-up-prologue-loads | FileCheck %s

// CHECK-LABEL: move_up_slice
// CHECK: arith.cmpi
// CHECK: tt.splat
// CHECK: tt.load
// CHECK: ttg.local_alloc
#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 16], warpsPerCTA = [4, 1], order = [1, 0]}>
#shared = #ttg.swizzled_shared<{vec = 8, perPhase = 1, maxPhase = 16, order = [1, 0]}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
  tt.func @move_up_slice(%arg0: tensor<32x128x!tt.ptr<f16>, #blocked>, %arg1: i32) {
    %c0_i32 = arith.constant 0 : i32
    %0 = ttg.local_alloc : () -> !ttg.memdesc<1x32x128xf16, #shared, #smem, mutable>
    %1 = arith.cmpi sgt, %arg1, %c0_i32 : i32
    %2 = tt.splat %1 : i1 -> tensor<32x128xi1, #blocked>
    %3 = tt.load %arg0, %2 {amd.pipeliner_part = "prologue"} : tensor<32x128x!tt.ptr<f16>, #blocked>
    tt.return
  }
}

// -----

// CHECK-LABEL: keep_load_order
// CHECK: arith.cmpi sgt
// CHECK: tt.splat
// CHECK: tt.load %arg0
// CHECK: tt.addptr
// CHECK: arith.cmpi slt
// CHECK: tt.splat
// CHECK: tt.load
// CHECK: ttg.local_alloc
#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 16], warpsPerCTA = [4, 1], order = [1, 0]}>
#shared = #ttg.swizzled_shared<{vec = 8, perPhase = 1, maxPhase = 16, order = [1, 0]}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
  tt.func @keep_load_order(%arg0: tensor<32x128x!tt.ptr<f16>, #blocked>, %arg1: i32, %arg2: i32) {
    %c0_i32 = arith.constant 0 : i32
    %cst = arith.constant dense<128> : tensor<32x128xi32, #blocked>
    %0 = tt.addptr %arg0, %cst : tensor<32x128x!tt.ptr<f16>, #blocked>, tensor<32x128xi32, #blocked>
    %1 = ttg.local_alloc : () -> !ttg.memdesc<1x32x128xf16, #shared, #smem, mutable>
    %2 = arith.cmpi sgt, %arg1, %c0_i32 : i32
    %3 = tt.splat %2 : i1 -> tensor<32x128xi1, #blocked>
    %4 = tt.load %arg0, %3 {amd.pipeliner_part = "prologue"} : tensor<32x128x!tt.ptr<f16>, #blocked>
    %5 = arith.cmpi slt, %arg2, %c0_i32 : i32
    %6 = tt.splat %5 : i1 -> tensor<32x128xi1, #blocked>
    %7 = tt.load %0, %6 {amd.pipeliner_part = "prologue"} : tensor<32x128x!tt.ptr<f16>, #blocked>
    tt.return
  }
}

// -----

// CHECK-LABEL: break_at_atomic
// CHECK: tt.atomic_rmw
// CHECK: arith.cmpi
// CHECK: tt.splat
// CHECK: tt.load
// CHECK: ttg.local_alloc
#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 16], warpsPerCTA = [4, 1], order = [1, 0]}>
#shared = #ttg.swizzled_shared<{vec = 8, perPhase = 1, maxPhase = 16, order = [1, 0]}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
  tt.func @break_at_atomic(%arg0: tensor<32x128x!tt.ptr<f16>, #blocked>, %arg1: i32, %arg2: !tt.ptr<i32>) {
    %c0_i32 = arith.constant 0 : i32
    %c1_i32 = arith.constant 1 : i32
    %0 = tt.atomic_rmw fadd, relaxed, gpu, %arg2, %c1_i32 : (!tt.ptr<i32>, i32) -> i32
    %1 = ttg.local_alloc : () -> !ttg.memdesc<1x32x128xf16, #shared, #smem, mutable>
    %2 = arith.cmpi sgt, %arg1, %c0_i32 : i32
    %3 = tt.splat %2 : i1 -> tensor<32x128xi1, #blocked>
    %4 = tt.load %arg0, %3 {amd.pipeliner_part = "prologue"} : tensor<32x128x!tt.ptr<f16>, #blocked>
    tt.return
  }
}

// -----

// CHECK-LABEL: break_at_barrier
// CHECK: gpu.barrier
// CHECK: arith.cmpi
// CHECK: tt.splat
// CHECK: tt.load
// CHECK: ttg.local_alloc
#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 16], warpsPerCTA = [4, 1], order = [1, 0]}>
#shared = #ttg.swizzled_shared<{vec = 8, perPhase = 1, maxPhase = 16, order = [1, 0]}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
  tt.func @break_at_barrier(%arg0: tensor<32x128x!tt.ptr<f16>, #blocked>, %arg1: i32) {
    %c0_i32 = arith.constant 0 : i32
    gpu.barrier
    %0 = ttg.local_alloc : () -> !ttg.memdesc<1x32x128xf16, #shared, #smem, mutable>
    %1 = arith.cmpi sgt, %arg1, %c0_i32 : i32
    %2 = tt.splat %1 : i1 -> tensor<32x128xi1, #blocked>
    %3 = tt.load %arg0, %2 {amd.pipeliner_part = "prologue"} : tensor<32x128x!tt.ptr<f16>, #blocked>
    tt.return
  }
}

// -----

// CHECK-LABEL: break_at_loop
// CHECK: scf.for
// CHECK: tt.load %arg0
// CHECK: ttg.local_alloc
#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 16], warpsPerCTA = [4, 1], order = [1, 0]}>
#shared = #ttg.swizzled_shared<{vec = 8, perPhase = 1, maxPhase = 16, order = [1, 0]}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
  tt.func @break_at_loop(%arg0: tensor<32x128x!tt.ptr<f16>, #blocked>, %arg1: i32) {
    %c0_i32 = arith.constant 0 : i32
    %c1_i32 = arith.constant 1 : i32
    scf.for %arg2 = %c0_i32 to %arg1 step %c1_i32 : i32 {
    }
    %0 = ttg.local_alloc : () -> !ttg.memdesc<1x32x128xf16, #shared, #smem, mutable>
    %1 = tt.load %arg0 {amd.pipeliner_part = "prologue"} : tensor<32x128x!tt.ptr<f16>, #blocked>
    tt.return
  }
}

// -----

// Negative test: load without amd.pipeliner_part attribute should not be moved
// CHECK-LABEL: no_prologue_attribute
// CHECK: ttg.local_alloc
// CHECK: arith.cmpi
// CHECK: tt.splat
// CHECK: tt.load
#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 16], warpsPerCTA = [4, 1], order = [1, 0]}>
#shared = #ttg.swizzled_shared<{vec = 8, perPhase = 1, maxPhase = 16, order = [1, 0]}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
  tt.func @no_prologue_attribute(%arg0: tensor<32x128x!tt.ptr<f16>, #blocked>, %arg1: i32) {
    %c0_i32 = arith.constant 0 : i32
    %0 = ttg.local_alloc : () -> !ttg.memdesc<1x32x128xf16, #shared, #smem, mutable>
    %1 = arith.cmpi sgt, %arg1, %c0_i32 : i32
    %2 = tt.splat %1 : i1 -> tensor<32x128xi1, #blocked>
    %3 = tt.load %arg0, %2 : tensor<32x128x!tt.ptr<f16>, #blocked>
    tt.return
  }
}

test/TritonGPU/amd/amd-reorder-instructions.mlir

Lines changed: 0 additions & 176 deletions
This file was deleted.

third_party/amd/backend/compiler.py

Lines changed: 1 addition & 1 deletion
@@ -251,7 +251,7 @@ def make_ttgir(mod, metadata, options):
         if is_in_thread_transpose_enabled(options.arch):
             amd.passes.ttgpuir.add_in_thread_transpose(pm)
         passes.ttgpuir.add_remove_layout_conversions(pm)
-        amd.passes.ttgpuir.add_reorder_instructions(pm)
+        amd.passes.ttgpuir.add_move_up_prologue_loads(pm)
         if use_block_pingpong and options.num_stages > 1:
             amd.passes.ttgpuir.add_block_pingpong(pm, options.num_stages)

third_party/amd/include/TritonAMDGPUTransforms/Passes.td

Lines changed: 8 additions & 6 deletions
@@ -154,12 +154,14 @@ def TritonAMDGPUCanonicalizePointers : Pass<"tritonamdgpu-canonicalize-pointers"
   ];
 }
 
-def TritonAMDGPUReorderInstructions: Pass<"tritonamdgpu-reorder-instructions", "mlir::ModuleOp"> {
-  let summary = "Reorder instructions";
-
-  let description = "This pass reorder instructions so as to (1) decrease register pressure (e.g., by moving "
-                    "conversions from shared memory before their first use) and (2) promote LLVM instruction "
-                    "order more friendly to `ptxas`.";
+def TritonAMDGPUMoveUpPrologueLoads
+    : Pass<"tritonamdgpu-move-up-prologue-loads", "mlir::triton::FuncOp"> {
+  let summary = "Move up global loads in prologue for better GEMM performance";
+
+  let description =
+      "This pass moves global load ops early to prefetch in the prologue. "
+      "This may increase register pressure but it enables issuing global loads "
+      "early.";
 
   let dependentDialects = [];
 }
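
As the break_at_atomic, break_at_barrier, and break_at_loop tests above show, the hoisting is conservative: a prologue load is moved above ops it does not depend on (such as ttg.local_alloc), but never above side-effecting ops or control flow. A condensed sketch of that boundary behavior, distilled from the break_at_barrier test above (value names renamed for readability; not itself part of this diff):

// Before: barrier, shared-memory allocation, then the prologue load.
gpu.barrier
%alloc = ttg.local_alloc : () -> !ttg.memdesc<1x32x128xf16, #shared, #smem, mutable>
%cond = arith.cmpi sgt, %arg1, %c0_i32 : i32
%mask = tt.splat %cond : i1 -> tensor<32x128xi1, #blocked>
%val = tt.load %arg0, %mask {amd.pipeliner_part = "prologue"} : tensor<32x128x!tt.ptr<f16>, #blocked>

// After: the load rises above the allocation but stays below the barrier.
gpu.barrier
%cond = arith.cmpi sgt, %arg1, %c0_i32 : i32
%mask = tt.splat %cond : i1 -> tensor<32x128xi1, #blocked>
%val = tt.load %arg0, %mask {amd.pipeliner_part = "prologue"} : tensor<32x128x!tt.ptr<f16>, #blocked>
%alloc = ttg.local_alloc : () -> !ttg.memdesc<1x32x128xf16, #shared, #smem, mutable>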

third_party/amd/lib/TritonAMDGPUTransforms/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@ add_triton_library(TritonAMDGPUTransforms
   HoistLayoutConversions.cpp
   SinkLayoutConversions.cpp
   PrepareIfCombining.cpp
-  ReorderInstructions.cpp
+  MoveUpPrologueLoads.cpp
   Pipeline.cpp
   ScheduleLoops.cpp
   LowerLoops.cpp
