Skip to content

Commit bc79129

Browse files
zhanglx13 and claude authored
[AMD] Generalize in-thread tree reduction to support ternary grouping for max/min (#9897)
Summary - Generalize treeReduceBinary into treeReduce parameterized by arity, enabling ternary (or higher) tree reductions when the target benefits from it - Add getReductionTreeArity(Operation*) to TargetInfoBase (default: 2) so targets can request wider grouping per combiner op - AMD override returns 3 for MaximumFOp/MinimumFOp/MaxNumFOp/MinNumFOp, generating max(max(a,b), c) groups that LLVM folds into v_maximum3_f32/v_minimum3_f32 Motivation The binary tree reduction creates an alternating pattern where every other level produces results that LLVM's DAG combiner cannot fold into ternary instructions. LLVM only matches max(max(a,b), c) → v_maximum3 when the inner max has a single use, but the balanced binary tree creates intermediate results consumed by the next level that alternate between foldable and unfoldable. With arity=3, every group maps directly to a ternary instruction, reducing max/min instruction count by ~23% (344 → 264 for a 256×256 f32 reduction). NVIDIA has no max3 equivalent, so the default arity=2 preserves existing behavior. Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent ddecfce commit bc79129

5 files changed

Lines changed: 99 additions & 11 deletions

File tree

include/triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,14 @@ class TargetInfoBase {
106106
virtual bool supportLdStMatrixB8() const { return false; }
107107
virtual bool supportBitwidth16Elementwise() const { return false; }
108108
virtual bool supportBitwidth32Elementwise() const { return false; }
109+
110+
// Returns the preferred arity of the in-thread reduction tree for the given
111+
// combiner operation. The default is 2 (binary tree). Targets that have
112+
// native ternary instructions (e.g. AMD v_maximum3/v_minimum3) can return 3
113+
// to generate a ternary reduction tree that maps directly to hardware.
114+
virtual unsigned getReductionTreeArity(Operation *combinerOp) const {
115+
return 2;
116+
}
109117
virtual bool isCuda() const { return false; }
110118

111119
// Returns the shared memory partition size in bytes. A value of 0 means

lib/Conversion/TritonGPUToLLVM/ReduceOpToLLVM.cpp

Lines changed: 24 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -112,18 +112,28 @@ struct ReduceOpConversion
112112
private:
113113
const TargetInfoBase &targetInfo;
114114

115-
SmallVector<Value>
116-
treeReduceBinary(Location loc, ConversionPatternRewriter &rewriter,
117-
Region &combineOp,
118-
SmallVector<SmallVector<Value>> values) const {
119-
// The number of elements is always a power of two
120-
assert(llvm::isPowerOf2_64(values.size()) && !values.empty());
115+
// Reduce values using a tree of the given arity. Arity=3 generates
116+
// combine(combine(a, b), c) groups that LLVM folds into ternary
117+
// instructions (e.g. v_maximum3_f32 on AMD).
118+
SmallVector<Value> treeReduce(Location loc,
119+
ConversionPatternRewriter &rewriter,
120+
Region &combineOp,
121+
SmallVector<SmallVector<Value>> values,
122+
unsigned arity) const {
123+
assert(!values.empty() && arity >= 2);
121124
while (values.size() > 1) {
122125
SmallVector<SmallVector<Value>> next;
123-
for (size_t i = 0; i + 1 < values.size(); i += 2) {
124-
SmallVector<Value> acc = values[i];
125-
accumulate(loc, rewriter, combineOp, acc, values[i + 1]);
126-
next.push_back(std::move(acc));
126+
for (size_t i = 0; i < values.size(); i += arity) {
127+
size_t remaining = values.size() - i;
128+
size_t groupSize = std::min(static_cast<size_t>(arity), remaining);
129+
if (groupSize == 1) {
130+
next.push_back(std::move(values[i]));
131+
} else {
132+
SmallVector<Value> acc = std::move(values[i]);
133+
for (size_t j = 1; j < groupSize; ++j)
134+
accumulate(loc, rewriter, combineOp, acc, values[i + j]);
135+
next.push_back(std::move(acc));
136+
}
127137
}
128138
values = std::move(next);
129139
}
@@ -271,6 +281,9 @@ struct ReduceOpConversion
271281
Region &combineRegion =
272282
vectorCombineRegion ? *vectorCombineRegion : op.getCombineOp();
273283

284+
Operation &combinerOp = combineRegion.front().front();
285+
unsigned arity = targetInfo.getReductionTreeArity(&combinerOp);
286+
274287
// Perform a tree reduction
275288
unsigned numOperands = accs.size();
276289
SmallVector<SmallVector<Value>> reduced(numOperands);
@@ -286,7 +299,7 @@ struct ReduceOpConversion
286299
vals.push_back(std::move(cur));
287300
}
288301
auto acc =
289-
treeReduceBinary(loc, rewriter, combineRegion, std::move(vals));
302+
treeReduce(loc, rewriter, combineRegion, std::move(vals), arity);
290303
for (unsigned opIdx = 0; opIdx < numOperands; ++opIdx) {
291304
reduced[opIdx].push_back(acc[opIdx]);
292305
}

test/Conversion/amd/reduce_tree_vectorize.mlir

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,4 +59,59 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.thr
5959
}) : (tensor<1x128xf32, #blocked_reduce>) -> tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked_reduce}>>
6060
tt.return
6161
}
62+
63+
// Ternary tree reduction for max/min: generates a chain of 3 dependent ops
64+
// per group so LLVM can fold into v_maximum3/v_minimum3/v_max3/v_min3.
65+
66+
// GFX1250-LABEL: reduce_maximum_f32_ternary
67+
// GFX1250: %[[A:.*]] = llvm.intr.maximum(%{{.*}}, %{{.*}}) : (f32, f32) -> f32
68+
// GFX1250-NEXT: %[[B:.*]] = llvm.intr.maximum(%[[A]], %{{.*}}) : (f32, f32) -> f32
69+
// GFX1250-NEXT: %[[C:.*]] = llvm.intr.maximum(%[[B]], %{{.*}}) : (f32, f32) -> f32
70+
tt.func public @reduce_maximum_f32_ternary(%arg0: tensor<1x128xf32, #blocked_reduce>) {
71+
%0 = "tt.reduce"(%arg0) <{axis = 1 : i32}> ({
72+
^bb0(%a: f32, %b: f32):
73+
%max = arith.maximumf %a, %b : f32
74+
tt.reduce.return %max : f32
75+
}) : (tensor<1x128xf32, #blocked_reduce>) -> tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked_reduce}>>
76+
tt.return
77+
}
78+
79+
// GFX1250-LABEL: reduce_minimum_f32_ternary
80+
// GFX1250: %[[A:.*]] = llvm.intr.minimum(%{{.*}}, %{{.*}}) : (f32, f32) -> f32
81+
// GFX1250-NEXT: %[[B:.*]] = llvm.intr.minimum(%[[A]], %{{.*}}) : (f32, f32) -> f32
82+
// GFX1250-NEXT: %[[C:.*]] = llvm.intr.minimum(%[[B]], %{{.*}}) : (f32, f32) -> f32
83+
tt.func public @reduce_minimum_f32_ternary(%arg0: tensor<1x128xf32, #blocked_reduce>) {
84+
%0 = "tt.reduce"(%arg0) <{axis = 1 : i32}> ({
85+
^bb0(%a: f32, %b: f32):
86+
%min = arith.minimumf %a, %b : f32
87+
tt.reduce.return %min : f32
88+
}) : (tensor<1x128xf32, #blocked_reduce>) -> tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked_reduce}>>
89+
tt.return
90+
}
91+
92+
// GFX1250-LABEL: reduce_maxnum_f32_ternary
93+
// GFX1250: %[[A:.*]] = llvm.intr.maxnum(%{{.*}}, %{{.*}}) : (f32, f32) -> f32
94+
// GFX1250-NEXT: %[[B:.*]] = llvm.intr.maxnum(%[[A]], %{{.*}}) : (f32, f32) -> f32
95+
// GFX1250-NEXT: %[[C:.*]] = llvm.intr.maxnum(%[[B]], %{{.*}}) : (f32, f32) -> f32
96+
tt.func public @reduce_maxnum_f32_ternary(%arg0: tensor<1x128xf32, #blocked_reduce>) {
97+
%0 = "tt.reduce"(%arg0) <{axis = 1 : i32}> ({
98+
^bb0(%a: f32, %b: f32):
99+
%max = arith.maxnumf %a, %b : f32
100+
tt.reduce.return %max : f32
101+
}) : (tensor<1x128xf32, #blocked_reduce>) -> tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked_reduce}>>
102+
tt.return
103+
}
104+
105+
// GFX1250-LABEL: reduce_minnum_f32_ternary
106+
// GFX1250: %[[A:.*]] = llvm.intr.minnum(%{{.*}}, %{{.*}}) : (f32, f32) -> f32
107+
// GFX1250-NEXT: %[[B:.*]] = llvm.intr.minnum(%[[A]], %{{.*}}) : (f32, f32) -> f32
108+
// GFX1250-NEXT: %[[C:.*]] = llvm.intr.minnum(%[[B]], %{{.*}}) : (f32, f32) -> f32
109+
tt.func public @reduce_minnum_f32_ternary(%arg0: tensor<1x128xf32, #blocked_reduce>) {
110+
%0 = "tt.reduce"(%arg0) <{axis = 1 : i32}> ({
111+
^bb0(%a: f32, %b: f32):
112+
%min = arith.minnumf %a, %b : f32
113+
tt.reduce.return %min : f32
114+
}) : (tensor<1x128xf32, #blocked_reduce>) -> tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked_reduce}>>
115+
tt.return
116+
}
62117
}

third_party/amd/lib/TritonAMDGPUToLLVM/TargetInfo.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -764,6 +764,16 @@ bool TargetInfo::supportBitwidth32Elementwise() const {
764764
}
765765
}
766766

767+
unsigned TargetInfo::getReductionTreeArity(Operation *combinerOp) const {
768+
// AMD has native ternary max/min instructions: v_max3/v_min3 on all GFX9+,
769+
// and v_maximum3/v_minimum3 additionally on GFX950 and GFX1250.
770+
// Use a ternary reduction tree so these map 1:1 to hardware.
771+
if (isa<arith::MaximumFOp, arith::MinimumFOp, arith::MaxNumFOp,
772+
arith::MinNumFOp>(combinerOp))
773+
return 3;
774+
return 2;
775+
}
776+
767777
bool TargetInfo::supportsDirectToLDSScattering() const {
768778
switch (getISAFamily()) {
769779
case ISAFamily::GFX1250:

third_party/amd/lib/TritonAMDGPUToLLVM/TargetInfo.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,8 @@ class TargetInfo : public mlir::triton::TargetInfoBase {
101101
bool supportBitwidth16Elementwise() const override;
102102
bool supportBitwidth32Elementwise() const override;
103103

104+
unsigned getReductionTreeArity(Operation *combinerOp) const override;
105+
104106
// Returns true if the target supports per lane addresses into LDS for
105107
// direct-to-lds loads. Some architectures (e.g. GFX9) do not support
106108
// scattering and instead have to write warp coalesced into LDS

0 commit comments

Comments (0)