// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=arch=gfx942 -cse | FileCheck %s --check-prefix=GFX942
// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=arch=gfx950 -cse | FileCheck %s --check-prefix=GFX950
// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=arch=gfx1250 -cse | FileCheck %s --check-prefix=GFX1250
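// Check that the pairwise adds produced when lowering tt.reduce are packed
// into two-element vectors where expected: f16 packs on gfx942/gfx950 while
// f32 does not, and on gfx1250 both f16 and f32 pack.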
#blocked_reduce = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 64], warpsPerCTA = [1, 1], order = [1, 0]}>
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 64 : i32} {
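  // f16: the reduction is expected to emit a packed vector<2xf16> llvm.fadd
  // on both gfx942 and gfx950.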
  // GFX942-LABEL: reduce_f16
  // GFX942: llvm.fadd {{.*}} : vector<2xf16>
  // GFX950-LABEL: reduce_f16
  // GFX950: llvm.fadd {{.*}} : vector<2xf16>
  tt.func public @reduce_f16(%arg0: tensor<1x256xf16, #blocked_reduce>) {
    %0 = "tt.reduce"(%arg0) <{axis = 1 : i32}> ({
    ^bb0(%a: f16, %b: f16):
      %sum = arith.addf %a, %b : f16
      tt.reduce.return %sum : f16
    }) : (tensor<1x256xf16, #blocked_reduce>) -> tensor<1xf16, #ttg.slice<{dim = 1, parent = #blocked_reduce}>>
    tt.return
  }

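  // f32: no packed vector<2xf32> llvm.fadd is expected on gfx942/gfx950;
  // the lowering should still reach llvm.return.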
  // GFX942-LABEL: reduce_f32
  // GFX942-NOT: llvm.fadd {{.*}} : vector<2xf32>
  // GFX942: llvm.return
  // GFX950-LABEL: reduce_f32
  // GFX950-NOT: llvm.fadd {{.*}} : vector<2xf32>
  // GFX950: llvm.return
  tt.func public @reduce_f32(%arg0: tensor<1x256xf32, #blocked_reduce>) {
    %0 = "tt.reduce"(%arg0) <{axis = 1 : i32}> ({
    ^bb0(%a: f32, %b: f32):
      %sum = arith.addf %a, %b : f32
      tt.reduce.return %sum : f32
    }) : (tensor<1x256xf32, #blocked_reduce>) -> tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked_reduce}>>
    tt.return
  }
}

// -----

#blocked_reduce = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [1, 1], order = [1, 0]}>
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 32 : i32} {
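  // f16, wave32: the tree-vectorized reduction on gfx1250 is expected to
  // emit a packed vector<2xf16> llvm.fadd.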
  // GFX1250-LABEL: reduce_f16_tree_vectorize
  // GFX1250: llvm.fadd {{.*}} : vector<2xf16>
  tt.func public @reduce_f16_tree_vectorize(%arg0: tensor<1x128xf16, #blocked_reduce>) {
    %0 = "tt.reduce"(%arg0) <{axis = 1 : i32}> ({
    ^bb0(%a: f16, %b: f16):
      %sum = arith.addf %a, %b : f16
      tt.reduce.return %sum : f16
    }) : (tensor<1x128xf16, #blocked_reduce>) -> tensor<1xf16, #ttg.slice<{dim = 1, parent = #blocked_reduce}>>
    tt.return
  }

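  // f32: unlike gfx942/gfx950, gfx1250 is expected to pack f32 adds into
  // vector<2xf32> as well.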
  // GFX1250-LABEL: reduce_f32_tree_vectorize
  // GFX1250: llvm.fadd {{.*}} : vector<2xf32>
  tt.func public @reduce_f32_tree_vectorize(%arg0: tensor<1x128xf32, #blocked_reduce>) {
    %0 = "tt.reduce"(%arg0) <{axis = 1 : i32}> ({
    ^bb0(%a: f32, %b: f32):
      %sum = arith.addf %a, %b : f32
      tt.reduce.return %sum : f32
    }) : (tensor<1x128xf32, #blocked_reduce>) -> tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked_reduce}>>
    tt.return
  }
}