[AMDGPU] Enable vectorization of i8 values. #134934

doru1004 · 2025-04-08T21:45:14Z

This patch adjusts the cost model to account for the ability of the AMDGPU optimizer to group together i8 values into i32 values.

llvmbot · 2025-04-08T21:45:49Z

@llvm/pr-subscribers-clang
@llvm/pr-subscribers-vectorizers

@llvm/pr-subscribers-llvm-analysis

Author: Gheorghe-Teodor Bercea (doru1004)

Changes

This patch enabled the vectorization of i8 values for AMD GPUs.

Patch is 164.27 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/134934.diff

11 Files Affected:

(modified) llvm/include/llvm/Analysis/TargetTransformInfo.h (+7)
(modified) llvm/include/llvm/Analysis/TargetTransformInfoImpl.h (+2)
(modified) llvm/lib/Analysis/TargetTransformInfo.cpp (+4)
(modified) llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp (+22-3)
(modified) llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h (+9)
(modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+15-4)
(modified) llvm/test/Analysis/CostModel/AMDGPU/arith-uminmax.ll (+28-28)
(modified) llvm/test/Analysis/CostModel/AMDGPU/div.ll (+42-42)
(modified) llvm/test/Analysis/CostModel/AMDGPU/rem.ll (+42-42)
(modified) llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll (+216-216)
(added) llvm/test/CodeGen/AMDGPU/vectorize-i8-as-i32.ll (+88)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 2efca0d1d754f..17aa46774194a 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1907,6 +1907,10 @@ class TargetTransformInfo {
   /// pad to. Default is no padding.
   unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const;
 
+  /// \return Returns true if vectorizing 4 x i8s into an i32 is possible.
+  /// Currently only used by the SLP vectorizer.
+  bool canVectorizei8s() const;
+
   /// @}
 
   /// Collect kernel launch bounds for \p F into \p LB.
@@ -2363,6 +2367,7 @@ class TargetTransformInfo::Concept {
   virtual void collectKernelLaunchBounds(
       const Function &F,
       SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const = 0;
+  virtual bool canVectorizei8s() const = 0;
 };
 
 template <typename T>
@@ -3229,6 +3234,8 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
       SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const override {
     Impl.collectKernelLaunchBounds(F, LB);
   }
+
+  bool canVectorizei8s() const override { return Impl.canVectorizei8s(); }
 };
 
 template <typename T>
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 3fe0a9101fdee..9ab97328370bd 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1063,6 +1063,8 @@ class TargetTransformInfoImplBase {
 
   unsigned getMaxNumArgs() const { return UINT_MAX; }
 
+  bool canVectorizei8s() const { return false; }
+
   unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const {
     return 0;
   }
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 4fea4e5711f5a..ee0c3a9329a1f 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1419,6 +1419,10 @@ unsigned TargetTransformInfo::getMaxNumArgs() const {
   return TTIImpl->getMaxNumArgs();
 }
 
+bool TargetTransformInfo::canVectorizei8s() const {
+  return TTIImpl->canVectorizei8s();
+}
+
 bool TargetTransformInfo::shouldExpandReduction(const IntrinsicInst *II) const {
   return TTIImpl->shouldExpandReduction(II);
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 09f7877b13b3a..9c29402755e53 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -344,9 +344,10 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
 unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
   if (Opcode == Instruction::Load || Opcode == Instruction::Store)
     return 32 * 4 / ElemWidth;
-  return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
-       : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
-       : 1;
+  return ElemWidth == 8                                ? 4
+         : (ElemWidth == 16 && ST->has16BitInsts())    ? 2
+         : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
+                                                       : 1;
 }
 
 unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
@@ -1120,6 +1121,17 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
   }
 }
 
+InstructionCost GCNTTIImpl::getScalarizationOverhead(
+    VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract,
+    TTI::TargetCostKind CostKind, ArrayRef<Value *> VL) {
+  unsigned NumVectorElts = cast<FixedVectorType>(InTy)->getNumElements();
+  if (NumVectorElts > 1 &&
+      InTy->getElementType() == IntegerType::getInt8Ty(InTy->getContext()))
+    return 0;
+  return BaseT::getScalarizationOverhead(InTy, DemandedElts, Insert, Extract,
+                                         CostKind, VL);
+}
+
 InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                            VectorType *VT, ArrayRef<int> Mask,
                                            TTI::TargetCostKind CostKind,
@@ -1134,6 +1146,11 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
   // Larger vector widths may require additional instructions, but are
   // typically cheaper than scalarized versions.
   unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
+
+  if (NumVectorElts > 1 &&
+      VT->getElementType() == IntegerType::getInt8Ty(VT->getContext()))
+    return 0;
+
   if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
       DL.getTypeSizeInBits(VT->getElementType()) == 16) {
     bool HasVOP3P = ST->hasVOP3PInsts();
@@ -1423,3 +1440,5 @@ void GCNTTIImpl::collectKernelLaunchBounds(
   LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first});
   LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second});
 }
+
+bool GCNTTIImpl::canVectorizei8s() const { return true; }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index f5062070ac6f4..239becb9aab15 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -240,6 +240,12 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
 
   InstructionCost getVectorSplitCost() { return 0; }
 
+  InstructionCost getScalarizationOverhead(VectorType *InTy,
+                                           const APInt &DemandedElts,
+                                           bool Insert, bool Extract,
+                                           TTI::TargetCostKind CostKind,
+                                           ArrayRef<Value *> VL = {});
+
   InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
                                  ArrayRef<int> Mask,
                                  TTI::TargetCostKind CostKind, int Index,
@@ -282,6 +288,9 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
   void collectKernelLaunchBounds(
       const Function &F,
       SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const;
+
+  /// \return return true if we can pack 4 i8s into an i32.
+  bool canVectorizei8s() const;
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 8d411f2cb203a..efa1e77e4a2ce 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -12971,9 +12971,16 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
               LI0->getPointerAddressSpace(), CostKind);
 
         } else {
-          VecLdCost = TTI->getMemoryOpCost(
-              Instruction::Load, VecTy, LI0->getAlign(),
-              LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
+          if (VecTy->getElementType() ==
+                  IntegerType::getInt8Ty(VecTy->getContext()) &&
+              TTI->canVectorizei8s()) {
+            VecLdCost = 1;
+          } else {
+            VecLdCost =
+                TTI->getMemoryOpCost(Instruction::Load, VecTy, LI0->getAlign(),
+                                     LI0->getPointerAddressSpace(), CostKind,
+                                     TTI::OperandValueInfo());
+          }
         }
         break;
       case TreeEntry::StridedVectorize: {
@@ -20927,7 +20934,11 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
     // provided vectorization factor (i.e. the scalar type is used for vector
     // code during codegen).
     auto *VecTy = getWidenedType(ScalarTy, VF);
-    if (TTI->getNumberOfParts(VecTy) == VF)
+    unsigned NumParts = TTI->getNumberOfParts(VecTy);
+    if (TTI->canVectorizei8s() &&
+        VecTy->getElementType() == IntegerType::getInt8Ty(VecTy->getContext()))
+      NumParts = 1;
+    if (NumParts == VF)
       continue;
     for (unsigned I = NextInst; I < MaxInst; ++I) {
       unsigned ActualVF = std::min(MaxInst - I, VF);
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/arith-uminmax.ll b/llvm/test/Analysis/CostModel/AMDGPU/arith-uminmax.ll
index 2b3728b556d9e..2984c616e8f21 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/arith-uminmax.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/arith-uminmax.ll
@@ -47,13 +47,13 @@ define i32 @umax(i32 %arg) {
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.umax.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.umax.v16i32(<16 x i32> undef, <16 x i32> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = call <2 x i16> @llvm.umax.v2i16(<2 x i16> undef, <2 x i16> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.umax.v4i16(<4 x i16> undef, <4 x i16> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I16 = call <2 x i16> @llvm.umax.v2i16(<2 x i16> undef, <2 x i16> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I16 = call <4 x i16> @llvm.umax.v4i16(<4 x i16> undef, <4 x i16> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 126 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.umax.v2i8(<2 x i8> undef, <2 x i8> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.umax.v4i8(<4 x i8> undef, <4 x i8> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.umax.v8i8(<8 x i8> undef, <8 x i8> undef)
@@ -72,13 +72,13 @@ define i32 @umax(i32 %arg) {
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.umax.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.umax.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.umax.v2i16(<2 x i16> undef, <2 x i16> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.umax.v4i16(<4 x i16> undef, <4 x i16> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I16 = call <2 x i16> @llvm.umax.v2i16(<2 x i16> undef, <2 x i16> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.umax.v4i16(<4 x i16> undef, <4 x i16> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.umax.v2i8(<2 x i8> undef, <2 x i8> undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.umax.v4i8(<4 x i8> undef, <4 x i8> undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.umax.v8i8(<8 x i8> undef, <8 x i8> undef)
@@ -153,13 +153,13 @@ define i32 @umin(i32 %arg) {
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.umin.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.umin.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.umin.v16i32(<16 x i32> undef, <16 x i32> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = call <2 x i16> @llvm.umin.v2i16(<2 x i16> undef, <2 x i16> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.umin.v4i16(<4 x i16> undef, <4 x i16> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I16 = call <2 x i16> @llvm.umin.v2i16(<2 x i16> undef, <2 x i16> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I16 = call <4 x i16> @llvm.umin.v4i16(<4 x i16> undef, <4 x i16> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 126 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.umin.v2i8(<2 x i8> undef, <2 x i8> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.umin.v4i8(<4 x i8> undef, <4 x i8> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.umin.v8i8(<8 x i8> undef, <8 x i8> undef)
@@ -178,13 +178,13 @@ define i32 @umin(i32 %arg) {
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.umin.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.umin.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.umin.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.umin.v2i16(<2 x i16> undef, <2 x i16> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.umin.v4i16(<4 x i16> undef, <4 x i16> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I16 = call <2 x i16> @llvm.umin.v2i16(<2 x i16> undef, <2 x i16> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.umin.v4i16(<4 x i16> undef, <4 x i16> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.umin.v2i8(<2 x i8> undef, <2 x i8> undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.umin.v4i8(<4 x i8> undef, <4 x i8> undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.umin.v8i8(<8 x i8> undef, <8 x i8> undef)
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/div.ll b/llvm/test/Analysis/CostModel/AMDGPU/div.ll
index 459c41a0b3eb5..1d21b66166f62 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/div.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/div.ll
@@ -44,9 +44,9 @@ define i32 @sdiv() {
 ; SLOW-NEXT:  Co...
[truncated]

llvmbot · 2025-04-08T21:45:49Z

@llvm/pr-subscribers-llvm-transforms

Author: Gheorghe-Teodor Bercea (doru1004)

Changes

This patch enabled the vectorization of i8 values for AMD GPUs.

Patch is 164.27 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/134934.diff

11 Files Affected:

(modified) llvm/include/llvm/Analysis/TargetTransformInfo.h (+7)
(modified) llvm/include/llvm/Analysis/TargetTransformInfoImpl.h (+2)
(modified) llvm/lib/Analysis/TargetTransformInfo.cpp (+4)
(modified) llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp (+22-3)
(modified) llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h (+9)
(modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+15-4)
(modified) llvm/test/Analysis/CostModel/AMDGPU/arith-uminmax.ll (+28-28)
(modified) llvm/test/Analysis/CostModel/AMDGPU/div.ll (+42-42)
(modified) llvm/test/Analysis/CostModel/AMDGPU/rem.ll (+42-42)
(modified) llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll (+216-216)
(added) llvm/test/CodeGen/AMDGPU/vectorize-i8-as-i32.ll (+88)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 2efca0d1d754f..17aa46774194a 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1907,6 +1907,10 @@ class TargetTransformInfo {
   /// pad to. Default is no padding.
   unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const;
 
+  /// \return Returns true if vectorizing 4 x i8s into an i32 is possible.
+  /// Currently only used by the SLP vectorizer.
+  bool canVectorizei8s() const;
+
   /// @}
 
   /// Collect kernel launch bounds for \p F into \p LB.
@@ -2363,6 +2367,7 @@ class TargetTransformInfo::Concept {
   virtual void collectKernelLaunchBounds(
       const Function &F,
       SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const = 0;
+  virtual bool canVectorizei8s() const = 0;
 };
 
 template <typename T>
@@ -3229,6 +3234,8 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
       SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const override {
     Impl.collectKernelLaunchBounds(F, LB);
   }
+
+  bool canVectorizei8s() const override { return Impl.canVectorizei8s(); }
 };
 
 template <typename T>
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 3fe0a9101fdee..9ab97328370bd 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1063,6 +1063,8 @@ class TargetTransformInfoImplBase {
 
   unsigned getMaxNumArgs() const { return UINT_MAX; }
 
+  bool canVectorizei8s() const { return false; }
+
   unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const {
     return 0;
   }
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 4fea4e5711f5a..ee0c3a9329a1f 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1419,6 +1419,10 @@ unsigned TargetTransformInfo::getMaxNumArgs() const {
   return TTIImpl->getMaxNumArgs();
 }
 
+bool TargetTransformInfo::canVectorizei8s() const {
+  return TTIImpl->canVectorizei8s();
+}
+
 bool TargetTransformInfo::shouldExpandReduction(const IntrinsicInst *II) const {
   return TTIImpl->shouldExpandReduction(II);
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 09f7877b13b3a..9c29402755e53 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -344,9 +344,10 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
 unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
   if (Opcode == Instruction::Load || Opcode == Instruction::Store)
     return 32 * 4 / ElemWidth;
-  return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
-       : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
-       : 1;
+  return ElemWidth == 8                                ? 4
+         : (ElemWidth == 16 && ST->has16BitInsts())    ? 2
+         : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
+                                                       : 1;
 }
 
 unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
@@ -1120,6 +1121,17 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
   }
 }
 
+InstructionCost GCNTTIImpl::getScalarizationOverhead(
+    VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract,
+    TTI::TargetCostKind CostKind, ArrayRef<Value *> VL) {
+  unsigned NumVectorElts = cast<FixedVectorType>(InTy)->getNumElements();
+  if (NumVectorElts > 1 &&
+      InTy->getElementType() == IntegerType::getInt8Ty(InTy->getContext()))
+    return 0;
+  return BaseT::getScalarizationOverhead(InTy, DemandedElts, Insert, Extract,
+                                         CostKind, VL);
+}
+
 InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                            VectorType *VT, ArrayRef<int> Mask,
                                            TTI::TargetCostKind CostKind,
@@ -1134,6 +1146,11 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
   // Larger vector widths may require additional instructions, but are
   // typically cheaper than scalarized versions.
   unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
+
+  if (NumVectorElts > 1 &&
+      VT->getElementType() == IntegerType::getInt8Ty(VT->getContext()))
+    return 0;
+
   if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
       DL.getTypeSizeInBits(VT->getElementType()) == 16) {
     bool HasVOP3P = ST->hasVOP3PInsts();
@@ -1423,3 +1440,5 @@ void GCNTTIImpl::collectKernelLaunchBounds(
   LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first});
   LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second});
 }
+
+bool GCNTTIImpl::canVectorizei8s() const { return true; }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index f5062070ac6f4..239becb9aab15 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -240,6 +240,12 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
 
   InstructionCost getVectorSplitCost() { return 0; }
 
+  InstructionCost getScalarizationOverhead(VectorType *InTy,
+                                           const APInt &DemandedElts,
+                                           bool Insert, bool Extract,
+                                           TTI::TargetCostKind CostKind,
+                                           ArrayRef<Value *> VL = {});
+
   InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
                                  ArrayRef<int> Mask,
                                  TTI::TargetCostKind CostKind, int Index,
@@ -282,6 +288,9 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
   void collectKernelLaunchBounds(
       const Function &F,
       SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const;
+
+  /// \return return true if we can pack 4 i8s into an i32.
+  bool canVectorizei8s() const;
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 8d411f2cb203a..efa1e77e4a2ce 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -12971,9 +12971,16 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
               LI0->getPointerAddressSpace(), CostKind);
 
         } else {
-          VecLdCost = TTI->getMemoryOpCost(
-              Instruction::Load, VecTy, LI0->getAlign(),
-              LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
+          if (VecTy->getElementType() ==
+                  IntegerType::getInt8Ty(VecTy->getContext()) &&
+              TTI->canVectorizei8s()) {
+            VecLdCost = 1;
+          } else {
+            VecLdCost =
+                TTI->getMemoryOpCost(Instruction::Load, VecTy, LI0->getAlign(),
+                                     LI0->getPointerAddressSpace(), CostKind,
+                                     TTI::OperandValueInfo());
+          }
         }
         break;
       case TreeEntry::StridedVectorize: {
@@ -20927,7 +20934,11 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
     // provided vectorization factor (i.e. the scalar type is used for vector
     // code during codegen).
     auto *VecTy = getWidenedType(ScalarTy, VF);
-    if (TTI->getNumberOfParts(VecTy) == VF)
+    unsigned NumParts = TTI->getNumberOfParts(VecTy);
+    if (TTI->canVectorizei8s() &&
+        VecTy->getElementType() == IntegerType::getInt8Ty(VecTy->getContext()))
+      NumParts = 1;
+    if (NumParts == VF)
       continue;
     for (unsigned I = NextInst; I < MaxInst; ++I) {
       unsigned ActualVF = std::min(MaxInst - I, VF);
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/arith-uminmax.ll b/llvm/test/Analysis/CostModel/AMDGPU/arith-uminmax.ll
index 2b3728b556d9e..2984c616e8f21 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/arith-uminmax.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/arith-uminmax.ll
@@ -47,13 +47,13 @@ define i32 @umax(i32 %arg) {
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.umax.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.umax.v16i32(<16 x i32> undef, <16 x i32> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = call <2 x i16> @llvm.umax.v2i16(<2 x i16> undef, <2 x i16> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.umax.v4i16(<4 x i16> undef, <4 x i16> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I16 = call <2 x i16> @llvm.umax.v2i16(<2 x i16> undef, <2 x i16> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I16 = call <4 x i16> @llvm.umax.v4i16(<4 x i16> undef, <4 x i16> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 126 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.umax.v2i8(<2 x i8> undef, <2 x i8> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.umax.v4i8(<4 x i8> undef, <4 x i8> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.umax.v8i8(<8 x i8> undef, <8 x i8> undef)
@@ -72,13 +72,13 @@ define i32 @umax(i32 %arg) {
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.umax.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.umax.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.umax.v2i16(<2 x i16> undef, <2 x i16> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.umax.v4i16(<4 x i16> undef, <4 x i16> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I16 = call <2 x i16> @llvm.umax.v2i16(<2 x i16> undef, <2 x i16> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.umax.v4i16(<4 x i16> undef, <4 x i16> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.umax.v2i8(<2 x i8> undef, <2 x i8> undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.umax.v4i8(<4 x i8> undef, <4 x i8> undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.umax.v8i8(<8 x i8> undef, <8 x i8> undef)
@@ -153,13 +153,13 @@ define i32 @umin(i32 %arg) {
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.umin.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.umin.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.umin.v16i32(<16 x i32> undef, <16 x i32> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = call <2 x i16> @llvm.umin.v2i16(<2 x i16> undef, <2 x i16> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.umin.v4i16(<4 x i16> undef, <4 x i16> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I16 = call <2 x i16> @llvm.umin.v2i16(<2 x i16> undef, <2 x i16> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I16 = call <4 x i16> @llvm.umin.v4i16(<4 x i16> undef, <4 x i16> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 126 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.umin.v2i8(<2 x i8> undef, <2 x i8> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.umin.v4i8(<4 x i8> undef, <4 x i8> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.umin.v8i8(<8 x i8> undef, <8 x i8> undef)
@@ -178,13 +178,13 @@ define i32 @umin(i32 %arg) {
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.umin.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.umin.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.umin.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.umin.v2i16(<2 x i16> undef, <2 x i16> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.umin.v4i16(<4 x i16> undef, <4 x i16> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I16 = call <2 x i16> @llvm.umin.v2i16(<2 x i16> undef, <2 x i16> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.umin.v4i16(<4 x i16> undef, <4 x i16> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.umin.v2i8(<2 x i8> undef, <2 x i8> undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.umin.v4i8(<4 x i8> undef, <4 x i8> undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.umin.v8i8(<8 x i8> undef, <8 x i8> undef)
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/div.ll b/llvm/test/Analysis/CostModel/AMDGPU/div.ll
index 459c41a0b3eb5..1d21b66166f62 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/div.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/div.ll
@@ -44,9 +44,9 @@ define i32 @sdiv() {
 ; SLOW-NEXT:  Co...
[truncated]

llvmbot · 2025-04-08T21:45:50Z

@llvm/pr-subscribers-backend-amdgpu

Author: Gheorghe-Teodor Bercea (doru1004)

Changes

This patch enabled the vectorization of i8 values for AMD GPUs.

Patch is 164.27 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/134934.diff

11 Files Affected:

(modified) llvm/include/llvm/Analysis/TargetTransformInfo.h (+7)
(modified) llvm/include/llvm/Analysis/TargetTransformInfoImpl.h (+2)
(modified) llvm/lib/Analysis/TargetTransformInfo.cpp (+4)
(modified) llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp (+22-3)
(modified) llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h (+9)
(modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+15-4)
(modified) llvm/test/Analysis/CostModel/AMDGPU/arith-uminmax.ll (+28-28)
(modified) llvm/test/Analysis/CostModel/AMDGPU/div.ll (+42-42)
(modified) llvm/test/Analysis/CostModel/AMDGPU/rem.ll (+42-42)
(modified) llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll (+216-216)
(added) llvm/test/CodeGen/AMDGPU/vectorize-i8-as-i32.ll (+88)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 2efca0d1d754f..17aa46774194a 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1907,6 +1907,10 @@ class TargetTransformInfo {
   /// pad to. Default is no padding.
   unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const;
 
+  /// \return Returns true if vectorizing 4 x i8s into an i32 is possible.
+  /// Currently only used by the SLP vectorizer.
+  bool canVectorizei8s() const;
+
   /// @}
 
   /// Collect kernel launch bounds for \p F into \p LB.
@@ -2363,6 +2367,7 @@ class TargetTransformInfo::Concept {
   virtual void collectKernelLaunchBounds(
       const Function &F,
       SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const = 0;
+  virtual bool canVectorizei8s() const = 0;
 };
 
 template <typename T>
@@ -3229,6 +3234,8 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
       SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const override {
     Impl.collectKernelLaunchBounds(F, LB);
   }
+
+  bool canVectorizei8s() const override { return Impl.canVectorizei8s(); }
 };
 
 template <typename T>
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 3fe0a9101fdee..9ab97328370bd 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1063,6 +1063,8 @@ class TargetTransformInfoImplBase {
 
   unsigned getMaxNumArgs() const { return UINT_MAX; }
 
+  bool canVectorizei8s() const { return false; }
+
   unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const {
     return 0;
   }
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 4fea4e5711f5a..ee0c3a9329a1f 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1419,6 +1419,10 @@ unsigned TargetTransformInfo::getMaxNumArgs() const {
   return TTIImpl->getMaxNumArgs();
 }
 
+bool TargetTransformInfo::canVectorizei8s() const {
+  return TTIImpl->canVectorizei8s();
+}
+
 bool TargetTransformInfo::shouldExpandReduction(const IntrinsicInst *II) const {
   return TTIImpl->shouldExpandReduction(II);
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 09f7877b13b3a..9c29402755e53 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -344,9 +344,10 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
 unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
   if (Opcode == Instruction::Load || Opcode == Instruction::Store)
     return 32 * 4 / ElemWidth;
-  return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
-       : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
-       : 1;
+  return ElemWidth == 8                                ? 4
+         : (ElemWidth == 16 && ST->has16BitInsts())    ? 2
+         : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
+                                                       : 1;
 }
 
 unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
@@ -1120,6 +1121,17 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
   }
 }
 
+InstructionCost GCNTTIImpl::getScalarizationOverhead(
+    VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract,
+    TTI::TargetCostKind CostKind, ArrayRef<Value *> VL) {
+  unsigned NumVectorElts = cast<FixedVectorType>(InTy)->getNumElements();
+  if (NumVectorElts > 1 &&
+      InTy->getElementType() == IntegerType::getInt8Ty(InTy->getContext()))
+    return 0;
+  return BaseT::getScalarizationOverhead(InTy, DemandedElts, Insert, Extract,
+                                         CostKind, VL);
+}
+
 InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                            VectorType *VT, ArrayRef<int> Mask,
                                            TTI::TargetCostKind CostKind,
@@ -1134,6 +1146,11 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
   // Larger vector widths may require additional instructions, but are
   // typically cheaper than scalarized versions.
   unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
+
+  if (NumVectorElts > 1 &&
+      VT->getElementType() == IntegerType::getInt8Ty(VT->getContext()))
+    return 0;
+
   if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
       DL.getTypeSizeInBits(VT->getElementType()) == 16) {
     bool HasVOP3P = ST->hasVOP3PInsts();
@@ -1423,3 +1440,5 @@ void GCNTTIImpl::collectKernelLaunchBounds(
   LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first});
   LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second});
 }
+
+bool GCNTTIImpl::canVectorizei8s() const { return true; }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index f5062070ac6f4..239becb9aab15 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -240,6 +240,12 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
 
   InstructionCost getVectorSplitCost() { return 0; }
 
+  InstructionCost getScalarizationOverhead(VectorType *InTy,
+                                           const APInt &DemandedElts,
+                                           bool Insert, bool Extract,
+                                           TTI::TargetCostKind CostKind,
+                                           ArrayRef<Value *> VL = {});
+
   InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
                                  ArrayRef<int> Mask,
                                  TTI::TargetCostKind CostKind, int Index,
@@ -282,6 +288,9 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
   void collectKernelLaunchBounds(
       const Function &F,
       SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const;
+
+  /// \return return true if we can pack 4 i8s into an i32.
+  bool canVectorizei8s() const;
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 8d411f2cb203a..efa1e77e4a2ce 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -12971,9 +12971,16 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
               LI0->getPointerAddressSpace(), CostKind);
 
         } else {
-          VecLdCost = TTI->getMemoryOpCost(
-              Instruction::Load, VecTy, LI0->getAlign(),
-              LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
+          if (VecTy->getElementType() ==
+                  IntegerType::getInt8Ty(VecTy->getContext()) &&
+              TTI->canVectorizei8s()) {
+            VecLdCost = 1;
+          } else {
+            VecLdCost =
+                TTI->getMemoryOpCost(Instruction::Load, VecTy, LI0->getAlign(),
+                                     LI0->getPointerAddressSpace(), CostKind,
+                                     TTI::OperandValueInfo());
+          }
         }
         break;
       case TreeEntry::StridedVectorize: {
@@ -20927,7 +20934,11 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
     // provided vectorization factor (i.e. the scalar type is used for vector
     // code during codegen).
     auto *VecTy = getWidenedType(ScalarTy, VF);
-    if (TTI->getNumberOfParts(VecTy) == VF)
+    unsigned NumParts = TTI->getNumberOfParts(VecTy);
+    if (TTI->canVectorizei8s() &&
+        VecTy->getElementType() == IntegerType::getInt8Ty(VecTy->getContext()))
+      NumParts = 1;
+    if (NumParts == VF)
       continue;
     for (unsigned I = NextInst; I < MaxInst; ++I) {
       unsigned ActualVF = std::min(MaxInst - I, VF);
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/arith-uminmax.ll b/llvm/test/Analysis/CostModel/AMDGPU/arith-uminmax.ll
index 2b3728b556d9e..2984c616e8f21 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/arith-uminmax.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/arith-uminmax.ll
@@ -47,13 +47,13 @@ define i32 @umax(i32 %arg) {
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.umax.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.umax.v16i32(<16 x i32> undef, <16 x i32> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = call <2 x i16> @llvm.umax.v2i16(<2 x i16> undef, <2 x i16> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.umax.v4i16(<4 x i16> undef, <4 x i16> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I16 = call <2 x i16> @llvm.umax.v2i16(<2 x i16> undef, <2 x i16> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I16 = call <4 x i16> @llvm.umax.v4i16(<4 x i16> undef, <4 x i16> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 126 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.umax.v2i8(<2 x i8> undef, <2 x i8> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.umax.v4i8(<4 x i8> undef, <4 x i8> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.umax.v8i8(<8 x i8> undef, <8 x i8> undef)
@@ -72,13 +72,13 @@ define i32 @umax(i32 %arg) {
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.umax.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.umax.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.umax.v2i16(<2 x i16> undef, <2 x i16> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.umax.v4i16(<4 x i16> undef, <4 x i16> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I16 = call <2 x i16> @llvm.umax.v2i16(<2 x i16> undef, <2 x i16> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.umax.v4i16(<4 x i16> undef, <4 x i16> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.umax.v2i8(<2 x i8> undef, <2 x i8> undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.umax.v4i8(<4 x i8> undef, <4 x i8> undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.umax.v8i8(<8 x i8> undef, <8 x i8> undef)
@@ -153,13 +153,13 @@ define i32 @umin(i32 %arg) {
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.umin.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.umin.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.umin.v16i32(<16 x i32> undef, <16 x i32> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = call <2 x i16> @llvm.umin.v2i16(<2 x i16> undef, <2 x i16> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.umin.v4i16(<4 x i16> undef, <4 x i16> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I16 = call <2 x i16> @llvm.umin.v2i16(<2 x i16> undef, <2 x i16> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I16 = call <4 x i16> @llvm.umin.v4i16(<4 x i16> undef, <4 x i16> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 126 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.umin.v2i8(<2 x i8> undef, <2 x i8> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.umin.v4i8(<4 x i8> undef, <4 x i8> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.umin.v8i8(<8 x i8> undef, <8 x i8> undef)
@@ -178,13 +178,13 @@ define i32 @umin(i32 %arg) {
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.umin.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.umin.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.umin.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.umin.v2i16(<2 x i16> undef, <2 x i16> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.umin.v4i16(<4 x i16> undef, <4 x i16> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I16 = call <2 x i16> @llvm.umin.v2i16(<2 x i16> undef, <2 x i16> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.umin.v4i16(<4 x i16> undef, <4 x i16> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.umin.v2i8(<2 x i8> undef, <2 x i8> undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.umin.v4i8(<4 x i8> undef, <4 x i8> undef)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.umin.v8i8(<8 x i8> undef, <8 x i8> undef)
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/div.ll b/llvm/test/Analysis/CostModel/AMDGPU/div.ll
index 459c41a0b3eb5..1d21b66166f62 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/div.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/div.ll
@@ -44,9 +44,9 @@ define i32 @sdiv() {
 ; SLOW-NEXT:  Co...
[truncated]

llvm/test/CodeGen/AMDGPU/vectorize-i8-as-i32.ll

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

alexey-bataev

The logic in the SLP vectorizer should be target-independent

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

llvm/lib/Analysis/TargetTransformInfo.cpp

llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll

llvm/test/Transforms/SLPVectorizer/AMDGPU/vectorize-i8-as-i32.ll

llvm/include/llvm/Analysis/TargetTransformInfo.h

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Precommit tests for PR: #134934

…n (#138801) Precommit tests for PR: llvm/llvm-project#134934

github-actions · 2025-06-10T20:27:26Z

✅ With the latest revision this PR passed the C/C++ code formatter.

jrbyrnes

LGTM but please wait for @arsenm for approval.

The one outstanding concern that we discussed offline (I think) is that this method of vectorizing does not encourage vectorization of other illegal types that may benefit from cross BB type coercion. We don't really have any strong present need that I know of, but we may encounter in the future. But, it is probably best to address such a need as it occurs rather than spending a bunch of time trying to future proof.

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

doru1004 · 2025-06-13T14:59:28Z

Thank you for pointing that out @arsenm. I used the condition you suggested please let me know if you have any further comments.

arsenm

LGTM with nits. I think the load/store costs need to be revisited, we're treating them as as cheap as cheap ALU instructions which they certainly are not (also ignoring unaligned cost)

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

doru1004 · 2025-06-17T14:07:08Z

LGTM with nits. I think the load/store costs need to be revisited, we're treating them as as cheap as cheap ALU instructions which they certainly are not (also ignoring unaligned cost)

This is totally fine in this case. This is how load/stores for other types are treated at this point in the code. As a sanity check: the total cost of the instruction itself is actually just 4x this value so no other costs are included for these instructions from the start. And it's the same for the other load/store types so this is not an exception.

doru1004 · 2025-06-17T18:11:36Z

LGTM with nits. I think the load/store costs need to be revisited, we're treating them as as cheap as cheap ALU instructions which they certainly are not (also ignoring unaligned cost)

Since this is a problem which affects all load/store instructions regardless of type we will address this in a future PR to maintain consistency.

This patch adds a release note that explains the current status of OpenACC in Clang. Currently we cannot actually make an executable because the OpenACC dialect of MLIR doesn't support any amount of lowering to LLVM-IR, so the usefulness of OpenACC is entirely for front-end related uses, such as tooling or semantic checking.

This patch adjusts the cost model to account for the ability of the AMDGPU optimizer to group together i8 values into i32 values. Co-authored-by: Erich Keane <[email protected]>

llvmbot added backend:AMDGPU vectorizers llvm:analysis Includes value tracking, cost tables and constant folding llvm:transforms labels Apr 8, 2025

doru1004 requested a review from jrbyrnes April 8, 2025 21:45

doru1004 requested a review from bcahoon April 8, 2025 21:46

doru1004 force-pushed the add-i8-vectorisation branch from 19b28a7 to fedc510 Compare April 8, 2025 23:09

fhahn reviewed Apr 9, 2025

View reviewed changes

llvm/test/CodeGen/AMDGPU/vectorize-i8-as-i32.ll Outdated Show resolved Hide resolved

doru1004 force-pushed the add-i8-vectorisation branch from fedc510 to 9a927fe Compare April 9, 2025 14:30

alexey-bataev reviewed Apr 9, 2025

View reviewed changes

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp Outdated Show resolved Hide resolved

doru1004 force-pushed the add-i8-vectorisation branch from 9a927fe to 3f92804 Compare April 9, 2025 14:33

alexey-bataev requested changes Apr 9, 2025

View reviewed changes

alexey-bataev reviewed Apr 9, 2025

View reviewed changes

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp Outdated Show resolved Hide resolved

jrbyrnes reviewed Apr 9, 2025

View reviewed changes

llvm/lib/Analysis/TargetTransformInfo.cpp Outdated Show resolved Hide resolved

jrbyrnes reviewed Apr 9, 2025

View reviewed changes

llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll Outdated Show resolved Hide resolved

shiltian requested review from jayfoad and arsenm April 14, 2025 04:29

shiltian reviewed Apr 14, 2025

View reviewed changes

llvm/test/Transforms/SLPVectorizer/AMDGPU/vectorize-i8-as-i32.ll Outdated Show resolved Hide resolved

shiltian reviewed Apr 14, 2025

View reviewed changes

llvm/test/Transforms/SLPVectorizer/AMDGPU/vectorize-i8-as-i32.ll Outdated Show resolved Hide resolved

shiltian reviewed Apr 14, 2025

View reviewed changes

llvm/include/llvm/Analysis/TargetTransformInfo.h Outdated Show resolved Hide resolved

shiltian reviewed Apr 14, 2025

View reviewed changes

llvm/include/llvm/Analysis/TargetTransformInfo.h Outdated Show resolved Hide resolved

doru1004 force-pushed the add-i8-vectorisation branch 2 times, most recently from cc100d8 to 9c0cdeb Compare April 15, 2025 15:57

shiltian reviewed Apr 15, 2025

View reviewed changes

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp Outdated Show resolved Hide resolved

shiltian reviewed Apr 15, 2025

View reviewed changes

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h Outdated Show resolved Hide resolved

shiltian reviewed Apr 15, 2025

View reviewed changes

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp Outdated Show resolved Hide resolved

doru1004 force-pushed the add-i8-vectorisation branch from 9c0cdeb to ce790a6 Compare April 15, 2025 16:42

doru1004 force-pushed the add-i8-vectorisation branch 4 times, most recently from cec2dc7 to 7eca9ac Compare May 8, 2025 05:07

doru1004 added a commit that referenced this pull request May 9, 2025

[AMDGPU][NFC] Add tests in preparation for i8 vectorization (#138801)

25a0319

Precommit tests for PR: #134934

llvm-sync bot pushed a commit to arm/arm-toolchain that referenced this pull request May 9, 2025

Automerge: [AMDGPU][NFC] Add tests in preparation for i8 vectorizatio…

cbd45a2

…n (#138801) Precommit tests for PR: llvm/llvm-project#134934

doru1004 force-pushed the add-i8-vectorisation branch from 7eca9ac to 703a497 Compare May 9, 2025 16:38

doru1004 force-pushed the add-i8-vectorisation branch 2 times, most recently from 32c7371 to edf6984 Compare June 10, 2025 20:25

doru1004 force-pushed the add-i8-vectorisation branch from edf6984 to ec8b007 Compare June 10, 2025 23:01

jrbyrnes reviewed Jun 12, 2025

View reviewed changes

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp Outdated Show resolved Hide resolved

doru1004 force-pushed the add-i8-vectorisation branch 2 times, most recently from e31e578 to f93da64 Compare June 12, 2025 21:09

arsenm reviewed Jun 13, 2025

View reviewed changes

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp Outdated Show resolved Hide resolved

doru1004 force-pushed the add-i8-vectorisation branch from f93da64 to b9666c8 Compare June 13, 2025 14:57

arsenm approved these changes Jun 17, 2025

View reviewed changes

doru1004 force-pushed the add-i8-vectorisation branch from b9666c8 to f788a42 Compare June 17, 2025 15:01

doru1004 force-pushed the add-i8-vectorisation branch 3 times, most recently from b797e8b to f968b80 Compare June 26, 2025 20:46

llvmbot added the clang Clang issues not falling into any other category label Jun 26, 2025

doru1004 force-pushed the add-i8-vectorisation branch from f968b80 to f1989e8 Compare June 26, 2025 21:15

doru1004 force-pushed the add-i8-vectorisation branch from f1989e8 to 9d0221e Compare June 26, 2025 22:12

doru1004 merged commit 3df36a2 into llvm:main Jun 26, 2025
7 checks passed

[AMDGPU] Enable vectorization of i8 values. #134934

[AMDGPU] Enable vectorization of i8 values. #134934

Uh oh!

Conversation

doru1004 commented Apr 8, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Apr 8, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Apr 8, 2025

Uh oh!

llvmbot commented Apr 8, 2025

Uh oh!

Uh oh!

Uh oh!

alexey-bataev left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

github-actions bot commented Jun 10, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

jrbyrnes left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

doru1004 commented Jun 13, 2025

Uh oh!

arsenm left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

doru1004 commented Jun 17, 2025

Uh oh!

doru1004 commented Jun 17, 2025

Uh oh!

Uh oh!

Uh oh!

doru1004 commented Apr 8, 2025 •

edited

Loading

llvmbot commented Apr 8, 2025 •

edited

Loading

github-actions bot commented Jun 10, 2025 •

edited

Loading