[AMDGPU] Enable vectorization of i8 values. #134934
Conversation
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-llvm-analysis @llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-amdgpu
Author: Gheorghe-Teodor Bercea (doru1004)
Changes: This patch enables the vectorization of i8 values for AMD GPUs.
Patch is 164.27 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/134934.diff 11 Files Affected:
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 2efca0d1d754f..17aa46774194a 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1907,6 +1907,10 @@ class TargetTransformInfo {
/// pad to. Default is no padding.
unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const;
+ /// \return Returns true if vectorizing 4 x i8s into an i32 is possible.
+ /// Currently only used by the SLP vectorizer.
+ bool canVectorizei8s() const;
+
/// @}
/// Collect kernel launch bounds for \p F into \p LB.
@@ -2363,6 +2367,7 @@ class TargetTransformInfo::Concept {
virtual void collectKernelLaunchBounds(
const Function &F,
SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const = 0;
+ virtual bool canVectorizei8s() const = 0;
};
template <typename T>
@@ -3229,6 +3234,8 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const override {
Impl.collectKernelLaunchBounds(F, LB);
}
+
+ bool canVectorizei8s() const override { return Impl.canVectorizei8s(); }
};
template <typename T>
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 3fe0a9101fdee..9ab97328370bd 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1063,6 +1063,8 @@ class TargetTransformInfoImplBase {
unsigned getMaxNumArgs() const { return UINT_MAX; }
+ bool canVectorizei8s() const { return false; }
+
unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const {
return 0;
}
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 4fea4e5711f5a..ee0c3a9329a1f 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1419,6 +1419,10 @@ unsigned TargetTransformInfo::getMaxNumArgs() const {
return TTIImpl->getMaxNumArgs();
}
+bool TargetTransformInfo::canVectorizei8s() const {
+ return TTIImpl->canVectorizei8s();
+}
+
bool TargetTransformInfo::shouldExpandReduction(const IntrinsicInst *II) const {
return TTIImpl->shouldExpandReduction(II);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 09f7877b13b3a..9c29402755e53 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -344,9 +344,10 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
if (Opcode == Instruction::Load || Opcode == Instruction::Store)
return 32 * 4 / ElemWidth;
- return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
- : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
- : 1;
+ return ElemWidth == 8 ? 4
+ : (ElemWidth == 16 && ST->has16BitInsts()) ? 2
+ : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
+ : 1;
}
unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
@@ -1120,6 +1121,17 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
}
}
+InstructionCost GCNTTIImpl::getScalarizationOverhead(
+ VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract,
+ TTI::TargetCostKind CostKind, ArrayRef<Value *> VL) {
+ unsigned NumVectorElts = cast<FixedVectorType>(InTy)->getNumElements();
+ if (NumVectorElts > 1 &&
+ InTy->getElementType() == IntegerType::getInt8Ty(InTy->getContext()))
+ return 0;
+ return BaseT::getScalarizationOverhead(InTy, DemandedElts, Insert, Extract,
+ CostKind, VL);
+}
+
InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
VectorType *VT, ArrayRef<int> Mask,
TTI::TargetCostKind CostKind,
@@ -1134,6 +1146,11 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
// Larger vector widths may require additional instructions, but are
// typically cheaper than scalarized versions.
unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
+
+ if (NumVectorElts > 1 &&
+ VT->getElementType() == IntegerType::getInt8Ty(VT->getContext()))
+ return 0;
+
if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
DL.getTypeSizeInBits(VT->getElementType()) == 16) {
bool HasVOP3P = ST->hasVOP3PInsts();
@@ -1423,3 +1440,5 @@ void GCNTTIImpl::collectKernelLaunchBounds(
LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first});
LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second});
}
+
+bool GCNTTIImpl::canVectorizei8s() const { return true; }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index f5062070ac6f4..239becb9aab15 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -240,6 +240,12 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
InstructionCost getVectorSplitCost() { return 0; }
+ InstructionCost getScalarizationOverhead(VectorType *InTy,
+ const APInt &DemandedElts,
+ bool Insert, bool Extract,
+ TTI::TargetCostKind CostKind,
+ ArrayRef<Value *> VL = {});
+
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
ArrayRef<int> Mask,
TTI::TargetCostKind CostKind, int Index,
@@ -282,6 +288,9 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
void collectKernelLaunchBounds(
const Function &F,
SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const;
+
+ /// \return return true if we can pack 4 i8s into an i32.
+ bool canVectorizei8s() const;
};
} // end namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 8d411f2cb203a..efa1e77e4a2ce 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -12971,9 +12971,16 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
LI0->getPointerAddressSpace(), CostKind);
} else {
- VecLdCost = TTI->getMemoryOpCost(
- Instruction::Load, VecTy, LI0->getAlign(),
- LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
+ if (VecTy->getElementType() ==
+ IntegerType::getInt8Ty(VecTy->getContext()) &&
+ TTI->canVectorizei8s()) {
+ VecLdCost = 1;
+ } else {
+ VecLdCost =
+ TTI->getMemoryOpCost(Instruction::Load, VecTy, LI0->getAlign(),
+ LI0->getPointerAddressSpace(), CostKind,
+ TTI::OperandValueInfo());
+ }
}
break;
case TreeEntry::StridedVectorize: {
@@ -20927,7 +20934,11 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
// provided vectorization factor (i.e. the scalar type is used for vector
// code during codegen).
auto *VecTy = getWidenedType(ScalarTy, VF);
- if (TTI->getNumberOfParts(VecTy) == VF)
+ unsigned NumParts = TTI->getNumberOfParts(VecTy);
+ if (TTI->canVectorizei8s() &&
+ VecTy->getElementType() == IntegerType::getInt8Ty(VecTy->getContext()))
+ NumParts = 1;
+ if (NumParts == VF)
continue;
for (unsigned I = NextInst; I < MaxInst; ++I) {
unsigned ActualVF = std::min(MaxInst - I, VF);
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/arith-uminmax.ll b/llvm/test/Analysis/CostModel/AMDGPU/arith-uminmax.ll
index 2b3728b556d9e..2984c616e8f21 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/arith-uminmax.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/arith-uminmax.ll
@@ -47,13 +47,13 @@ define i32 @umax(i32 %arg) {
; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> undef, <4 x i32> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.umax.v8i32(<8 x i32> undef, <8 x i32> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.umax.v16i32(<16 x i32> undef, <16 x i32> undef)
-; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef)
-; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = call <2 x i16> @llvm.umax.v2i16(<2 x i16> undef, <2 x i16> undef)
-; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.umax.v4i16(<4 x i16> undef, <4 x i16> undef)
-; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef)
-; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef)
-; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef)
-; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I16 = call <2 x i16> @llvm.umax.v2i16(<2 x i16> undef, <2 x i16> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I16 = call <4 x i16> @llvm.umax.v4i16(<4 x i16> undef, <4 x i16> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.umax.v2i8(<2 x i8> undef, <2 x i8> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.umax.v4i8(<4 x i8> undef, <4 x i8> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.umax.v8i8(<8 x i8> undef, <8 x i8> undef)
@@ -72,13 +72,13 @@ define i32 @umax(i32 %arg) {
; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> undef, <4 x i32> undef)
; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.umax.v8i32(<8 x i32> undef, <8 x i32> undef)
; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.umax.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef)
-; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.umax.v2i16(<2 x i16> undef, <2 x i16> undef)
-; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.umax.v4i16(<4 x i16> undef, <4 x i16> undef)
-; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I16 = call <2 x i16> @llvm.umax.v2i16(<2 x i16> undef, <2 x i16> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.umax.v4i16(<4 x i16> undef, <4 x i16> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef)
; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.umax.v2i8(<2 x i8> undef, <2 x i8> undef)
; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.umax.v4i8(<4 x i8> undef, <4 x i8> undef)
; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.umax.v8i8(<8 x i8> undef, <8 x i8> undef)
@@ -153,13 +153,13 @@ define i32 @umin(i32 %arg) {
; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.umin.v4i32(<4 x i32> undef, <4 x i32> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.umin.v8i32(<8 x i32> undef, <8 x i32> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.umin.v16i32(<16 x i32> undef, <16 x i32> undef)
-; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef)
-; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = call <2 x i16> @llvm.umin.v2i16(<2 x i16> undef, <2 x i16> undef)
-; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.umin.v4i16(<4 x i16> undef, <4 x i16> undef)
-; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef)
-; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef)
-; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef)
-; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I16 = call <2 x i16> @llvm.umin.v2i16(<2 x i16> undef, <2 x i16> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I16 = call <4 x i16> @llvm.umin.v4i16(<4 x i16> undef, <4 x i16> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.umin.v2i8(<2 x i8> undef, <2 x i8> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.umin.v4i8(<4 x i8> undef, <4 x i8> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.umin.v8i8(<8 x i8> undef, <8 x i8> undef)
@@ -178,13 +178,13 @@ define i32 @umin(i32 %arg) {
; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.umin.v4i32(<4 x i32> undef, <4 x i32> undef)
; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.umin.v8i32(<8 x i32> undef, <8 x i32> undef)
; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.umin.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef)
-; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.umin.v2i16(<2 x i16> undef, <2 x i16> undef)
-; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.umin.v4i16(<4 x i16> undef, <4 x i16> undef)
-; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I16 = call <2 x i16> @llvm.umin.v2i16(<2 x i16> undef, <2 x i16> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.umin.v4i16(<4 x i16> undef, <4 x i16> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef)
; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.umin.v2i8(<2 x i8> undef, <2 x i8> undef)
; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.umin.v4i8(<4 x i8> undef, <4 x i8> undef)
; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.umin.v8i8(<8 x i8> undef, <8 x i8> undef)
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/div.ll b/llvm/test/Analysis/CostModel/AMDGPU/div.ll
index 459c41a0b3eb5..1d21b66166f62 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/div.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/div.ll
@@ -44,9 +44,9 @@ define i32 @sdiv() {
; SLOW-NEXT: Co...
[truncated]
The logic in the SLP vectorizer should be target-independent
llvm/test/Transforms/SLPVectorizer/AMDGPU/vectorize-i8-as-i32.ll (two review comment threads on this file, resolved and marked outdated)
; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = add <4 x i8> undef, undef
; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v5i8 = add <5 x i8> undef, undef
; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v6i8 = add <6 x i8> undef, undef
; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i8 = add <32 x i8> undef, undef
The cost of v32i8 add should be 32
https://godbolt.org/z/o5W1Y1oer
In the ISA, we don't have an instruction to do vectorized i8 adds. In the example, we see that the v32i8 vector add is scalarized into a bunch of individual i8 adds. Given that we need 32 instructions, the cost of the v32i8 add should be 32.
The result of changing the cost to 8 is that the vector version has a cost savings of 24 over the scalar version. Thus, for a vectorizable chain, the vectorized version can degrade relative to the scalar version by a cost of up to 23. Since the cost savings of 24 doesn't actually translate into improved assembly, this means we will allow code degradation for no benefit.
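For reference, a minimal IR sketch (hypothetical, not taken from the patch's tests) of the case being discussed; since the ISA has no packed byte add, the backend is expected to expand the vector add below into roughly one scalar add per element, which is why a cost of 32 matches the generated code:
; Hypothetical reproducer for the <32 x i8> cost question on AMDGPU.
; With no packed i8 add instruction, this is scalarized into roughly
; 32 individual byte adds during lowering.
define <32 x i8> @add_v32i8(<32 x i8> %a, <32 x i8> %b) {
  %r = add <32 x i8> %a, %b
  ret <32 x i8> %r
}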
Are the other costs OK, or is it just the cost for 32 that is the problem?
Are you expecting the following code to be worth vectorizing from a cost model perspective?
%el0 = extractelement <16 x i8> %invec, i64 0
%el1 = extractelement <16 x i8> %invec, i64 1
%el2 = extractelement <16 x i8> %invec, i64 2
%el3 = extractelement <16 x i8> %invec, i64 3
%mul0 = mul i8 %el0, 1
%mul1 = mul i8 %el1, 1
%mul2 = mul i8 %el2, 1
%mul3 = mul i8 %el3, 1
%add0 = add i8 %mul0, 1
%add1 = add i8 %mul1, 1
%add2 = add i8 %mul2, 1
%add3 = add i8 %mul3, 1
%vecins0 = insertelement <16 x i8> poison, i8 %add0, i64 0
%vecins1 = insertelement <16 x i8> %vecins0, i8 %add1, i64 1
%vecins2 = insertelement <16 x i8> %vecins1, i8 %add2, i64 2
%vecins3 = insertelement <16 x i8> %vecins2, i8 %add3, i64 3
store <16 x i8> %vecins3, ptr %out
ret void
Because with the patch it currently gets transformed into this:
%0 = shufflevector <16 x i8> %invec, <16 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = mul <4 x i8> %0, splat (i8 1)
%2 = add <4 x i8> %1, splat (i8 1)
%3 = shufflevector <4 x i8> %2, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
store <16 x i8> %3, ptr %out, align 16
ret void
So there is no benefit to having this vectorized code?
OK, I just verified that the assembly for the two cases above is more or less identical, in which case the current cost model is already correct in that regard.
This patch enables the vectorization of i8 values for AMD GPUs.
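For illustration only (hypothetical function and value names, not part of the patch or its tests), this is the kind of scalar i8 chain that the SLP vectorizer can now treat as a single 32-bit-wide load/add/store on AMDGPU:
; Four adjacent byte loads, adds, and stores that SLP can pack together.
define void @add_four_bytes(ptr %dst, ptr %src) {
  %s0 = load i8, ptr %src, align 4
  %p1 = getelementptr inbounds i8, ptr %src, i64 1
  %s1 = load i8, ptr %p1, align 1
  %p2 = getelementptr inbounds i8, ptr %src, i64 2
  %s2 = load i8, ptr %p2, align 2
  %p3 = getelementptr inbounds i8, ptr %src, i64 3
  %s3 = load i8, ptr %p3, align 1
  %a0 = add i8 %s0, 1
  %a1 = add i8 %s1, 1
  %a2 = add i8 %s2, 1
  %a3 = add i8 %s3, 1
  store i8 %a0, ptr %dst, align 4
  %q1 = getelementptr inbounds i8, ptr %dst, i64 1
  store i8 %a1, ptr %q1, align 1
  %q2 = getelementptr inbounds i8, ptr %dst, i64 2
  store i8 %a2, ptr %q2, align 2
  %q3 = getelementptr inbounds i8, ptr %dst, i64 3
  store i8 %a3, ptr %q3, align 1
  ret void
}
; With the patch, SLP can instead emit roughly:
;   %v  = load <4 x i8>, ptr %src, align 4
;   %vr = add <4 x i8> %v, splat (i8 1)
;   store <4 x i8> %vr, ptr %dst, align 4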