Skip to content

Commit 7831c5e

Browse files
authored
[VectorCombine] Pull out TargetCostKind argument to allow globally set cost kind value (llvm#118652)
Don't hard-code TCK_RecipThroughput independently in every VectorCombine fold; instead, pass a single TargetCostKind to the VectorCombine constructor and use it throughout. This is prep work for a potential future patch that uses VectorCombine to optimise for code size in -Os/-Oz builds (setting TCK_CodeSize instead of TCK_RecipThroughput). There's still more cleanup to do, as a lot of get*Cost calls still rely on the default TargetCostKind value (usually TCK_RecipThroughput, but not always).
1 parent 669f704 commit 7831c5e

File tree

1 file changed

+25
-45
lines changed

1 file changed

+25
-45
lines changed

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

+25-45
Original file line numberDiff line numberDiff line change
@@ -67,9 +67,10 @@ class VectorCombine {
6767
public:
6868
VectorCombine(Function &F, const TargetTransformInfo &TTI,
6969
const DominatorTree &DT, AAResults &AA, AssumptionCache &AC,
70-
const DataLayout *DL, bool TryEarlyFoldsOnly)
70+
const DataLayout *DL, TTI::TargetCostKind CostKind,
71+
bool TryEarlyFoldsOnly)
7172
: F(F), Builder(F.getContext()), TTI(TTI), DT(DT), AA(AA), AC(AC), DL(DL),
72-
TryEarlyFoldsOnly(TryEarlyFoldsOnly) {}
73+
CostKind(CostKind), TryEarlyFoldsOnly(TryEarlyFoldsOnly) {}
7374

7475
bool run();
7576

@@ -81,6 +82,7 @@ class VectorCombine {
8182
AAResults &AA;
8283
AssumptionCache &AC;
8384
const DataLayout *DL;
85+
TTI::TargetCostKind CostKind;
8486

8587
/// If true, only perform beneficial early IR transforms. Do not introduce new
8688
/// vector operations.
@@ -249,7 +251,6 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
249251
InstructionCost OldCost =
250252
TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS);
251253
APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
252-
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
253254
OldCost +=
254255
TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
255256
/* Insert */ true, HasExtract, CostKind);
@@ -329,11 +330,11 @@ bool VectorCombine::widenSubvectorLoad(Instruction &I) {
329330
// undef value is 0. We could add that cost if the cost model accurately
330331
// reflects the real cost of that operation.
331332
InstructionCost OldCost =
332-
TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS);
333+
TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
333334

334335
// New pattern: load PtrOp
335336
InstructionCost NewCost =
336-
TTI.getMemoryOpCost(Instruction::Load, Ty, Alignment, AS);
337+
TTI.getMemoryOpCost(Instruction::Load, Ty, Alignment, AS, CostKind);
337338

338339
// We can aggressively convert to the vector form because the backend can
339340
// invert this transform if it does not result in a performance win.
@@ -366,7 +367,6 @@ ExtractElementInst *VectorCombine::getShuffleExtract(
366367
return nullptr;
367368

368369
Type *VecTy = Ext0->getVectorOperand()->getType();
369-
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
370370
assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
371371
InstructionCost Cost0 =
372372
TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
@@ -436,7 +436,6 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
436436
// both sequences.
437437
unsigned Ext0Index = Ext0IndexC->getZExtValue();
438438
unsigned Ext1Index = Ext1IndexC->getZExtValue();
439-
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
440439

441440
InstructionCost Extract0Cost =
442441
TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Ext0Index);
@@ -683,7 +682,6 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) {
683682
Mask[Index] = Index + NumElts;
684683

685684
Type *ScalarTy = VecTy->getScalarType();
686-
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
687685
InstructionCost OldCost =
688686
TTI.getArithmeticInstrCost(Instruction::FNeg, ScalarTy) +
689687
TTI.getVectorInstrCost(I, VecTy, CostKind, Index);
@@ -772,21 +770,20 @@ bool VectorCombine::foldBitcastShuffle(Instruction &I) {
772770
unsigned NumOps = IsUnary ? 1 : 2;
773771

774772
// The new shuffle must not cost more than the old shuffle.
775-
TargetTransformInfo::TargetCostKind CK =
776-
TargetTransformInfo::TCK_RecipThroughput;
777773
TargetTransformInfo::ShuffleKind SK =
778774
IsUnary ? TargetTransformInfo::SK_PermuteSingleSrc
779775
: TargetTransformInfo::SK_PermuteTwoSrc;
780776

781777
InstructionCost DestCost =
782-
TTI.getShuffleCost(SK, NewShuffleTy, NewMask, CK) +
778+
TTI.getShuffleCost(SK, NewShuffleTy, NewMask, CostKind) +
783779
(NumOps * TTI.getCastInstrCost(Instruction::BitCast, NewShuffleTy, SrcTy,
784780
TargetTransformInfo::CastContextHint::None,
785-
CK));
781+
CostKind));
786782
InstructionCost SrcCost =
787-
TTI.getShuffleCost(SK, SrcTy, Mask, CK) +
783+
TTI.getShuffleCost(SK, SrcTy, Mask, CostKind) +
788784
TTI.getCastInstrCost(Instruction::BitCast, DestTy, OldShuffleTy,
789-
TargetTransformInfo::CastContextHint::None, CK);
785+
TargetTransformInfo::CastContextHint::None,
786+
CostKind);
790787
if (DestCost > SrcCost || !DestCost.isValid())
791788
return false;
792789

@@ -841,7 +838,6 @@ bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
841838
// Calculate cost of splatting both operands into vectors and the vector
842839
// intrinsic
843840
VectorType *VecTy = cast<VectorType>(VPI.getType());
844-
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
845841
SmallVector<int> Mask;
846842
if (auto *FVTy = dyn_cast<FixedVectorType>(VecTy))
847843
Mask.resize(FVTy->getNumElements(), 0);
@@ -1003,7 +999,6 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
1003999

10041000
// Get cost estimate for the insert element. This cost will factor into
10051001
// both sequences.
1006-
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
10071002
InstructionCost InsertCost = TTI.getVectorInstrCost(
10081003
Instruction::InsertElement, VecTy, CostKind, Index);
10091004
InstructionCost OldCost =
@@ -1080,7 +1075,7 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
10801075

10811076
auto *Ext0 = cast<ExtractElementInst>(I0);
10821077
auto *Ext1 = cast<ExtractElementInst>(I1);
1083-
ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1);
1078+
ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1, CostKind);
10841079
if (!ConvertToShuf)
10851080
return false;
10861081
assert((ConvertToShuf == Ext0 || ConvertToShuf == Ext1) &&
@@ -1089,13 +1084,12 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
10891084
// The original scalar pattern is:
10901085
// binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1)
10911086
CmpInst::Predicate Pred = P0;
1092-
unsigned CmpOpcode = CmpInst::isFPPredicate(Pred) ? Instruction::FCmp
1093-
: Instruction::ICmp;
1087+
unsigned CmpOpcode =
1088+
CmpInst::isFPPredicate(Pred) ? Instruction::FCmp : Instruction::ICmp;
10941089
auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
10951090
if (!VecTy)
10961091
return false;
10971092

1098-
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
10991093
InstructionCost Ext0Cost =
11001094
TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0),
11011095
Ext1Cost =
@@ -1386,7 +1380,6 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
13861380
}
13871381

13881382
auto *Index = dyn_cast<ConstantInt>(UI->getOperand(1));
1389-
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13901383
OriginalCost +=
13911384
TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
13921385
Index ? Index->getZExtValue() : -1);
@@ -1480,8 +1473,6 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
14801473
}
14811474

14821475
// Try to merge shuffles across the binop if the new shuffles are not costly.
1483-
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1484-
14851476
InstructionCost OldCost =
14861477
TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind) +
14871478
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, BinOpTy,
@@ -1575,8 +1566,6 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
15751566
}
15761567

15771568
// Try to replace a binop with a shuffle if the shuffle is not costly.
1578-
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1579-
15801569
InstructionCost OldCost =
15811570
TTI.getArithmeticInstrCost(B0->getOpcode(), BinOpTy, CostKind) +
15821571
TTI.getArithmeticInstrCost(B1->getOpcode(), BinOpTy, CostKind) +
@@ -1672,8 +1661,6 @@ bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
16721661
FixedVectorType::get(CastSrcTy->getScalarType(), NewMask.size());
16731662

16741663
// Try to replace a castop with a shuffle if the shuffle is not costly.
1675-
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1676-
16771664
InstructionCost CostC0 =
16781665
TTI.getCastInstrCost(C0->getOpcode(), CastDstTy, CastSrcTy,
16791666
TTI::CastContextHint::None, CostKind);
@@ -1767,8 +1754,6 @@ bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
17671754
}
17681755

17691756
// Try to merge the shuffles if the new shuffle is not costly.
1770-
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1771-
17721757
InstructionCost InnerCost0 =
17731758
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleSrcTy,
17741759
InnerMask0, CostKind, 0, nullptr, {V0, U0}, ShufI0);
@@ -1837,12 +1822,10 @@ bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
18371822
return false;
18381823

18391824
InstructionCost OldCost =
1840-
TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0),
1841-
TTI::TCK_RecipThroughput) +
1842-
TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II1),
1843-
TTI::TCK_RecipThroughput) +
1825+
TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind) +
1826+
TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II1), CostKind) +
18441827
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, II0Ty, OldMask,
1845-
TTI::TCK_RecipThroughput, 0, nullptr, {II0, II1}, &I);
1828+
CostKind, 0, nullptr, {II0, II1}, &I);
18461829

18471830
SmallVector<Type *> NewArgsTy;
18481831
InstructionCost NewCost = 0;
@@ -1854,10 +1837,10 @@ bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
18541837
NewArgsTy.push_back(FixedVectorType::get(VecTy->getElementType(),
18551838
VecTy->getNumElements() * 2));
18561839
NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
1857-
VecTy, OldMask, TTI::TCK_RecipThroughput);
1840+
VecTy, OldMask, CostKind);
18581841
}
18591842
IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
1860-
NewCost += TTI.getIntrinsicInstrCost(NewAttr, TTI::TCK_RecipThroughput);
1843+
NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
18611844

18621845
LLVM_DEBUG(dbgs() << "Found a shuffle feeding two intrinsics: " << I
18631846
<< "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
@@ -1923,7 +1906,7 @@ generateInstLaneVectorFromOperand(ArrayRef<InstLane> Item, int Op) {
19231906
}
19241907

19251908
/// Detect concat of multiple values into a vector
1926-
static bool isFreeConcat(ArrayRef<InstLane> Item,
1909+
static bool isFreeConcat(ArrayRef<InstLane> Item, TTI::TargetCostKind CostKind,
19271910
const TargetTransformInfo &TTI) {
19281911
auto *Ty = cast<FixedVectorType>(Item.front().first->get()->getType());
19291912
unsigned NumElts = Ty->getNumElements();
@@ -1934,8 +1917,7 @@ static bool isFreeConcat(ArrayRef<InstLane> Item,
19341917
// during legalization.
19351918
SmallVector<int, 16> ConcatMask(NumElts * 2);
19361919
std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
1937-
if (TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, Ty, ConcatMask,
1938-
TTI::TCK_RecipThroughput) != 0)
1920+
if (TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, Ty, ConcatMask, CostKind) != 0)
19391921
return false;
19401922

19411923
unsigned NumSlices = Item.size() / NumElts;
@@ -2189,7 +2171,7 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
21892171
}
21902172
}
21912173

2192-
if (isFreeConcat(Item, TTI)) {
2174+
if (isFreeConcat(Item, CostKind, TTI)) {
21932175
ConcatLeafs.insert(FrontU);
21942176
continue;
21952177
}
@@ -2367,7 +2349,6 @@ bool VectorCombine::foldCastFromReductions(Instruction &I) {
23672349
auto *ReductionSrcTy = cast<VectorType>(ReductionSrc->getType());
23682350
Type *ResultTy = I.getType();
23692351

2370-
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
23712352
InstructionCost OldCost = TTI.getArithmeticReductionCost(
23722353
ReductionOpc, ReductionSrcTy, std::nullopt, CostKind);
23732354
OldCost += TTI.getCastInstrCost(CastOpc, ReductionSrcTy, SrcTy,
@@ -2717,7 +2698,7 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
27172698
/// lshr((zext(x),y) -> zext(lshr(x,trunc(y)))
27182699
/// Cost model calculations takes into account if zext(x) has other users and
27192700
/// whether it can be propagated through them too.
2720-
bool VectorCombine::shrinkType(llvm::Instruction &I) {
2701+
bool VectorCombine::shrinkType(Instruction &I) {
27212702
Value *ZExted, *OtherOperand;
27222703
if (!match(&I, m_c_BitwiseLogic(m_ZExt(m_Value(ZExted)),
27232704
m_Value(OtherOperand))) &&
@@ -2746,7 +2727,6 @@ bool VectorCombine::shrinkType(llvm::Instruction &I) {
27462727

27472728
// Calculate costs of leaving current IR as it is and moving ZExt operation
27482729
// later, along with adding truncates if needed
2749-
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
27502730
InstructionCost ZExtCost = TTI.getCastInstrCost(
27512731
Instruction::ZExt, BigTy, SmallTy,
27522732
TargetTransformInfo::CastContextHint::None, CostKind);
@@ -2833,7 +2813,6 @@ bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
28332813
auto *Ins = cast<InsertElementInst>(&I);
28342814
auto *Ext = cast<ExtractElementInst>(I.getOperand(1));
28352815

2836-
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
28372816
InstructionCost OldCost =
28382817
TTI.getVectorInstrCost(*Ext, VecTy, CostKind, ExtIdx) +
28392818
TTI.getVectorInstrCost(*Ins, VecTy, CostKind, InsIdx);
@@ -2981,7 +2960,8 @@ PreservedAnalyses VectorCombinePass::run(Function &F,
29812960
DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
29822961
AAResults &AA = FAM.getResult<AAManager>(F);
29832962
const DataLayout *DL = &F.getDataLayout();
2984-
VectorCombine Combiner(F, TTI, DT, AA, AC, DL, TryEarlyFoldsOnly);
2963+
VectorCombine Combiner(F, TTI, DT, AA, AC, DL, TTI::TCK_RecipThroughput,
2964+
TryEarlyFoldsOnly);
29852965
if (!Combiner.run())
29862966
return PreservedAnalyses::all();
29872967
PreservedAnalyses PA;

0 commit comments

Comments
 (0)