Skip to content

[RISCV][TTI] Use processShuffleMask for shuffle legalization estimate #136191

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Apr 22, 2025
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 61 additions & 41 deletions llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -383,6 +383,61 @@ static VectorType *getVRGatherIndexType(MVT DataVT, const RISCVSubtarget &ST,
return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
}

/// Estimate the cost of a fixed-length shuffle whose type has to be split into
/// several legal registers during legalization.
///
/// The mask is padded with poison up to a whole number of \p LegalVT registers
/// and handed to processShuffleMasks. Every per-register shuffle it reports is
/// priced with TTI.getShuffleCost on a vector of LegalVT's element count:
/// single-source pieces as SK_PermuteSingleSrc, two-source pieces as
/// SK_PermuteTwoSrc. Identity single-source pieces are free, and a repeated
/// (mask, source register) pair is only charged once.
///
/// processShuffleMasks is not an exact proxy for the algorithm used in
/// LegalizeVectorTypes, but it is hoped to be a reasonably close upper bound.
///
/// \returns the accumulated cost, or an invalid cost when the mask fits in a
/// single legal register, when legalization changes the element width, or
/// when \p LegalVT holds at least as many elements as \p Tp.
static InstructionCost costShuffleViaSplitting(RISCVTTIImpl &TTI, MVT LegalVT,
                                               VectorType *Tp,
                                               ArrayRef<int> Mask,
                                               TTI::TargetCostKind CostKind) {
  assert(LegalVT.isFixedLengthVector() && !Mask.empty());
  const unsigned EltsPerReg = LegalVT.getVectorNumElements();
  // How many legal registers the shuffle result occupies.
  const unsigned DestCount = divideCeil(Mask.size(), EltsPerReg);

  // Multiple sources are permuted into multiple destinations. Only splits
  // that keep the element type unchanged are modelled here; everything else
  // is reported as "no estimate".
  if (DestCount <= 1)
    return InstructionCost::getInvalid();
  if (LegalVT.getVectorElementType().getSizeInBits() !=
      Tp->getElementType()->getPrimitiveSizeInBits())
    return InstructionCost::getInvalid();
  if (EltsPerReg >= Tp->getElementCount().getFixedValue())
    return InstructionCost::getInvalid();

  // How many legal registers one source operand occupies, derived from the
  // store sizes of the original and the legal type.
  const unsigned SrcBytes = TTI.getDataLayout().getTypeStoreSize(Tp);
  const unsigned RegBytes = LegalVT.getStoreSize();
  const unsigned SrcCount = divideCeil(SrcBytes, RegBytes);

  // Type of a single legal-register-sized piece of the shuffle.
  auto *RegTy = FixedVectorType::get(Tp->getElementType(), EltsPerReg);

  // Widen the mask to cover max(sources, destinations) whole registers; the
  // lanes beyond the original mask stay poison.
  const unsigned PaddedVF = EltsPerReg * std::max(SrcCount, DestCount);
  const unsigned RegCount = PaddedVF / EltsPerReg;
  SmallVector<int> PaddedMask(PaddedVF, PoisonMaskElem);
  assert(PaddedVF >= Mask.size() &&
         "Normalized mask expected to be not shorter than original mask.");
  std::copy(Mask.begin(), Mask.end(), PaddedMask.begin());

  InstructionCost Total = 0;
  // (sub-mask, source register) pairs that have already been charged.
  SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> SeenSingleSrc;

  // Single-source piece: skip identities and anything already paid for.
  auto CostSingleSrc = [&](ArrayRef<int> RegMask, unsigned SrcReg,
                           unsigned /*DestReg*/) {
    if (ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size()) ||
        !SeenSingleSrc.insert(std::make_pair(RegMask, SrcReg)).second)
      return;
    Total += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, RegTy, RegMask,
                                CostKind, 0, nullptr);
  };

  // Two-source piece: always charged.
  auto CostTwoSrc = [&](ArrayRef<int> RegMask, unsigned /*Idx1*/,
                        unsigned /*Idx2*/, bool /*NewReg*/) {
    Total += TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, RegTy, RegMask,
                                CostKind, 0, nullptr);
  };

  processShuffleMasks(PaddedMask, RegCount, RegCount, RegCount, []() {},
                      CostSingleSrc, CostTwoSrc);
  return Total;
}

/// Try to perform better estimation of the permutation.
/// 1. Split the source/destination vectors into real registers.
/// 2. Do the mask analysis to identify which real registers are
Expand Down Expand Up @@ -645,48 +700,13 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
return true;
}
};
// We are going to permute multiple sources and the result will be in
// multiple destinations. Providing an accurate cost only for splits where
// the element type remains the same.

if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
shouldSplit(Kind) &&
LT.second.getVectorElementType().getSizeInBits() ==
Tp->getElementType()->getPrimitiveSizeInBits() &&
LT.second.getVectorNumElements() <
cast<FixedVectorType>(Tp)->getNumElements() &&
divideCeil(Mask.size(),
cast<FixedVectorType>(Tp)->getNumElements()) ==
static_cast<unsigned>(*LT.first.getValue())) {
unsigned NumRegs = *LT.first.getValue();
unsigned VF = cast<FixedVectorType>(Tp)->getNumElements();
unsigned SubVF = PowerOf2Ceil(VF / NumRegs);
auto *SubVecTy = FixedVectorType::get(Tp->getElementType(), SubVF);

InstructionCost Cost = 0;
for (unsigned I = 0, NumSrcRegs = divideCeil(Mask.size(), SubVF);
I < NumSrcRegs; ++I) {
bool IsSingleVector = true;
SmallVector<int> SubMask(SubVF, PoisonMaskElem);
transform(
Mask.slice(I * SubVF,
I == NumSrcRegs - 1 ? Mask.size() % SubVF : SubVF),
SubMask.begin(), [&](int I) -> int {
if (I == PoisonMaskElem)
return PoisonMaskElem;
bool SingleSubVector = I / VF == 0;
IsSingleVector &= SingleSubVector;
return (SingleSubVector ? 0 : 1) * SubVF + (I % VF) % SubVF;
});
if (all_of(enumerate(SubMask), [](auto &&P) {
return P.value() == PoisonMaskElem ||
static_cast<unsigned>(P.value()) == P.index();
}))
continue;
Cost += getShuffleCost(IsSingleVector ? TTI::SK_PermuteSingleSrc
: TTI::SK_PermuteTwoSrc,
SubVecTy, SubMask, CostKind, 0, nullptr);
}
return Cost;
shouldSplit(Kind)) {
InstructionCost SplitCost =
costShuffleViaSplitting(*this, LT.second, FVTp, Mask, CostKind);
if (SplitCost.isValid())
return SplitCost;
}
}

Expand Down
Loading
Loading