Skip to content

Commit 1f4baf2

Browse files
committed
Enable vectorization of i8 values.
1 parent a5024cd commit 1f4baf2

File tree

4 files changed

+1519
-3
lines changed

4 files changed

+1519
-3
lines changed

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

+32-3
Original file line numberDiff line numberDiff line change
@@ -344,9 +344,10 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
344344
unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
345345
if (Opcode == Instruction::Load || Opcode == Instruction::Store)
346346
return 32 * 4 / ElemWidth;
347-
return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
348-
: (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
349-
: 1;
347+
return ElemWidth == 8 ? 4
348+
: (ElemWidth == 16 && ST->has16BitInsts()) ? 2
349+
: (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
350+
: 1;
350351
}
351352

352353
unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
@@ -1422,3 +1423,31 @@ void GCNTTIImpl::collectKernelLaunchBounds(
14221423
LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first});
14231424
LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second});
14241425
}
1426+
1427+
/// Cost of a memory operation, with a dedicated path for i8 vectors.
///
/// For a load or store whose type is a fixed-width vector of i8, the cost is
/// the number of load/store vector registers (each
/// getLoadStoreVecRegBitWidth(AddressSpace) bits wide) needed to cover the
/// whole vector, i.e. ceil(size-in-bits / register-width). Every other
/// opcode/type combination -- including scalable vectors, whose size is not a
/// compile-time constant -- is forwarded unchanged to the base implementation.
InstructionCost GCNTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                            Align Alignment,
                                            unsigned AddressSpace,
                                            TTI::TargetCostKind CostKind,
                                            TTI::OperandValueInfo OpInfo,
                                            const Instruction *I) const {
  const bool IsLoadOrStore =
      Opcode == Instruction::Load || Opcode == Instruction::Store;

  // Only fixed-width vectors have a known bit size; restricting the match to
  // FixedVectorType keeps scalable vectors on the generic path instead of
  // asking them for a fixed size.
  if (auto *VecTy = dyn_cast<FixedVectorType>(Src)) {
    if (IsLoadOrStore && VecTy->getElementType()->isIntegerTy(8)) {
      const uint64_t SizeInBits = DL.getTypeSizeInBits(VecTy).getFixedValue();
      const unsigned RegBitWidth = getLoadStoreVecRegBitWidth(AddressSpace);
      // Ceiling division: number of registers needed to hold SizeInBits.
      return ((SizeInBits - 1) / RegBitWidth) + 1;
    }
  }

  return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
                                OpInfo, I);
}
1444+
1445+
/// Number of parts a value of type \p Tp is split into.
///
/// For a fixed-width vector of i8 the elements are counted in groups of four,
/// so the result is ceil(NumElements / 4). Any other type -- including
/// scalable vectors, which have no fixed element count -- is forwarded to the
/// base implementation.
unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const {
  // Matching FixedVectorType (rather than VectorType) guarantees the element
  // count is a plain integer, so a scalable i8 vector falls through to the
  // base implementation instead of tripping a fixed-value assertion.
  if (auto *VecTy = dyn_cast<FixedVectorType>(Tp)) {
    if (VecTy->getElementType()->isIntegerTy(8)) {
      const unsigned NumElts = VecTy->getNumElements();
      // Ceiling division by four i8 elements per part.
      return ((NumElts - 1) / 4) + 1;
    }
  }

  return BaseT::getNumberOfParts(Tp);
}

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

+14
Original file line numberDiff line numberDiff line change
@@ -290,6 +290,20 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
290290
void collectKernelLaunchBounds(
291291
const Function &F,
292292
SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const override;
293+
294+
/// Account for loads of i8 vector types to have reduced cost. For
295+
/// example, loading four i8 values costs the same as loading
296+
/// a single i32 value.
297+
InstructionCost getMemoryOpCost(
298+
unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
299+
TTI::TargetCostKind CostKind,
300+
TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
301+
const Instruction *I = nullptr) const override;
302+
303+
/// When counting parts on AMD GPUs, account for i8s being grouped
304+
/// together under a single i32 value. Otherwise fall back to base
305+
/// implementation.
306+
unsigned getNumberOfParts(Type *Tp) const override;
293307
};
294308

295309
} // end namespace llvm

0 commit comments

Comments
 (0)