@@ -344,9 +344,10 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
 unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
   if (Opcode == Instruction::Load || Opcode == Instruction::Store)
     return 32 * 4 / ElemWidth;
-  return (ElemWidth == 16 && ST->has16BitInsts())      ? 2
-         : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
-                                                       : 1;
+  return ElemWidth == 8                                ? 4
+         : (ElemWidth == 16 && ST->has16BitInsts())    ? 2
+         : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
+                                                       : 1;
 }
 
 unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
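
A minimal standalone sketch of what the first hunk changes, not part of the patch itself: for non-memory opcodes, 8-bit elements now report a maximum VF of 4 (four i8 lanes per 32-bit VGPR lane). The helper name and the subtarget assumptions (16-bit instructions available, no packed FP32) are hypothetical and exist only for illustration.

```cpp
#include <cassert>

// Mirrors the new getMaximumVF mapping for non-load/store opcodes.
static unsigned maxVFForElemWidth(unsigned ElemWidth,
                                  bool Has16BitInsts = true,
                                  bool HasPackedFP32 = false) {
  return ElemWidth == 8                       ? 4 // new: four i8 lanes
         : (ElemWidth == 16 && Has16BitInsts) ? 2
         : (ElemWidth == 32 && HasPackedFP32) ? 2
                                              : 1;
}

int main() {
  assert(maxVFForElemWidth(8) == 4);  // added by this patch
  assert(maxVFForElemWidth(16) == 2); // unchanged behavior
  assert(maxVFForElemWidth(32) == 1); // no packed FP32 on the assumed subtarget
  return 0;
}
```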
@@ -1422,3 +1423,31 @@ void GCNTTIImpl::collectKernelLaunchBounds(
   LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first});
   LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second});
 }
+
+InstructionCost GCNTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+                                            Align Alignment,
+                                            unsigned AddressSpace,
+                                            TTI::TargetCostKind CostKind,
+                                            TTI::OperandValueInfo OpInfo,
+                                            const Instruction *I) const {
+  if (VectorType *VecTy = dyn_cast<VectorType>(Src))
+    if ((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
+        VecTy->getElementType() ==
+            IntegerType::getInt8Ty(VecTy->getContext())) {
+      return ((DL.getTypeSizeInBits(VecTy) - 1) /
+              getLoadStoreVecRegBitWidth(AddressSpace)) +
+             1;
+    }
+  return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
+                                OpInfo, I);
+}
+
+unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const {
+  if (VectorType *VecTy = dyn_cast<VectorType>(Tp))
+    if (VecTy->getElementType() ==
+        IntegerType::getInt8Ty(VecTy->getContext())) {
+      unsigned ElementCount = VecTy->getElementCount().getFixedValue();
+      return ((ElementCount - 1) / 4) + 1;
+    }
+  return BaseT::getNumberOfParts(Tp);
+}
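
A standalone sketch of the arithmetic in the second hunk, not part of the patch: both new overrides are ceiling divisions, rounding a byte-vector's size up to whole load/store registers and its element count up to groups of four. The helper names are hypothetical, and the 128-bit register width is an assumption for illustration; the real value comes from getLoadStoreVecRegBitWidth(AddressSpace).

```cpp
#include <cassert>
#include <cstdint>

// Registers needed to move an i8 vector of VecBits bits:
// ceil(VecBits / VecRegBits).
static unsigned i8MemoryCost(uint64_t VecBits, unsigned VecRegBits = 128) {
  return static_cast<unsigned>((VecBits - 1) / VecRegBits) + 1;
}

// Parts an i8 vector is split into: ceil(NumElts / 4), i.e. four bytes per
// 32-bit lane.
static unsigned i8NumberOfParts(unsigned NumElts) {
  return ((NumElts - 1) / 4) + 1;
}

int main() {
  assert(i8MemoryCost(16 * 8) == 1);  // <16 x i8> fits one 128-bit register
  assert(i8MemoryCost(32 * 8) == 2);  // <32 x i8> needs two
  assert(i8NumberOfParts(16) == 4);   // v16i8 -> four v4i8 parts
  assert(i8NumberOfParts(3) == 1);    // small vectors still count as one part
  return 0;
}
```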