Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 69 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1042,6 +1042,25 @@ InstructionCost GCNTTIImpl::getVectorInstrCost(
VIC);
}

// Building a packed <2 x float> for a v_pk_*_f32 source is not free: the
// two lanes must occupy an aligned VGPR pair, so a statically-indexed
// insert that synthesizes such a pair costs ~1 v_mov_b32 to align the lane.
// Charging it keeps the SLP vectorizer honest about manufacturing pairs
// from non-adjacent scalars; without it SLP over-vectorizes and inflates
// register pressure.
//
// Restricted to f32: at 32-bit width the only packed VOP3P ALU ops are
// v_pk_{add,mul,fma}_f32 - there is no packed 32-bit integer op - so a
// <2 x i32> has no pair-alignment consumer and must not be taxed. Scoped to
// the gfx9 generation (gfx90a/gfx94x/gfx950); newer packed-FP32 targets
// (gfx12) are left unchanged pending separate evaluation.
if (Opcode == Instruction::InsertElement && EltSize == 32 &&
ST->hasPackedFP32Ops() &&
ST->getGeneration() == AMDGPUSubtarget::GFX9)
if (auto *VecTy = dyn_cast<FixedVectorType>(ValTy))
if (VecTy->getNumElements() == 2 && VecTy->getElementType()->isFloatTy())
return 1;

// Extracts are just reads of a subregister, so are free. Inserts are
// considered free because we don't want to have any cost for scalarizing
// operations, and we don't have to copy into a different register class.
Expand Down Expand Up @@ -1344,6 +1363,56 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);

unsigned ScalarSize = DL.getTypeSizeInBits(SrcTy->getElementType());

// Packed FP32 (gfx9, gfx90a+): two FP32 elements form a v_pk_*_f32 source
// that must live in an aligned VGPR pair, so each non-identity element in the
// result costs ~1 v_mov_b32 to align into its pair slot. The 16/8-bit branch
// below relies on subword packing (multiple elements per VGPR) and does not
// apply to FP32, so FP32 is handled separately here. The intent is to make
// the SLP vectorizer cost-honest about synthesizing pairs from non-adjacent
// scalars.
//
// f32-only and gfx9-only for the same reasons as in getVectorInstrCost: there
// is no packed 32-bit integer op (so <2 x i32> shuffles are not taxed), and
// newer packed-FP32 targets (gfx12) are left unchanged pending evaluation.
if (ScalarSize == 32 && SrcTy->getElementType()->isFloatTy() &&
ST->hasPackedFP32Ops() &&
ST->getGeneration() == AMDGPUSubtarget::GFX9) {
auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy);
auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcTy);
if (DstVecTy && SrcVecTy && DstVecTy->getNumElements() >= 2) {
unsigned NumDstElts = DstVecTy->getNumElements();
unsigned NumSrcElts = SrcVecTy->getNumElements();
InstructionCost PerMove = 1;
switch (Kind) {
case TTI::SK_Broadcast:
// Splat scalar to N FP32 lanes: 1 v_mov per non-source lane.
return PerMove * (NumDstElts - 1);
case TTI::SK_Reverse:
case TTI::SK_PermuteSingleSrc:
case TTI::SK_PermuteTwoSrc: {
if (Mask.empty())
return PerMove * NumDstElts;
unsigned NumMoves = 0;
for (unsigned I = 0; I < Mask.size(); ++I) {
int SrcIdx = Mask[I];
if (SrcIdx == -1)
continue;
unsigned EffectiveSrc = SrcIdx < (int)NumSrcElts
? unsigned(SrcIdx)
: unsigned(SrcIdx) - NumSrcElts;
if (EffectiveSrc == I)
continue;
++NumMoves;
}
return PerMove * NumMoves;
}
default:
break;
}
}
}

if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
(ScalarSize == 16 || ScalarSize == 8)) {
// Larger vector widths may require additional instructions, but are
Expand Down
85 changes: 63 additions & 22 deletions llvm/test/Analysis/CostModel/AMDGPU/maximum.ll
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SLOWF64 %s
; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx950 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,GFX950-SIZE %s
; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,GFX9-SIZE,GFX90A-SIZE %s
; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,GFX9-SIZE %s
; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,GFX9-SIZE,GFX900-SIZE %s
; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,SLOW-SIZE %s

define void @maximum_f16() {
Expand Down Expand Up @@ -155,30 +155,75 @@ define void @maximum_bf16() {
define void @maximum_f32() {
; GFX950-FASTF64-LABEL: 'maximum_f32'
; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.maximum.f32(float undef, float undef)
; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.maximum.v2f32(<2 x float> undef, <2 x float> undef)
; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.maximum.v2f32(<2 x float> undef, <2 x float> undef)
; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.maximum.v3f32(<3 x float> undef, <3 x float> undef)
; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.maximum.v4f32(<4 x float> undef, <4 x float> undef)
; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.maximum.v8f32(<8 x float> undef, <8 x float> undef)
; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f32 = call <16 x float> @llvm.maximum.v16f32(<16 x float> undef, <16 x float> undef)
; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; ALL-LABEL: 'maximum_f32'
; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f32 = call float @llvm.maximum.f32(float undef, float undef)
; ALL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v2f32 = call <2 x float> @llvm.maximum.v2f32(<2 x float> undef, <2 x float> undef)
; ALL-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v3f32 = call <3 x float> @llvm.maximum.v3f32(<3 x float> undef, <3 x float> undef)
; ALL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v4f32 = call <4 x float> @llvm.maximum.v4f32(<4 x float> undef, <4 x float> undef)
; ALL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v8f32 = call <8 x float> @llvm.maximum.v8f32(<8 x float> undef, <8 x float> undef)
; ALL-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v16f32 = call <16 x float> @llvm.maximum.v16f32(<16 x float> undef, <16 x float> undef)
; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
; GFX90A-FASTF64-LABEL: 'maximum_f32'
; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f32 = call float @llvm.maximum.f32(float undef, float undef)
; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v2f32 = call <2 x float> @llvm.maximum.v2f32(<2 x float> undef, <2 x float> undef)
; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v3f32 = call <3 x float> @llvm.maximum.v3f32(<3 x float> undef, <3 x float> undef)
; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v4f32 = call <4 x float> @llvm.maximum.v4f32(<4 x float> undef, <4 x float> undef)
; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v8f32 = call <8 x float> @llvm.maximum.v8f32(<8 x float> undef, <8 x float> undef)
; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v16f32 = call <16 x float> @llvm.maximum.v16f32(<16 x float> undef, <16 x float> undef)
; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; SIZE-LABEL: 'maximum_f32'
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.maximum.f32(float undef, float undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.maximum.v2f32(<2 x float> undef, <2 x float> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.maximum.v3f32(<3 x float> undef, <3 x float> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.maximum.v4f32(<4 x float> undef, <4 x float> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.maximum.v8f32(<8 x float> undef, <8 x float> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f32 = call <16 x float> @llvm.maximum.v16f32(<16 x float> undef, <16 x float> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
; FASTF64-LABEL: 'maximum_f32'
; FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f32 = call float @llvm.maximum.f32(float undef, float undef)
; FASTF64-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v2f32 = call <2 x float> @llvm.maximum.v2f32(<2 x float> undef, <2 x float> undef)
; FASTF64-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v3f32 = call <3 x float> @llvm.maximum.v3f32(<3 x float> undef, <3 x float> undef)
; FASTF64-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v4f32 = call <4 x float> @llvm.maximum.v4f32(<4 x float> undef, <4 x float> undef)
; FASTF64-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v8f32 = call <8 x float> @llvm.maximum.v8f32(<8 x float> undef, <8 x float> undef)
; FASTF64-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v16f32 = call <16 x float> @llvm.maximum.v16f32(<16 x float> undef, <16 x float> undef)
; FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; SLOWF64-LABEL: 'maximum_f32'
; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f32 = call float @llvm.maximum.f32(float undef, float undef)
; SLOWF64-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v2f32 = call <2 x float> @llvm.maximum.v2f32(<2 x float> undef, <2 x float> undef)
; SLOWF64-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v3f32 = call <3 x float> @llvm.maximum.v3f32(<3 x float> undef, <3 x float> undef)
; SLOWF64-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v4f32 = call <4 x float> @llvm.maximum.v4f32(<4 x float> undef, <4 x float> undef)
; SLOWF64-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v8f32 = call <8 x float> @llvm.maximum.v8f32(<8 x float> undef, <8 x float> undef)
; SLOWF64-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v16f32 = call <16 x float> @llvm.maximum.v16f32(<16 x float> undef, <16 x float> undef)
; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; GFX950-SIZE-LABEL: 'maximum_f32'
; GFX950-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.maximum.f32(float undef, float undef)
; GFX950-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.maximum.v2f32(<2 x float> undef, <2 x float> undef)
; GFX950-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.maximum.v3f32(<3 x float> undef, <3 x float> undef)
; GFX950-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.maximum.v4f32(<4 x float> undef, <4 x float> undef)
; GFX950-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.maximum.v8f32(<8 x float> undef, <8 x float> undef)
; GFX950-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f32 = call <16 x float> @llvm.maximum.v16f32(<16 x float> undef, <16 x float> undef)
; GFX950-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; GFX90A-SIZE-LABEL: 'maximum_f32'
; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.maximum.f32(float undef, float undef)
; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.maximum.v2f32(<2 x float> undef, <2 x float> undef)
; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.maximum.v3f32(<3 x float> undef, <3 x float> undef)
; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.maximum.v4f32(<4 x float> undef, <4 x float> undef)
; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.maximum.v8f32(<8 x float> undef, <8 x float> undef)
; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f32 = call <16 x float> @llvm.maximum.v16f32(<16 x float> undef, <16 x float> undef)
; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; GFX900-SIZE-LABEL: 'maximum_f32'
; GFX900-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.maximum.f32(float undef, float undef)
; GFX900-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.maximum.v2f32(<2 x float> undef, <2 x float> undef)
; GFX900-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.maximum.v3f32(<3 x float> undef, <3 x float> undef)
; GFX900-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.maximum.v4f32(<4 x float> undef, <4 x float> undef)
; GFX900-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.maximum.v8f32(<8 x float> undef, <8 x float> undef)
; GFX900-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f32 = call <16 x float> @llvm.maximum.v16f32(<16 x float> undef, <16 x float> undef)
; GFX900-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; SLOW-SIZE-LABEL: 'maximum_f32'
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.maximum.f32(float undef, float undef)
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.maximum.v2f32(<2 x float> undef, <2 x float> undef)
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.maximum.v3f32(<3 x float> undef, <3 x float> undef)
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.maximum.v4f32(<4 x float> undef, <4 x float> undef)
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.maximum.v8f32(<8 x float> undef, <8 x float> undef)
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f32 = call <16 x float> @llvm.maximum.v16f32(<16 x float> undef, <16 x float> undef)
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
%f32 = call float @llvm.maximum.f32(float undef, float undef)
%v2f32 = call <2 x float> @llvm.maximum.v2f32(<2 x float> undef, <2 x float> undef)
Expand Down Expand Up @@ -225,7 +270,3 @@ define void @maximum_f64() {
%v16f64 = call <16 x double> @llvm.maximum.v16f64(<16 x double> undef, <16 x double> undef)
ret void
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; FASTF64: {{.*}}
; GFX90A-FASTF64: {{.*}}
; GFX90A-SIZE: {{.*}}
Loading
Loading