ROCm · michaelselehov · May 29, 2026
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1042,6 +1042,25 @@ InstructionCost GCNTTIImpl::getVectorInstrCost(
                                        VIC);
     }
 
+    // Building a packed <2 x float> for a v_pk_*_f32 source is not free: the
+    // two lanes must occupy an aligned VGPR pair, so an insert synthesizing
+    // such a pair costs ~1 v_mov_b32 to align the lane. Charging it keeps SLP
+    // honest about building pairs from non-adjacent scalars; without it SLP
+    // over-vectorizes and inflates register pressure. NOTE(review): Index is
+    // not tested below, so dynamic-index inserts also hit this - confirm.
+    //
+    // Restricted to f32: at 32-bit width the only packed VOP3P ALU ops are
+    // v_pk_{add,mul,fma}_f32 - there is no packed 32-bit integer op - so a
+    // <2 x i32> has no pair-alignment consumer and must not be taxed. Scoped to
+    // the gfx9 generation (gfx90a/gfx94x/gfx950); newer packed-FP32 targets
+    // (gfx12) are left unchanged pending separate evaluation.
+    if (Opcode == Instruction::InsertElement && EltSize == 32 &&
+        ST->hasPackedFP32Ops() &&
+        ST->getGeneration() == AMDGPUSubtarget::GFX9)
+      if (auto *VecTy = dyn_cast<FixedVectorType>(ValTy))
+        if (VecTy->getNumElements() == 2 && VecTy->getElementType()->isFloatTy())
+          return 1;
+
     // Extracts are just reads of a subregister, so are free. Inserts are
     // considered free because we don't want to have any cost for scalarizing
     // operations, and we don't have to copy into a different register class.
@@ -1344,6 +1363,56 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
   Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
 
   unsigned ScalarSize = DL.getTypeSizeInBits(SrcTy->getElementType());
+
+  // Packed FP32 (gfx9 generation: gfx90a/gfx94x/gfx950): two FP32 elements
+  // form a v_pk_*_f32 source that must live in an aligned VGPR pair, so each
+  // result lane not taken from the same lane index of a source costs ~1
+  // v_mov_b32 to align into its pair slot. The 16/8-bit branch below relies on
+  // subword packing (multiple elements per VGPR) and does not apply to FP32,
+  // so FP32 is handled separately here. The intent is to make the SLP
+  // vectorizer cost-honest about synthesizing pairs from non-adjacent scalars.
+  //
+  // f32-only and gfx9-only for the same reasons as in getVectorInstrCost: there
+  // is no packed 32-bit integer op (so <2 x i32> shuffles are not taxed), and
+  // newer packed-FP32 targets (gfx12) are left unchanged pending evaluation.
+  if (ScalarSize == 32 && SrcTy->getElementType()->isFloatTy() &&
+      ST->hasPackedFP32Ops() &&
+      ST->getGeneration() == AMDGPUSubtarget::GFX9) {
+    auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy);
+    auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcTy);
+    if (DstVecTy && SrcVecTy && DstVecTy->getNumElements() >= 2) {
+      unsigned NumDstElts = DstVecTy->getNumElements();
+      unsigned NumSrcElts = SrcVecTy->getNumElements();
+      InstructionCost PerMove = 1;
+      switch (Kind) {
+      case TTI::SK_Broadcast:
+        // Splat scalar to N FP32 lanes: 1 v_mov per non-source lane.
+        return PerMove * (NumDstElts - 1);
+      case TTI::SK_Reverse:
+      case TTI::SK_PermuteSingleSrc:
+      case TTI::SK_PermuteTwoSrc: {
+        if (Mask.empty())
+          return PerMove * NumDstElts;
+        unsigned NumMoves = 0;
+        for (unsigned I = 0; I < Mask.size(); ++I) {
+          int SrcIdx = Mask[I];
+          if (SrcIdx == -1)
+            continue;
+          unsigned EffectiveSrc = SrcIdx < (int)NumSrcElts
+                                      ? unsigned(SrcIdx)
+                                      : unsigned(SrcIdx) - NumSrcElts;
+          if (EffectiveSrc == I)
+            continue;
+          ++NumMoves;
+        }
+        return PerMove * NumMoves;
+      }
+      default:
+        break;
+      }
+    }
+  }
+
   if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
       (ScalarSize == 16 || ScalarSize == 8)) {
     // Larger vector widths may require additional instructions, but are

diff --git a/llvm/test/Analysis/CostModel/AMDGPU/maximum.ll b/llvm/test/Analysis/CostModel/AMDGPU/maximum.ll
@@ -5,7 +5,7 @@
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SLOWF64 %s
 ; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx950 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,GFX950-SIZE %s
 ; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,GFX9-SIZE,GFX90A-SIZE %s
-; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,GFX9-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,GFX9-SIZE,GFX900-SIZE %s
 ; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,SLOW-SIZE %s
 
 define void @maximum_f16() {
@@ -155,30 +155,75 @@ define void @maximum_bf16() {
 define void @maximum_f32() {
 ; GFX950-FASTF64-LABEL: 'maximum_f32'
 ; GFX950-FASTF64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.maximum.f32(float undef, float undef)
-; GFX950-FASTF64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.maximum.v2f32(<2 x float> undef, <2 x float> undef)
+; GFX950-FASTF64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.maximum.v2f32(<2 x float> undef, <2 x float> undef)
 ; GFX950-FASTF64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.maximum.v3f32(<3 x float> undef, <3 x float> undef)
 ; GFX950-FASTF64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.maximum.v4f32(<4 x float> undef, <4 x float> undef)
 ; GFX950-FASTF64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.maximum.v8f32(<8 x float> undef, <8 x float> undef)
 ; GFX950-FASTF64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f32 = call <16 x float> @llvm.maximum.v16f32(<16 x float> undef, <16 x float> undef)
 ; GFX950-FASTF64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
-; ALL-LABEL: 'maximum_f32'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %f32 = call float @llvm.maximum.f32(float undef, float undef)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v2f32 = call <2 x float> @llvm.maximum.v2f32(<2 x float> undef, <2 x float> undef)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %v3f32 = call <3 x float> @llvm.maximum.v3f32(<3 x float> undef, <3 x float> undef)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v4f32 = call <4 x float> @llvm.maximum.v4f32(<4 x float> undef, <4 x float> undef)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %v8f32 = call <8 x float> @llvm.maximum.v8f32(<8 x float> undef, <8 x float> undef)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v16f32 = call <16 x float> @llvm.maximum.v16f32(<16 x float> undef, <16 x float> undef)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+; GFX90A-FASTF64-LABEL: 'maximum_f32'
+; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %f32 = call float @llvm.maximum.f32(float undef, float undef)
+; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %v2f32 = call <2 x float> @llvm.maximum.v2f32(<2 x float> undef, <2 x float> undef)
+; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %v3f32 = call <3 x float> @llvm.maximum.v3f32(<3 x float> undef, <3 x float> undef)
+; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v4f32 = call <4 x float> @llvm.maximum.v4f32(<4 x float> undef, <4 x float> undef)
+; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %v8f32 = call <8 x float> @llvm.maximum.v8f32(<8 x float> undef, <8 x float> undef)
+; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v16f32 = call <16 x float> @llvm.maximum.v16f32(<16 x float> undef, <16 x float> undef)
+; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
-; SIZE-LABEL: 'maximum_f32'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.maximum.f32(float undef, float undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.maximum.v2f32(<2 x float> undef, <2 x float> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.maximum.v3f32(<3 x float> undef, <3 x float> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.maximum.v4f32(<4 x float> undef, <4 x float> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.maximum.v8f32(<8 x float> undef, <8 x float> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f32 = call <16 x float> @llvm.maximum.v16f32(<16 x float> undef, <16 x float> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; FASTF64-LABEL: 'maximum_f32'
+; FASTF64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %f32 = call float @llvm.maximum.f32(float undef, float undef)
+; FASTF64-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v2f32 = call <2 x float> @llvm.maximum.v2f32(<2 x float> undef, <2 x float> undef)
+; FASTF64-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %v3f32 = call <3 x float> @llvm.maximum.v3f32(<3 x float> undef, <3 x float> undef)
+; FASTF64-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v4f32 = call <4 x float> @llvm.maximum.v4f32(<4 x float> undef, <4 x float> undef)
+; FASTF64-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %v8f32 = call <8 x float> @llvm.maximum.v8f32(<8 x float> undef, <8 x float> undef)
+; FASTF64-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v16f32 = call <16 x float> @llvm.maximum.v16f32(<16 x float> undef, <16 x float> undef)
+; FASTF64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SLOWF64-LABEL: 'maximum_f32'
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %f32 = call float @llvm.maximum.f32(float undef, float undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v2f32 = call <2 x float> @llvm.maximum.v2f32(<2 x float> undef, <2 x float> undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %v3f32 = call <3 x float> @llvm.maximum.v3f32(<3 x float> undef, <3 x float> undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v4f32 = call <4 x float> @llvm.maximum.v4f32(<4 x float> undef, <4 x float> undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %v8f32 = call <8 x float> @llvm.maximum.v8f32(<8 x float> undef, <8 x float> undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v16f32 = call <16 x float> @llvm.maximum.v16f32(<16 x float> undef, <16 x float> undef)
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX950-SIZE-LABEL: 'maximum_f32'
+; GFX950-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.maximum.f32(float undef, float undef)
+; GFX950-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.maximum.v2f32(<2 x float> undef, <2 x float> undef)
+; GFX950-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.maximum.v3f32(<3 x float> undef, <3 x float> undef)
+; GFX950-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.maximum.v4f32(<4 x float> undef, <4 x float> undef)
+; GFX950-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.maximum.v8f32(<8 x float> undef, <8 x float> undef)
+; GFX950-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f32 = call <16 x float> @llvm.maximum.v16f32(<16 x float> undef, <16 x float> undef)
+; GFX950-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX90A-SIZE-LABEL: 'maximum_f32'
+; GFX90A-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.maximum.f32(float undef, float undef)
+; GFX90A-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.maximum.v2f32(<2 x float> undef, <2 x float> undef)
+; GFX90A-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.maximum.v3f32(<3 x float> undef, <3 x float> undef)
+; GFX90A-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.maximum.v4f32(<4 x float> undef, <4 x float> undef)
+; GFX90A-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.maximum.v8f32(<8 x float> undef, <8 x float> undef)
+; GFX90A-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f32 = call <16 x float> @llvm.maximum.v16f32(<16 x float> undef, <16 x float> undef)
+; GFX90A-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX900-SIZE-LABEL: 'maximum_f32'
+; GFX900-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.maximum.f32(float undef, float undef)
+; GFX900-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.maximum.v2f32(<2 x float> undef, <2 x float> undef)
+; GFX900-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.maximum.v3f32(<3 x float> undef, <3 x float> undef)
+; GFX900-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.maximum.v4f32(<4 x float> undef, <4 x float> undef)
+; GFX900-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.maximum.v8f32(<8 x float> undef, <8 x float> undef)
+; GFX900-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f32 = call <16 x float> @llvm.maximum.v16f32(<16 x float> undef, <16 x float> undef)
+; GFX900-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SLOW-SIZE-LABEL: 'maximum_f32'
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.maximum.f32(float undef, float undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.maximum.v2f32(<2 x float> undef, <2 x float> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.maximum.v3f32(<3 x float> undef, <3 x float> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.maximum.v4f32(<4 x float> undef, <4 x float> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.maximum.v8f32(<8 x float> undef, <8 x float> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f32 = call <16 x float> @llvm.maximum.v16f32(<16 x float> undef, <16 x float> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %f32 = call float @llvm.maximum.f32(float undef, float undef)
   %v2f32 = call <2 x float> @llvm.maximum.v2f32(<2 x float> undef, <2 x float> undef)
@@ -225,7 +270,3 @@ define void @maximum_f64() {
   %v16f64 = call <16 x double> @llvm.maximum.v16f64(<16 x double> undef, <16 x double> undef)
   ret void
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; FASTF64: {{.*}}
-; GFX90A-FASTF64: {{.*}}
-; GFX90A-SIZE: {{.*}}