AMDGPU: Implement isExtractVecEltCheap #122460
@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes:

Once again we have excessive TLI hooks with bad defaults. Permit this for 32-bit element vectors, which are just use-different-register. We should permit 16-bit vectors as cheap with legal packed instructions, but I see some mixed improvements and regressions that need investigation.

Full diff: https://github.com/llvm/llvm-project/pull/122460.diff

5 Files Affected:
- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
- llvm/lib/Target/AMDGPU/SIISelLowering.h
- llvm/test/CodeGen/AMDGPU/mad-mix.ll
- llvm/test/CodeGen/AMDGPU/packed-fp32.ll
- llvm/test/CodeGen/AMDGPU/trunc-combine.ll
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 529d9ba17d4f60..3fa7add8c6f268 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1949,6 +1949,13 @@ bool SITargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
return Index == 0;
}
+bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
+ // TODO: This should be more aggressive, particular for 16-bit element
+ // vectors. However there are some mixed improvements and regressions.
+ EVT EltTy = VT.getVectorElementType();
+ return EltTy.getSizeInBits() % 32 == 0;
+}
+
bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
if (Subtarget->has16BitInsts() && VT == MVT::i16) {
switch (Op) {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 5c215f76552d9c..bbb96d9115a0a9 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -365,6 +365,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
unsigned Index) const override;
+ bool isExtractVecEltCheap(EVT VT, unsigned Index) const override;
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override;
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix.ll b/llvm/test/CodeGen/AMDGPU/mad-mix.ll
index b520dd1060ec8c..30e3bc3ba5da85 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix.ll
@@ -385,17 +385,15 @@ define <2 x float> @v_mad_mix_v2f32_shuffle(<2 x half> %src0, <2 x half> %src1,
; SDAG-CI: ; %bb.0:
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v4, v5
; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v5, v1
+; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v4
; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v5
-; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v0
-; SDAG-CI-NEXT: v_mad_f32 v0, v4, v2, v1
-; SDAG-CI-NEXT: v_mac_f32_e32 v1, v5, v3
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v0
+; SDAG-CI-NEXT: v_mad_f32 v0, v1, v2, v5
+; SDAG-CI-NEXT: v_mad_f32 v1, v4, v3, v5
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-CI-LABEL: v_mad_mix_v2f32_shuffle:
diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index 6b7eff316fe95b..0833dada43e4d5 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -549,17 +549,19 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}fadd_fadd_fsub:
+; GCN-LABEL: {{^}}fadd_fadd_fsub_0:
; GFX900: v_add_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, 0
; GFX900: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
-; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], 0 op_sel_hi:[1,0]{{$}}
-; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0 op_sel_hi:[1,0]{{$}}
+
+; PACKED-SDAG: v_add_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, 0
+; PACKED-SDAG: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+
; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
-define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg) {
+define amdgpu_kernel void @fadd_fadd_fsub_0(<2 x float> %arg) {
bb:
%i12 = fadd <2 x float> zeroinitializer, %arg
- %shift8 = shufflevector <2 x float> %i12, <2 x float> undef, <2 x i32> <i32 1, i32 undef>
+ %shift8 = shufflevector <2 x float> %i12, <2 x float> poison, <2 x i32> <i32 1, i32 poison>
%i13 = fadd <2 x float> zeroinitializer, %shift8
%i14 = shufflevector <2 x float> %arg, <2 x float> %i13, <2 x i32> <i32 0, i32 2>
%i15 = fsub <2 x float> %i14, zeroinitializer
@@ -567,6 +569,26 @@ bb:
ret void
}
+; GCN-LABEL: {{^}}fadd_fadd_fsub:
+; GFX900: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+; GFX900: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+
+; PACKED-SDAG: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel_hi:[1,0]{{$}}
+
+; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
+; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
+define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, ptr addrspace(1) %ptr) {
+bb:
+ %i12 = fadd <2 x float> %arg, %arg1
+ %shift8 = shufflevector <2 x float> %i12, <2 x float> poison, <2 x i32> <i32 1, i32 poison>
+ %i13 = fadd <2 x float> %arg1, %shift8
+ %i14 = shufflevector <2 x float> %arg, <2 x float> %i13, <2 x i32> <i32 0, i32 2>
+ %i15 = fsub <2 x float> %i14, %arg1
+ store <2 x float> %i15, ptr addrspace(1) %ptr
+ ret void
+}
+
; GCN-LABEL: {{^}}fadd_shuffle_v4:
; GFX900-COUNT-4: v_add_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; PACKED-SDAG-COUNT-2: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel_hi:[1,0]{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
index 02e30b6c68e994..af2af23640cee5 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
@@ -184,6 +184,7 @@ define <2 x i16> @vector_trunc_high_bits_undef_lshr_rhs_alignbit_regression(i32
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_lshr_b32_e32 v0, 16, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: vector_trunc_high_bits_undef_lshr_rhs_alignbit_regression:
@@ -300,14 +301,15 @@ define <2 x i16> @vector_trunc_high_bits_undef_or_lhs_alignbit_regression(i32 %a
; SI-LABEL: vector_trunc_high_bits_undef_or_lhs_alignbit_regression:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_or_b32_e32 v0, 0xffff0011, v0
-; SI-NEXT: v_mov_b32_e32 v1, 0xffff
+; SI-NEXT: v_or_b32_e32 v0, 17, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: vector_trunc_high_bits_undef_or_lhs_alignbit_regression:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_or_b32_e32 v0, 0xffff0011, v0
+; VI-NEXT: v_or_b32_e32 v0, 17, v0
; VI-NEXT: s_setpc_b64 s[30:31]
%undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
%lshr = or <2 x i32> %undef.hi.elt, splat (i32 17)
@@ -368,7 +370,6 @@ define <2 x i16> @vector_trunc_high_bits_undef_mul_lhs_alignbit_regression(i32 %
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mul_lo_u32 v0, v0, 18
-; VI-NEXT: v_and_b32_e32 v0, 0xfffe, v0
; VI-NEXT: s_setpc_b64 s[30:31]
%undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
%lshr = mul <2 x i32> %undef.hi.elt, splat (i32 18)
LGTM
// TODO: This should be more aggressive, particular for 16-bit element
// vectors. However there are some mixed improvements and regressions.
EVT EltTy = VT.getVectorElementType();
return EltTy.getSizeInBits() % 32 == 0;
@broxigarchen @Sisyph for true16 we should aim to return EltTy.getSizeInBits() % 16 == 0 here.
Even without true16 it should be better (maybe only even aligned cases?)
Yeah, without true16 EltTy.getSizeInBits() * Index % 32 == 0 would make sense to me.
Yes I would think EltTy.getSizeInBits() * Index % 16 == 0 for True16 would be the way to go.
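For reference, a minimal sketch of the relaxed predicate being suggested in the review above; this is not what the patch implements, and the use of Subtarget->useRealTrue16Insts() as the true16 check is an assumption for illustration only:

bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
  // Consider the extract cheap when the selected element begins on a boundary
  // the hardware can address directly: 16 bits with true16 instructions,
  // otherwise 32 bits. For elements whose size is a multiple of 32 bits this
  // reduces to the patch's condition, since every index is then aligned.
  EVT EltTy = VT.getVectorElementType();
  uint64_t BitOffset = EltTy.getFixedSizeInBits() * Index;
  unsigned Boundary = Subtarget->useRealTrue16Insts() ? 16 : 32;
  return BitOffset % Boundary == 0;
}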
Once again we have excessive TLI hooks with bad defaults. Permit this
for 32-bit element vectors, which are just use-different-register.
We should permit 16-bit vectors as cheap with legal packed instructions,
but I see some mixed improvements and regressions that need investigation.