AMDGPU: Implement isExtractVecEltCheap #122460
@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes:

Once again we have excessive TLI hooks with bad defaults. Permit this for 32-bit element vectors, which are just use-different-register. We should permit 16-bit vectors as cheap with legal packed instructions, but I see some mixed improvements and regressions that need investigation.

Full diff: https://github.com/llvm/llvm-project/pull/122460.diff

5 Files Affected:
- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
- llvm/lib/Target/AMDGPU/SIISelLowering.h
- llvm/test/CodeGen/AMDGPU/mad-mix.ll
- llvm/test/CodeGen/AMDGPU/packed-fp32.ll
- llvm/test/CodeGen/AMDGPU/trunc-combine.ll
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 529d9ba17d4f60..3fa7add8c6f268 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1949,6 +1949,13 @@ bool SITargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
return Index == 0;
}
+bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
+ // TODO: This should be more aggressive, particular for 16-bit element
+ // vectors. However there are some mixed improvements and regressions.
+ EVT EltTy = VT.getVectorElementType();
+ return EltTy.getSizeInBits() % 32 == 0;
+}
+
bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
if (Subtarget->has16BitInsts() && VT == MVT::i16) {
switch (Op) {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 5c215f76552d9c..bbb96d9115a0a9 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -365,6 +365,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
unsigned Index) const override;
+ bool isExtractVecEltCheap(EVT VT, unsigned Index) const override;
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override;
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix.ll b/llvm/test/CodeGen/AMDGPU/mad-mix.ll
index b520dd1060ec8c..30e3bc3ba5da85 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix.ll
@@ -385,17 +385,15 @@ define <2 x float> @v_mad_mix_v2f32_shuffle(<2 x half> %src0, <2 x half> %src1,
; SDAG-CI: ; %bb.0:
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v4, v5
; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v5, v1
+; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v4
; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v5
-; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v0
-; SDAG-CI-NEXT: v_mad_f32 v0, v4, v2, v1
-; SDAG-CI-NEXT: v_mac_f32_e32 v1, v5, v3
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v0
+; SDAG-CI-NEXT: v_mad_f32 v0, v1, v2, v5
+; SDAG-CI-NEXT: v_mad_f32 v1, v4, v3, v5
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-CI-LABEL: v_mad_mix_v2f32_shuffle:
diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index 6b7eff316fe95b..0833dada43e4d5 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -549,17 +549,19 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}fadd_fadd_fsub:
+; GCN-LABEL: {{^}}fadd_fadd_fsub_0:
; GFX900: v_add_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, 0
; GFX900: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
-; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], 0 op_sel_hi:[1,0]{{$}}
-; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0 op_sel_hi:[1,0]{{$}}
+
+; PACKED-SDAG: v_add_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, 0
+; PACKED-SDAG: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+
; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
-define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg) {
+define amdgpu_kernel void @fadd_fadd_fsub_0(<2 x float> %arg) {
bb:
%i12 = fadd <2 x float> zeroinitializer, %arg
- %shift8 = shufflevector <2 x float> %i12, <2 x float> undef, <2 x i32> <i32 1, i32 undef>
+ %shift8 = shufflevector <2 x float> %i12, <2 x float> poison, <2 x i32> <i32 1, i32 poison>
%i13 = fadd <2 x float> zeroinitializer, %shift8
%i14 = shufflevector <2 x float> %arg, <2 x float> %i13, <2 x i32> <i32 0, i32 2>
%i15 = fsub <2 x float> %i14, zeroinitializer
@@ -567,6 +569,26 @@ bb:
ret void
}
+; GCN-LABEL: {{^}}fadd_fadd_fsub:
+; GFX900: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+; GFX900: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+
+; PACKED-SDAG: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel_hi:[1,0]{{$}}
+
+; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
+; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
+define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, ptr addrspace(1) %ptr) {
+bb:
+ %i12 = fadd <2 x float> %arg, %arg1
+ %shift8 = shufflevector <2 x float> %i12, <2 x float> poison, <2 x i32> <i32 1, i32 poison>
+ %i13 = fadd <2 x float> %arg1, %shift8
+ %i14 = shufflevector <2 x float> %arg, <2 x float> %i13, <2 x i32> <i32 0, i32 2>
+ %i15 = fsub <2 x float> %i14, %arg1
+ store <2 x float> %i15, ptr addrspace(1) %ptr
+ ret void
+}
+
; GCN-LABEL: {{^}}fadd_shuffle_v4:
; GFX900-COUNT-4: v_add_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; PACKED-SDAG-COUNT-2: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel_hi:[1,0]{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
index 02e30b6c68e994..af2af23640cee5 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
@@ -184,6 +184,7 @@ define <2 x i16> @vector_trunc_high_bits_undef_lshr_rhs_alignbit_regression(i32
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_lshr_b32_e32 v0, 16, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: vector_trunc_high_bits_undef_lshr_rhs_alignbit_regression:
@@ -300,14 +301,15 @@ define <2 x i16> @vector_trunc_high_bits_undef_or_lhs_alignbit_regression(i32 %a
; SI-LABEL: vector_trunc_high_bits_undef_or_lhs_alignbit_regression:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_or_b32_e32 v0, 0xffff0011, v0
-; SI-NEXT: v_mov_b32_e32 v1, 0xffff
+; SI-NEXT: v_or_b32_e32 v0, 17, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: vector_trunc_high_bits_undef_or_lhs_alignbit_regression:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_or_b32_e32 v0, 0xffff0011, v0
+; VI-NEXT: v_or_b32_e32 v0, 17, v0
; VI-NEXT: s_setpc_b64 s[30:31]
%undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
%lshr = or <2 x i32> %undef.hi.elt, splat (i32 17)
@@ -368,7 +370,6 @@ define <2 x i16> @vector_trunc_high_bits_undef_mul_lhs_alignbit_regression(i32 %
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mul_lo_u32 v0, v0, 18
-; VI-NEXT: v_and_b32_e32 v0, 0xfffe, v0
; VI-NEXT: s_setpc_b64 s[30:31]
%undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
%lshr = mul <2 x i32> %undef.hi.elt, splat (i32 18)
LGTM
// TODO: This should be more aggressive, particular for 16-bit element
// vectors. However there are some mixed improvements and regressions.
EVT EltTy = VT.getVectorElementType();
return EltTy.getSizeInBits() % 32 == 0;
@broxigarchen @Sisyph for true16 we should aim to return EltTy.getSizeInBits() % 16 == 0 here.
Even without true16 it should be better (maybe only even aligned cases?)
Yeah, without true16 EltTy.getSizeInBits() * Index % 32 == 0 would make sense to me.
Yes I would think EltTy.getSizeInBits() * Index % 16 == 0 for True16 would be the way to go.
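For reference, a minimal sketch of the relaxed predicate being suggested in the review above; this is not what the patch implements, and the use of Subtarget->useRealTrue16Insts() as the true16 check is an assumption for illustration only:

bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
  // Consider the extract cheap when the selected element begins on a boundary
  // the hardware can address directly: 16 bits with true16 instructions,
  // otherwise 32 bits. For elements whose size is a multiple of 32 bits this
  // reduces to the patch's condition, since every index is then aligned.
  EVT EltTy = VT.getVectorElementType();
  uint64_t BitOffset = EltTy.getFixedSizeInBits() * Index;
  unsigned Boundary = Subtarget->useRealTrue16Insts() ? 16 : 32;
  return BitOffset % Boundary == 0;
}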
Once again we have excessive TLI hooks with bad defaults. Permit this
for 32-bit element vectors, which are just use-different-register.
We should permit 16-bit vectors as cheap with legal packed instructions,
but I see some mixed improvements and regressions that need investigation.