Skip to content

Commit 3860dfc

Browse files
committed
AMDGPU: Implement isExtractVecEltCheap
Once again we have excessive TLI hooks with bad defaults. Permit this for 32-bit element vectors, where extracting an element just means using a different register. We should also treat 16-bit element vectors as cheap when packed instructions are legal, but I see a mix of improvements and regressions that still needs investigation.
1 parent ea73541 commit 3860dfc

File tree

5 files changed

+45
-16
lines changed

5 files changed

+45
-16
lines changed

Diff for: llvm/lib/Target/AMDGPU/SIISelLowering.cpp

+7
Original file line numberDiff line numberDiff line change
@@ -1952,6 +1952,13 @@ bool SITargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
19521952
return Index == 0;
19531953
}
19541954

1955+
/// Report whether extracting a single element from a vector of type \p VT is
/// considered cheap. The answer depends only on the element type of \p VT:
/// it is true when the element size in bits is a multiple of 32. \p Index is
/// accepted to match the overridden hook but is not consulted here.
bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
1956+
// TODO: This should be more aggressive, particularly for 16-bit element
1957+
// vectors. However, there are some mixed improvements and regressions.
1958+
EVT EltTy = VT.getVectorElementType();
1959+
// Elements whose size is a multiple of 32 bits are reported as cheap;
// presumably each such element occupies whole 32-bit registers, so the
// extract is just a register pick (per the commit message; confirm).
return EltTy.getSizeInBits() % 32 == 0;
1960+
}
1961+
19551962
bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
19561963
if (Subtarget->has16BitInsts() && VT == MVT::i16) {
19571964
switch (Op) {

Diff for: llvm/lib/Target/AMDGPU/SIISelLowering.h

+1
Original file line numberDiff line numberDiff line change
@@ -365,6 +365,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
365365

366366
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
367367
unsigned Index) const override;
368+
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override;
368369

369370
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override;
370371

Diff for: llvm/test/CodeGen/AMDGPU/mad-mix.ll

+5-7
Original file line numberDiff line numberDiff line change
@@ -385,17 +385,15 @@ define <2 x float> @v_mad_mix_v2f32_shuffle(<2 x half> %src0, <2 x half> %src1,
385385
; SDAG-CI: ; %bb.0:
386386
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
387387
; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3
388-
; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v4, v5
389388
; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2
390-
; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v5, v1
389+
; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
391390
; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
392391
; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3
393-
; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v4
394392
; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
395-
; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v5
396-
; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v0
397-
; SDAG-CI-NEXT: v_mad_f32 v0, v4, v2, v1
398-
; SDAG-CI-NEXT: v_mac_f32_e32 v1, v5, v3
393+
; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
394+
; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v0
395+
; SDAG-CI-NEXT: v_mad_f32 v0, v1, v2, v5
396+
; SDAG-CI-NEXT: v_mad_f32 v1, v4, v3, v5
399397
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
400398
;
401399
; GISEL-CI-LABEL: v_mad_mix_v2f32_shuffle:

Diff for: llvm/test/CodeGen/AMDGPU/packed-fp32.ll

+27-5
Original file line numberDiff line numberDiff line change
@@ -549,24 +549,46 @@ bb:
549549
ret void
550550
}
551551

552-
; GCN-LABEL: {{^}}fadd_fadd_fsub:
552+
; GCN-LABEL: {{^}}fadd_fadd_fsub_0:
553553
; GFX900: v_add_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, 0
554554
; GFX900: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
555-
; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], 0 op_sel_hi:[1,0]{{$}}
556-
; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0 op_sel_hi:[1,0]{{$}}
555+
556+
; PACKED-SDAG: v_add_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, 0
557+
; PACKED-SDAG: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
558+
557559
; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
558560
; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
559-
define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg) {
561+
define amdgpu_kernel void @fadd_fadd_fsub_0(<2 x float> %arg) {
560562
bb:
561563
%i12 = fadd <2 x float> zeroinitializer, %arg
562-
%shift8 = shufflevector <2 x float> %i12, <2 x float> undef, <2 x i32> <i32 1, i32 undef>
564+
%shift8 = shufflevector <2 x float> %i12, <2 x float> poison, <2 x i32> <i32 1, i32 poison>
563565
%i13 = fadd <2 x float> zeroinitializer, %shift8
564566
%i14 = shufflevector <2 x float> %arg, <2 x float> %i13, <2 x i32> <i32 0, i32 2>
565567
%i15 = fsub <2 x float> %i14, zeroinitializer
566568
store <2 x float> %i15, ptr undef
567569
ret void
568570
}
569571

572+
; GCN-LABEL: {{^}}fadd_fadd_fsub:
573+
; GFX900: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
574+
; GFX900: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
575+
576+
; PACKED-SDAG: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
577+
; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel_hi:[1,0]{{$}}
578+
579+
; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
580+
; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
581+
define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, ptr addrspace(1) %ptr) {
582+
bb:
583+
%i12 = fadd <2 x float> %arg, %arg1
584+
%shift8 = shufflevector <2 x float> %i12, <2 x float> poison, <2 x i32> <i32 1, i32 poison>
585+
%i13 = fadd <2 x float> %arg1, %shift8
586+
%i14 = shufflevector <2 x float> %arg, <2 x float> %i13, <2 x i32> <i32 0, i32 2>
587+
%i15 = fsub <2 x float> %i14, %arg1
588+
store <2 x float> %i15, ptr addrspace(1) %ptr
589+
ret void
590+
}
591+
570592
; GCN-LABEL: {{^}}fadd_shuffle_v4:
571593
; GFX900-COUNT-4: v_add_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
572594
; PACKED-SDAG-COUNT-2: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel_hi:[1,0]{{$}}

Diff for: llvm/test/CodeGen/AMDGPU/trunc-combine.ll

+5-4
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,7 @@ define <2 x i16> @vector_trunc_high_bits_undef_lshr_rhs_alignbit_regression(i32
184184
; SI: ; %bb.0:
185185
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
186186
; SI-NEXT: v_lshr_b32_e32 v0, 16, v0
187+
; SI-NEXT: v_mov_b32_e32 v1, 0
187188
; SI-NEXT: s_setpc_b64 s[30:31]
188189
;
189190
; VI-LABEL: vector_trunc_high_bits_undef_lshr_rhs_alignbit_regression:
@@ -300,14 +301,15 @@ define <2 x i16> @vector_trunc_high_bits_undef_or_lhs_alignbit_regression(i32 %a
300301
; SI-LABEL: vector_trunc_high_bits_undef_or_lhs_alignbit_regression:
301302
; SI: ; %bb.0:
302303
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
303-
; SI-NEXT: v_or_b32_e32 v0, 0xffff0011, v0
304-
; SI-NEXT: v_mov_b32_e32 v1, 0xffff
304+
; SI-NEXT: v_or_b32_e32 v0, 17, v0
305+
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
306+
; SI-NEXT: v_mov_b32_e32 v1, 0
305307
; SI-NEXT: s_setpc_b64 s[30:31]
306308
;
307309
; VI-LABEL: vector_trunc_high_bits_undef_or_lhs_alignbit_regression:
308310
; VI: ; %bb.0:
309311
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
310-
; VI-NEXT: v_or_b32_e32 v0, 0xffff0011, v0
312+
; VI-NEXT: v_or_b32_e32 v0, 17, v0
311313
; VI-NEXT: s_setpc_b64 s[30:31]
312314
%undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
313315
%lshr = or <2 x i32> %undef.hi.elt, splat (i32 17)
@@ -368,7 +370,6 @@ define <2 x i16> @vector_trunc_high_bits_undef_mul_lhs_alignbit_regression(i32 %
368370
; VI: ; %bb.0:
369371
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
370372
; VI-NEXT: v_mul_lo_u32 v0, v0, 18
371-
; VI-NEXT: v_and_b32_e32 v0, 0xfffe, v0
372373
; VI-NEXT: s_setpc_b64 s[30:31]
373374
%undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
374375
%lshr = mul <2 x i32> %undef.hi.elt, splat (i32 18)

0 commit comments

Comments
 (0)