
Commit acdbd89

DAG: Avoid forming shufflevector from a single extract_vector_elt
This avoids regressions in a future AMDGPU commit. Previously, a build_vector (extract_vector_elt x), undef, where the target has free access to the vector elements, would be bloated into a shuffle of one element plus undef, which has much worse combine support than the plain extract. Alternatively this could check aggressivelyPreferBuildVectorSources, but I'm not sure that is really different from isExtractVecEltCheap.
1 parent ea6a8ce commit acdbd89
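
To make the trade-off concrete, below is a small self-contained C++ model of the decision described above. The type and function names are invented for this illustration and are not the SelectionDAG API; they only mirror the shape of the check: one constant-index extract feeding a build_vector whose other lanes are undef, on a target where that extract is cheap.

#include <cstdio>
#include <optional>
#include <vector>

// Toy stand-in for one build_vector operand: either undef, or an
// extract_vector_elt from a source vector at a constant index.
struct BuildVecOperand {
  bool IsUndef = true;
  std::optional<unsigned> ConstExtractIndex; // set for extract_vector_elt
};

// Returns true if turning the build_vector into a shufflevector is worthwhile.
// With exactly one constant-index extract (all other lanes undef) and a target
// that reports the extract as cheap, keep the plain extract instead.
bool worthFormingShuffle(const std::vector<BuildVecOperand> &Ops,
                         bool ExtractIsCheap) {
  unsigned NumExtracts = 0;
  for (const BuildVecOperand &Op : Ops) {
    if (Op.IsUndef)
      continue;
    if (!Op.ConstExtractIndex)
      return false; // some other node: this combine does not apply at all
    ++NumExtracts;
  }
  if (NumExtracts == 1 && ExtractIsCheap)
    return false; // a one-element shuffle would only hurt later combines
  return NumExtracts != 0;
}

int main() {
  // build_vector (extract_vector_elt x, 0), undef: the case from the commit
  // message, on a target with free element access.
  std::vector<BuildVecOperand> Ops = {{false, 0u}, {true, std::nullopt}};
  std::printf("form shuffle? %d\n",
              worthFormingShuffle(Ops, /*ExtractIsCheap=*/true));
  return 0;
}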

6 files changed, 123 insertions(+), 91 deletions(-)


Diff for: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

+20 −5

@@ -23799,6 +23799,10 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
   SmallVector<SDValue, 8> VecIn;
   VecIn.push_back(SDValue());
 
+  // If we have a single extract_element with a constant index, track the index
+  // value.
+  unsigned OneConstExtractIndex = ~0u;
+
   for (unsigned i = 0; i != NumElems; ++i) {
     SDValue Op = N->getOperand(i);
 
@@ -23816,23 +23820,27 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
 
     // Not an undef or zero. If the input is something other than an
    // EXTRACT_VECTOR_ELT with an in-range constant index, bail out.
-    if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
-        !isa<ConstantSDNode>(Op.getOperand(1)))
+    if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
       return SDValue();
-    SDValue ExtractedFromVec = Op.getOperand(0);
 
+    SDValue ExtractedFromVec = Op.getOperand(0);
     if (ExtractedFromVec.getValueType().isScalableVector())
       return SDValue();
+    auto *ExtractIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+    if (!ExtractIdx)
+      return SDValue();
 
-    const APInt &ExtractIdx = Op.getConstantOperandAPInt(1);
-    if (ExtractIdx.uge(ExtractedFromVec.getValueType().getVectorNumElements()))
+    if (ExtractIdx->getAsAPIntVal().uge(
+            ExtractedFromVec.getValueType().getVectorNumElements()))
       return SDValue();
 
     // All inputs must have the same element type as the output.
     if (VT.getVectorElementType() !=
         ExtractedFromVec.getValueType().getVectorElementType())
       return SDValue();
 
+    OneConstExtractIndex = ExtractIdx->getZExtValue();
+
     // Have we seen this input vector before?
     // The vectors are expected to be tiny (usually 1 or 2 elements), so using
     // a map back from SDValues to numbers isn't worth it.
@@ -23855,6 +23863,13 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
   // VecIn accordingly.
   bool DidSplitVec = false;
   if (VecIn.size() == 2) {
+    // If we only found a single constant indexed extract_vector_elt feeding the
+    // build_vector, do not produce a more complicated shuffle if the extract is
+    // cheap.
+    if (TLI.isOperationLegalOrCustom(ISD::EXTRACT_VECTOR_ELT, VT) &&
+        TLI.isExtractVecEltCheap(VT, OneConstExtractIndex))
+      return SDValue();
+
     unsigned MaxIndex = 0;
     unsigned NearestPow2 = 0;
     SDValue Vec = VecIn.back();
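
Whether the new early exit fires is entirely target-driven, via the two TargetLowering hooks queried above. As a rough sketch (not part of this commit), a target that wants such build_vectors left as a plain extract would override isExtractVecEltCheap along these lines; MyTargetLowering is a placeholder class and the cost policy is made up, only the hook name and signature come from TargetLowering:

// Hypothetical target override; assumes the usual target-lowering boilerplate
// (a MyTargetLowering class deriving from TargetLowering) already exists.
bool MyTargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
  // Made-up policy: extracting element 0 of any legal vector type is free.
  return isTypeLegal(VT) && Index == 0;
}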

Diff for: llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll

+5 −5

@@ -452,11 +452,11 @@ define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i3
 ; GCN-NEXT:    s_and_b32 s6, s4, 0x1010101
 ; GCN-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[4:5]
 ; GCN-NEXT:    s_or_b64 s[2:3], s[6:7], s[2:3]
-; GCN-NEXT:    v_mov_b32_e32 v3, s1
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NEXT:    v_mov_b32_e32 v2, s0
-; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GCN-NEXT:    s_endpgm
 entry:
   %v = insertelement <8 x i8> %vec, i8 1, i32 %sel

Diff for: llvm/test/CodeGen/X86/avx512-build-vector.ll

+3 −5

@@ -14,11 +14,9 @@ define <16 x i32> @test2(<16 x i32> %x) {
 define <16 x float> @test3(<4 x float> %a) {
 ; CHECK-LABEL: test3:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,1,2,3,4,18,16,7,8,9,10,11,12,13,14,15]
-; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpermt2ps %zmm0, %zmm2, %zmm1
-; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[8,9,10,11,0,1,2,3],zero,zero,zero,zero
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vinserti32x4 $1, %xmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %b = extractelement <4 x float> %a, i32 2
   %c = insertelement <16 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %b, i32 5

Diff for: llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll

+89 −68

@@ -2846,12 +2846,12 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec
 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [1,3,5,0]
-; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vcmpeqps %xmm4, %xmm2, %k1
-; CHECK-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1}
-; CHECK-NEXT:    vmovaps %xmm1, %xmm0
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [1,3,5,3]
+; CHECK-NEXT:    vpermps %ymm0, %ymm3, %ymm3
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0]
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT:    vblendmps %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 0>
@@ -2863,11 +2863,12 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec,
 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [1,3,5,0]
-; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
-; CHECK-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [1,3,5,3]
+; CHECK-NEXT:    vpermps %ymm0, %ymm2, %ymm2
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vmovaps %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 0>
@@ -2878,12 +2879,12 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec
 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [3,2,7,0]
-; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vcmpeqps %xmm4, %xmm2, %k1
-; CHECK-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1}
-; CHECK-NEXT:    vmovaps %xmm1, %xmm0
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [3,2,7,3]
+; CHECK-NEXT:    vpermps %ymm0, %ymm3, %ymm3
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0]
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT:    vblendmps %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 2, i32 7, i32 0>
@@ -2895,11 +2896,12 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec,
 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [3,2,7,0]
-; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
-; CHECK-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [3,2,7,3]
+; CHECK-NEXT:    vpermps %ymm0, %ymm2, %ymm2
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vmovaps %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 2, i32 7, i32 0>
@@ -3885,10 +3887,12 @@ define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask0(ptr %vp,
 define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2 x double> %vec2, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd 16(%rdi), %xmm2
-; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
-; CHECK-NEXT:    vunpcklpd (%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[0],mem[0]
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vpermpd $226, (%rdi), %ymm0 {%k1} # ymm0 {%k1} = mem[2,0,2,3]
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, ptr %vp
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
@@ -3900,10 +3904,11 @@ define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2
 define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd 16(%rdi), %xmm1
-; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm0, %k1
-; CHECK-NEXT:    vunpcklpd (%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[0],mem[0]
+; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vcmpeqpd %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vpermpd $226, (%rdi), %ymm0 {%k1} {z} # ymm0 {%k1} {z} = mem[2,0,2,3]
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, ptr %vp
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
@@ -4130,38 +4135,42 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask5(<8 x double>
 define <4 x double> @test_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec) {
 ; CHECK-FAST-LABEL: test_8xdouble_to_4xdouble_perm_mask6:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [5,0,7,0]
-; CHECK-FAST-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
-; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [5,0]
+; CHECK-FAST-NEXT:    vpermpd %zmm0, %zmm1, %zmm1
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [7,0]
+; CHECK-FAST-NEXT:    vpermpd %zmm0, %zmm2, %zmm0
+; CHECK-FAST-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_8xdouble_to_4xdouble_perm_mask6:
 ; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
-; CHECK-FAST-PERLANE-NEXT:    vbroadcastsd %xmm0, %ymm0
-; CHECK-FAST-PERLANE-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; CHECK-FAST-PERLANE-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [5,0,7,0]
+; CHECK-FAST-PERLANE-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
+; CHECK-FAST-PERLANE-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-FAST-PERLANE-NEXT:    retq
   %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
   ret <4 x double> %res
 }
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask6:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [5,0,7,0]
-; CHECK-FAST-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
-; CHECK-FAST-NEXT:    vcmpeqpd %ymm4, %ymm2, %k1
-; CHECK-FAST-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1}
-; CHECK-FAST-NEXT:    vmovapd %ymm1, %ymm0
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [5,0]
+; CHECK-FAST-NEXT:    vpermpd %zmm0, %zmm3, %zmm3
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} xmm4 = [7,0]
+; CHECK-FAST-NEXT:    vpermpd %zmm0, %zmm4, %zmm0
+; CHECK-FAST-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; CHECK-FAST-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-FAST-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-FAST-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask6:
 ; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vextractf64x4 $1, %zmm0, %ymm3
-; CHECK-FAST-PERLANE-NEXT:    vbroadcastsd %xmm0, %ymm0
+; CHECK-FAST-PERLANE-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; CHECK-FAST-PERLANE-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [5,0,7,0]
 ; CHECK-FAST-PERLANE-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
 ; CHECK-FAST-PERLANE-NEXT:    vcmpeqpd %ymm4, %ymm2, %k1
-; CHECK-FAST-PERLANE-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm3[1],ymm0[1],ymm3[3],ymm0[3]
+; CHECK-FAST-PERLANE-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1}
 ; CHECK-FAST-PERLANE-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-FAST-PERLANE-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
@@ -4173,20 +4182,23 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask6(<8 x double> %v
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %mask) {
 ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask6:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [5,0,7,0]
-; CHECK-FAST-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-FAST-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
-; CHECK-FAST-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
-; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [5,0]
+; CHECK-FAST-NEXT:    vpermpd %zmm0, %zmm2, %zmm2
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [7,0]
+; CHECK-FAST-NEXT:    vpermpd %zmm0, %zmm3, %zmm0
+; CHECK-FAST-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; CHECK-FAST-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-FAST-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-FAST-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask6:
 ; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-FAST-PERLANE-NEXT:    vbroadcastsd %xmm0, %ymm0
+; CHECK-FAST-PERLANE-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [5,0,7,0]
 ; CHECK-FAST-PERLANE-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-FAST-PERLANE-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
-; CHECK-FAST-PERLANE-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
+; CHECK-FAST-PERLANE-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-FAST-PERLANE-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-FAST-PERLANE-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -4493,9 +4505,12 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2(ptr %vp,
 define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp) {
 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %ymm1
-; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm0 = [4,2,1,0]
-; CHECK-NEXT:    vpermi2pd 32(%rdi), %ymm1, %ymm0
+; CHECK-NEXT:    vpmovsxbq {{.*#+}} xmm0 = [4,2]
+; CHECK-NEXT:    vmovaps (%rdi), %zmm1
+; CHECK-NEXT:    vpermpd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    vmovddup 8(%rdi), %xmm2 # xmm2 = mem[0,0]
+; CHECK-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
@@ -4504,12 +4519,15 @@ define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp) {
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %ymm2
-; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [4,2,1,0]
-; CHECK-NEXT:    vpermi2pd 32(%rdi), %ymm2, %ymm3
-; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-NEXT:    vmovapd %ymm3, %ymm0 {%k1}
+; CHECK-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [4,2]
+; CHECK-NEXT:    vmovapd (%rdi), %zmm3
+; CHECK-NEXT:    vpermpd %zmm3, %zmm2, %zmm2
+; CHECK-NEXT:    vmovddup 8(%rdi), %xmm4 # xmm4 = mem[0,0]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm3 = xmm4[0],xmm3[0]
+; CHECK-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vmovapd %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
@@ -4521,12 +4539,15 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp, <4
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %ymm2
-; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [4,2,1,0]
-; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
-; CHECK-NEXT:    vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT:    vmovapd %ymm1, %ymm0
+; CHECK-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [4,2]
+; CHECK-NEXT:    vmovapd (%rdi), %zmm2
+; CHECK-NEXT:    vpermpd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT:    vmovddup 8(%rdi), %xmm3 # xmm3 = mem[0,0]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; CHECK-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm0, %k1
+; CHECK-NEXT:    vmovapd %ymm1, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>

Diff for: llvm/test/CodeGen/X86/insertelement-duplicates.ll

+4 −6

@@ -31,18 +31,16 @@ define void @PR15298(ptr nocapture %source, ptr nocapture %dest) nounwind noinli
 ; AVX-32:       # %bb.0: # %L.entry
 ; AVX-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; AVX-32-NEXT:    vbroadcastss 304(%ecx), %xmm0
-; AVX-32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-32-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6,7]
+; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-32-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,0,0,1]
 ; AVX-32-NEXT:    vmovups %ymm0, 608(%eax)
 ; AVX-32-NEXT:    vzeroupper
 ; AVX-32-NEXT:    retl
 ;
 ; AVX-64-LABEL: PR15298:
 ; AVX-64:       # %bb.0: # %L.entry
-; AVX-64-NEXT:    vbroadcastss 304(%rdi), %xmm0
-; AVX-64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-64-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6,7]
+; AVX-64-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-64-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,0,0,1]
 ; AVX-64-NEXT:    vmovups %ymm0, 608(%rsi)
 ; AVX-64-NEXT:    vzeroupper
 ; AVX-64-NEXT:    retq

Diff for: llvm/test/CodeGen/X86/sse-align-12.ll

+2 −2

@@ -40,8 +40,8 @@ define <4 x float> @b(ptr %y, <4 x float> %z) nounwind {
 define <2 x double> @c(ptr %y) nounwind {
 ; CHECK-LABEL: c:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movups (%rdi), %xmm0
-; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; CHECK-NEXT:    retq
   %x = load <2 x double>, ptr %y, align 8
   %a = extractelement <2 x double> %x, i32 0
