
Commit 1cae79c

DAG: Handle load in SimplifyDemandedVectorElts
This improves some AMDGPU cases and avoids future regressions. The combiner likes to form shuffles for cases where an extract_vector_elt would do perfectly well, and this recovers some of the regressions from losing load narrowing. The AMDGPU, AArch64 and RISCV test changes look broadly better. Other targets show some improvements but mostly regressions; X86 in particular looks much worse, and I'm guessing that is because its shouldReduceLoadWidth is wrong. I mostly just regenerated the checks. I assume some set of them should switch to using volatile loads to defeat the optimization.
1 parent 2f7d3ec commit 1cae79c
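As an illustration (a sketch added for this write-up, not part of the commit), the pattern the new ISD::LOAD case targets is a vector load of which only one lane is actually demanded. The function names below are made up, but the IR mirrors the dag-ReplaceAllUsesOfValuesWith.ll test further down:

; Sketch: only lane 1 of the loaded vector is demanded, so the new case can
; scalarize the <2 x i64> load to an i64 load of element 1 and rebuild the
; vector with an insert_vector_elt into undef.
define i64 @one_lane_demanded(ptr %p) {
  %vec = load <2 x i64>, ptr %p
  %elt = extractelement <2 x i64> %vec, i32 1
  ret i64 %elt
}

; Sketch: a volatile load is not "simple", so the new code leaves it alone;
; presumably this is why several regenerated tests switch to volatile loads
; to keep exercising their original patterns.
define i64 @one_lane_demanded_volatile(ptr %p) {
  %vec = load volatile <2 x i64>, ptr %p
  %elt = extractelement <2 x i64> %vec, i32 1
  ret i64 %elt
}

This mirrors the guards in the new code: a normal, simple load, exactly one demanded element, and (when operations must be legal) a legal or custom INSERT_VECTOR_ELT.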

149 files changed: +4004 −3802 lines


Diff for: llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

+31
@@ -3479,6 +3479,37 @@ bool TargetLowering::SimplifyDemandedVectorElts(
     break;
   }
+  case ISD::LOAD: {
+    auto *Ld = cast<LoadSDNode>(Op);
+    if (!ISD::isNormalLoad(Ld) || !Ld->isSimple())
+      break;
+
+    // TODO: Handle arbitrary vector extract for isMask
+    if (DemandedElts.popcount() != 1)
+      break;
+
+    EVT VT = Ld->getValueType(0);
+    if (TLO.LegalOperations() &&
+        !isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
+      break;
+
+    EVT EltVT = VT.getVectorElementType();
+    SDLoc DL(Ld);
+
+    unsigned Idx = DemandedElts.countTrailingZeros();
+
+    SDValue IdxVal = TLO.DAG.getVectorIdxConstant(Idx, DL);
+    SDValue Scalarized =
+        scalarizeExtractedVectorLoad(EltVT, DL, VT, IdxVal, Ld, TLO.DAG);
+    if (!Scalarized)
+      break;
+
+    TLO.DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Scalarized.getValue(1));
+
+    SDValue Insert = TLO.DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT,
+                                     TLO.DAG.getUNDEF(VT), Scalarized, IdxVal);
+    return TLO.CombineTo(Op, Insert);
+  }
   case ISD::VECTOR_SHUFFLE: {
     SDValue LHS = Op.getOperand(0);
     SDValue RHS = Op.getOperand(1);

Diff for: llvm/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll

+27 −27
@@ -30,7 +30,7 @@ define void @test_i64_v2f32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: rev64 v{{[0-9]+}}.2s
 ; CHECK: str
-  %1 = load <2 x float>, ptr %p
+  %1 = load volatile <2 x float>, ptr %p
   %2 = fadd <2 x float> %1, %1
   %3 = bitcast <2 x float> %2 to i64
   %4 = add i64 %3, %3
@@ -43,7 +43,7 @@ define void @test_i64_v2i32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: rev64 v{{[0-9]+}}.2s
 ; CHECK: str
-  %1 = load <2 x i32>, ptr %p
+  %1 = load volatile <2 x i32>, ptr %p
   %2 = add <2 x i32> %1, %1
   %3 = bitcast <2 x i32> %2 to i64
   %4 = add i64 %3, %3
@@ -121,7 +121,7 @@ define void @test_f64_v2f32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: rev64 v{{[0-9]+}}.2s
 ; CHECK: str
-  %1 = load <2 x float>, ptr %p
+  %1 = load volatile <2 x float>, ptr %p
   %2 = fadd <2 x float> %1, %1
   %3 = bitcast <2 x float> %2 to double
   %4 = fadd double %3, %3
@@ -134,7 +134,7 @@ define void @test_f64_v2i32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: rev64 v{{[0-9]+}}.2s
 ; CHECK: str
-  %1 = load <2 x i32>, ptr %p
+  %1 = load volatile <2 x i32>, ptr %p
   %2 = add <2 x i32> %1, %1
   %3 = bitcast <2 x i32> %2 to double
   %4 = fadd double %3, %3
@@ -213,7 +213,7 @@ define void @test_v1i64_v2f32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: rev64 v{{[0-9]+}}.2s
 ; CHECK: str
-  %1 = load <2 x float>, ptr %p
+  %1 = load volatile <2 x float>, ptr %p
   %2 = fadd <2 x float> %1, %1
   %3 = bitcast <2 x float> %2 to <1 x i64>
   %4 = add <1 x i64> %3, %3
@@ -226,7 +226,7 @@ define void @test_v1i64_v2i32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: rev64 v{{[0-9]+}}.2s
 ; CHECK: str
-  %1 = load <2 x i32>, ptr %p
+  %1 = load volatile <2 x i32>, ptr %p
   %2 = add <2 x i32> %1, %1
   %3 = bitcast <2 x i32> %2 to <1 x i64>
   %4 = add <1 x i64> %3, %3
@@ -318,7 +318,7 @@ define void @test_v2f32_v1i64(ptr %p, ptr %q) {
 define void @test_v2f32_v2i32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: st1 { v{{[0-9]+}}.2s }
-  %1 = load <2 x i32>, ptr %p
+  %1 = load volatile <2 x i32>, ptr %p
   %2 = add <2 x i32> %1, %1
   %3 = bitcast <2 x i32> %2 to <2 x float>
   %4 = fadd <2 x float> %3, %3
@@ -410,7 +410,7 @@ define void @test_v2i32_v1i64(ptr %p, ptr %q) {
 define void @test_v2i32_v2f32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: st1 { v{{[0-9]+}}.2s }
-  %1 = load <2 x float>, ptr %p
+  %1 = load volatile <2 x float>, ptr %p
   %2 = fadd <2 x float> %1, %1
   %3 = bitcast <2 x float> %2 to <2 x i32>
   %4 = add <2 x i32> %3, %3
@@ -488,7 +488,7 @@ define void @test_v4i16_v2f32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: rev32 v{{[0-9]+}}.4h
 ; CHECK: st1 { v{{[0-9]+}}.4h }
-  %1 = load <2 x float>, ptr %p
+  %1 = load volatile <2 x float>, ptr %p
   %2 = fadd <2 x float> %1, %1
   %3 = bitcast <2 x float> %2 to <4 x i16>
   %4 = add <4 x i16> %3, %3
@@ -501,7 +501,7 @@ define void @test_v4i16_v2i32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: rev32 v{{[0-9]+}}.4h
 ; CHECK: st1 { v{{[0-9]+}}.4h }
-  %1 = load <2 x i32>, ptr %p
+  %1 = load volatile <2 x i32>, ptr %p
   %2 = add <2 x i32> %1, %1
   %3 = bitcast <2 x i32> %2 to <4 x i16>
   %4 = add <4 x i16> %3, %3
@@ -587,7 +587,7 @@ define void @test_v4f16_v2f32(ptr %p, ptr %q) {
 ; CHECK: fadd
 ; CHECK-NOT: rev
 ; CHECK: st1 { v{{[0-9]+}}.4h }
-  %1 = load <2 x float>, ptr %p
+  %1 = load volatile <2 x float>, ptr %p
   %2 = fadd <2 x float> %1, %1
   %3 = bitcast <2 x float> %2 to <4 x half>
   %4 = fadd <4 x half> %3, %3
@@ -602,7 +602,7 @@ define void @test_v4f16_v2i32(ptr %p, ptr %q) {
 ; CHECK: fadd
 ; CHECK-NOT: rev
 ; CHECK: st1 { v{{[0-9]+}}.4h }
-  %1 = load <2 x i32>, ptr %p
+  %1 = load volatile <2 x i32>, ptr %p
   %2 = add <2 x i32> %1, %1
   %3 = bitcast <2 x i32> %2 to <4 x half>
   %4 = fadd <4 x half> %3, %3
@@ -682,7 +682,7 @@ define void @test_v8i8_v2f32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: rev32 v{{[0-9]+}}.8b
 ; CHECK: st1 { v{{[0-9]+}}.8b }
-  %1 = load <2 x float>, ptr %p
+  %1 = load volatile <2 x float>, ptr %p
   %2 = fadd <2 x float> %1, %1
   %3 = bitcast <2 x float> %2 to <8 x i8>
   %4 = add <8 x i8> %3, %3
@@ -695,7 +695,7 @@ define void @test_v8i8_v2i32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: rev32 v{{[0-9]+}}.8b
 ; CHECK: st1 { v{{[0-9]+}}.8b }
-  %1 = load <2 x i32>, ptr %p
+  %1 = load volatile <2 x i32>, ptr %p
   %2 = add <2 x i32> %1, %1
   %3 = bitcast <2 x i32> %2 to <8 x i8>
   %4 = add <8 x i8> %3, %3
@@ -721,7 +721,7 @@ define void @test_f128_v2f64(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2d }
 ; CHECK: ext
 ; CHECK: str
-  %1 = load <2 x double>, ptr %p
+  %1 = load volatile <2 x double>, ptr %p
   %2 = fadd <2 x double> %1, %1
   %3 = bitcast <2 x double> %2 to fp128
   %4 = fadd fp128 %3, %3
@@ -734,7 +734,7 @@ define void @test_f128_v2i64(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2d }
 ; CHECK: ext
 ; CHECK: str
-  %1 = load <2 x i64>, ptr %p
+  %1 = load volatile <2 x i64>, ptr %p
   %2 = add <2 x i64> %1, %1
   %3 = bitcast <2 x i64> %2 to fp128
   %4 = fadd fp128 %3, %3
@@ -816,7 +816,7 @@ define void @test_v2f64_f128(ptr %p, ptr %q) {
 define void @test_v2f64_v2i64(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2d }
 ; CHECK: st1 { v{{[0-9]+}}.2d }
-  %1 = load <2 x i64>, ptr %p
+  %1 = load volatile <2 x i64>, ptr %p
   %2 = add <2 x i64> %1, %1
   %3 = bitcast <2 x i64> %2 to <2 x double>
   %4 = fadd <2 x double> %3, %3
@@ -895,7 +895,7 @@ define void @test_v2i64_f128(ptr %p, ptr %q) {
 define void @test_v2i64_v2f64(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2d }
 ; CHECK: st1 { v{{[0-9]+}}.2d }
-  %1 = load <2 x double>, ptr %p
+  %1 = load volatile <2 x double>, ptr %p
   %2 = fadd <2 x double> %1, %1
   %3 = bitcast <2 x double> %2 to <2 x i64>
   %4 = add <2 x i64> %3, %3
@@ -979,7 +979,7 @@ define void @test_v4f32_v2f64(ptr %p, ptr %q) {
 ; CHECK: rev64 v{{[0-9]+}}.4s
 ; CHECK-NOT: rev
 ; CHECK: st1 { v{{[0-9]+}}.4s }
-  %1 = load <2 x double>, ptr %p
+  %1 = load volatile <2 x double>, ptr %p
   %2 = fadd <2 x double> %1, %1
   %3 = bitcast <2 x double> %2 to <4 x float>
   %4 = fadd <4 x float> %3, %3
@@ -994,7 +994,7 @@ define void @test_v4f32_v2i64(ptr %p, ptr %q) {
 ; CHECK: fadd
 ; CHECK-NOT: rev
 ; CHECK: st1 { v{{[0-9]+}}.4s }
-  %1 = load <2 x i64>, ptr %p
+  %1 = load volatile <2 x i64>, ptr %p
   %2 = add <2 x i64> %1, %1
   %3 = bitcast <2 x i64> %2 to <4 x float>
   %4 = fadd <4 x float> %3, %3
@@ -1062,7 +1062,7 @@ define void @test_v4i32_v2f64(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2d }
 ; CHECK: rev64 v{{[0-9]+}}.4s
 ; CHECK: st1 { v{{[0-9]+}}.4s }
-  %1 = load <2 x double>, ptr %p
+  %1 = load volatile <2 x double>, ptr %p
   %2 = fadd <2 x double> %1, %1
   %3 = bitcast <2 x double> %2 to <4 x i32>
   %4 = add <4 x i32> %3, %3
@@ -1075,7 +1075,7 @@ define void @test_v4i32_v2i64(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2d }
 ; CHECK: rev64 v{{[0-9]+}}.4s
 ; CHECK: st1 { v{{[0-9]+}}.4s }
-  %1 = load <2 x i64>, ptr %p
+  %1 = load volatile <2 x i64>, ptr %p
   %2 = add <2 x i64> %1, %1
   %3 = bitcast <2 x i64> %2 to <4 x i32>
   %4 = add <4 x i32> %3, %3
@@ -1141,7 +1141,7 @@ define void @test_v8i16_v2f64(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2d }
 ; CHECK: rev64 v{{[0-9]+}}.8h
 ; CHECK: st1 { v{{[0-9]+}}.8h }
-  %1 = load <2 x double>, ptr %p
+  %1 = load volatile <2 x double>, ptr %p
   %2 = fadd <2 x double> %1, %1
   %3 = bitcast <2 x double> %2 to <8 x i16>
   %4 = add <8 x i16> %3, %3
@@ -1154,7 +1154,7 @@ define void @test_v8i16_v2i64(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2d }
 ; CHECK: rev64 v{{[0-9]+}}.8h
 ; CHECK: st1 { v{{[0-9]+}}.8h }
-  %1 = load <2 x i64>, ptr %p
+  %1 = load volatile <2 x i64>, ptr %p
   %2 = add <2 x i64> %1, %1
   %3 = bitcast <2 x i64> %2 to <8 x i16>
   %4 = add <8 x i16> %3, %3
@@ -1234,7 +1234,7 @@ define void @test_v16i8_v2f64(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2d }
 ; CHECK: rev64 v{{[0-9]+}}.16b
 ; CHECK: st1 { v{{[0-9]+}}.16b }
-  %1 = load <2 x double>, ptr %p
+  %1 = load volatile <2 x double>, ptr %p
   %2 = fadd <2 x double> %1, %1
   %3 = bitcast <2 x double> %2 to <16 x i8>
   %4 = add <16 x i8> %3, %3
@@ -1247,7 +1247,7 @@ define void @test_v16i8_v2i64(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2d }
 ; CHECK: rev64 v{{[0-9]+}}.16b
 ; CHECK: st1 { v{{[0-9]+}}.16b }
-  %1 = load <2 x i64>, ptr %p
+  %1 = load volatile <2 x i64>, ptr %p
   %2 = add <2 x i64> %1, %1
   %3 = bitcast <2 x i64> %2 to <16 x i8>
   %4 = add <16 x i8> %3, %3
@@ -1315,7 +1315,7 @@ define %struct.struct1 @test_v4f16_struct(ptr %ret) {
 entry:
 ; CHECK: ld1 { {{v[0-9]+}}.4h }
 ; CHECK-NOT: rev
-  %0 = load <4 x half>, ptr %ret, align 2
+  %0 = load volatile <4 x half>, ptr %ret, align 2
   %1 = extractelement <4 x half> %0, i32 0
   %.fca.0.insert = insertvalue %struct.struct1 undef, half %1, 0
   ret %struct.struct1 %.fca.0.insert

Diff for: llvm/test/CodeGen/AArch64/dag-ReplaceAllUsesOfValuesWith.ll

+1 −4
@@ -27,10 +27,7 @@
 define i64 @g(ptr %p) {
 ; CHECK-LABEL: g:
 ; CHECK: // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    add x9, x8, x8
-; CHECK-NEXT:    add x8, x9, x8
-; CHECK-NEXT:    sub x0, x8, x8
+; CHECK-NEXT:    mov x0, xzr
 ; CHECK-NEXT:    ret
   %vec = load <2 x i64>, ptr %p, align 1
   %elt = extractelement <2 x i64> %vec, i32 1

Diff for: llvm/test/CodeGen/AArch64/fcmp.ll

+21 −22
@@ -679,48 +679,47 @@ define <3 x double> @v3f128_double(<3 x fp128> %a, <3 x fp128> %b, <3 x double>
 ; CHECK-SD-NEXT:    .cfi_def_cfa_offset 160
 ; CHECK-SD-NEXT:    .cfi_offset w30, -16
 ; CHECK-SD-NEXT:    stp q2, q5, [sp, #112] // 32-byte Folded Spill
+; CHECK-SD-NEXT:    add x8, sp, #176
 ; CHECK-SD-NEXT:    // kill: def $d6 killed $d6 def $q6
 ; CHECK-SD-NEXT:    // kill: def $d7 killed $d7 def $q7
-; CHECK-SD-NEXT:    ldr d5, [sp, #184]
-; CHECK-SD-NEXT:    str q3, [sp, #64] // 16-byte Folded Spill
-; CHECK-SD-NEXT:    ldp d3, d2, [sp, #168]
+; CHECK-SD-NEXT:    str q3, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    ldp d3, d2, [sp, #160]
 ; CHECK-SD-NEXT:    mov v6.d[1], v7.d[0]
 ; CHECK-SD-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
 ; CHECK-SD-NEXT:    mov v0.16b, v1.16b
 ; CHECK-SD-NEXT:    mov v1.16b, v4.16b
-; CHECK-SD-NEXT:    str q5, [sp, #96] // 16-byte Folded Spill
-; CHECK-SD-NEXT:    ldr d5, [sp, #160]
-; CHECK-SD-NEXT:    mov v3.d[1], v2.d[0]
-; CHECK-SD-NEXT:    str q5, [sp, #80] // 16-byte Folded Spill
-; CHECK-SD-NEXT:    stp q6, q3, [sp, #32] // 32-byte Folded Spill
+; CHECK-SD-NEXT:    ld1 { v2.d }[1], [x8]
+; CHECK-SD-NEXT:    stp q6, q3, [sp, #80] // 32-byte Folded Spill
+; CHECK-SD-NEXT:    str q2, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    ldr d2, [sp, #184]
+; CHECK-SD-NEXT:    str q2, [sp, #64] // 16-byte Folded Spill
 ; CHECK-SD-NEXT:    bl __lttf2
 ; CHECK-SD-NEXT:    cmp w0, #0
-; CHECK-SD-NEXT:    ldr q1, [sp, #64] // 16-byte Folded Reload
 ; CHECK-SD-NEXT:    cset w8, lt
 ; CHECK-SD-NEXT:    sbfx x8, x8, #0, #1
 ; CHECK-SD-NEXT:    fmov d0, x8
 ; CHECK-SD-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-SD-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp q0, q1, [sp, #16] // 32-byte Folded Reload
 ; CHECK-SD-NEXT:    bl __lttf2
 ; CHECK-SD-NEXT:    cmp w0, #0
 ; CHECK-SD-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-SD-NEXT:    cset w8, lt
 ; CHECK-SD-NEXT:    sbfx x8, x8, #0, #1
 ; CHECK-SD-NEXT:    fmov d1, x8
 ; CHECK-SD-NEXT:    mov v1.d[1], v0.d[0]
-; CHECK-SD-NEXT:    str q1, [sp, #64] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    str q1, [sp, #32] // 16-byte Folded Spill
 ; CHECK-SD-NEXT:    ldp q0, q1, [sp, #112] // 32-byte Folded Reload
 ; CHECK-SD-NEXT:    bl __lttf2
-; CHECK-SD-NEXT:    ldp q1, q0, [sp, #32] // 32-byte Folded Reload
+; CHECK-SD-NEXT:    ldp q0, q3, [sp, #80] // 32-byte Folded Reload
 ; CHECK-SD-NEXT:    cmp w0, #0
-; CHECK-SD-NEXT:    ldp q2, q4, [sp, #64] // 32-byte Folded Reload
+; CHECK-SD-NEXT:    ldp q2, q1, [sp, #32] // 32-byte Folded Reload
 ; CHECK-SD-NEXT:    cset w8, lt
 ; CHECK-SD-NEXT:    sbfx x8, x8, #0, #1
-; CHECK-SD-NEXT:    ldr q3, [sp, #96] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr q4, [sp, #64] // 16-byte Folded Reload
 ; CHECK-SD-NEXT:    ldr x30, [sp, #144] // 8-byte Folded Reload
-; CHECK-SD-NEXT:    bit v0.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-SD-NEXT:    fmov d2, x8
-; CHECK-SD-NEXT:    bsl v2.16b, v4.16b, v3.16b
+; CHECK-SD-NEXT:    bsl v2.16b, v3.16b, v4.16b
 ; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 killed $q2
@@ -815,20 +814,20 @@ define <3 x double> @v3f64_double(<3 x double> %a, <3 x double> %b, <3 x double>
 ; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-SD-NEXT:    // kill: def $d6 killed $d6 def $q6
 ; CHECK-SD-NEXT:    // kill: def $d7 killed $d7 def $q7
+; CHECK-SD-NEXT:    add x8, sp, #16
 ; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 def $q2
 ; CHECK-SD-NEXT:    // kill: def $d5 killed $d5 def $q5
-; CHECK-SD-NEXT:    ldr d16, [sp, #24]
-; CHECK-SD-NEXT:    ldr d17, [sp]
 ; CHECK-SD-NEXT:    mov v3.d[1], v4.d[0]
 ; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-SD-NEXT:    mov v6.d[1], v7.d[0]
-; CHECK-SD-NEXT:    ldp d1, d4, [sp, #8]
 ; CHECK-SD-NEXT:    fcmgt v2.2d, v5.2d, v2.2d
-; CHECK-SD-NEXT:    mov v1.d[1], v4.d[0]
 ; CHECK-SD-NEXT:    fcmgt v0.2d, v3.2d, v0.2d
-; CHECK-SD-NEXT:    bsl v2.16b, v17.16b, v16.16b
-; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; CHECK-SD-NEXT:    ldp d3, d1, [sp]
+; CHECK-SD-NEXT:    ld1 { v1.d }[1], [x8]
 ; CHECK-SD-NEXT:    bsl v0.16b, v6.16b, v1.16b
+; CHECK-SD-NEXT:    ldr d1, [sp, #24]
+; CHECK-SD-NEXT:    bsl v2.16b, v3.16b, v1.16b
+; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 killed $q1

Diff for: llvm/test/CodeGen/AArch64/fmlal-loreg.ll

+4 −4
@@ -45,11 +45,11 @@ define void @loop(ptr %out_tile, ptr %lhs_panel, ptr %rhs_panel, i32 noundef %K,
 ; CHECK-NEXT:    mov w8, w3
 ; CHECK-NEXT:  .LBB1_1: // %for.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr q2, [x1], #2
+; CHECK-NEXT:    ldr q2, [x2], #2
 ; CHECK-NEXT:    subs x8, x8, #1
-; CHECK-NEXT:    ldr q3, [x2], #2
-; CHECK-NEXT:    fmlal v0.4s, v3.4h, v2.h[0]
-; CHECK-NEXT:    fmlal2 v1.4s, v3.4h, v2.h[0]
+; CHECK-NEXT:    ld1r { v3.8h }, [x1], #2
+; CHECK-NEXT:    fmlal v0.4s, v2.4h, v3.4h
+; CHECK-NEXT:    fmlal2 v1.4s, v2.4h, v3.4h
 ; CHECK-NEXT:    b.ne .LBB1_1
 ; CHECK-NEXT:  // %bb.2: // %for.cond.cleanup
 ; CHECK-NEXT:    stp q0, q1, [x0]
