DAG: Handle load in SimplifyDemandedVectorElts #122671
Conversation
This stack of pull requests is managed by Graphite.
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-selectiondag

Author: Matt Arsenault (arsenm)

Changes

This improves some AMDGPU cases and avoids future regressions. The combiner likes to form shuffles for cases where an extract_vector_elt would do perfectly well, and this recovers some of the regressions from losing load narrowing.

AMDGPU, AArch64 and RISCV test changes look broadly better. Other targets have some improvements, but mostly regressions. In particular X86 looks much worse. I'm guessing this is because its shouldReduceLoadWidth is wrong.

I mostly just regenerated the checks. I assume some set of them should switch to use volatile loads to defeat the optimization.

Patch is 773.93 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/122671.diff

167 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index b1fb4947fb9451..0e6be878d38cb8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -3478,6 +3478,38 @@ bool TargetLowering::SimplifyDemandedVectorElts(
break;
}
+ case ISD::LOAD: {
+ auto *Ld = cast<LoadSDNode>(Op);
+ if (!ISD::isNormalLoad(Ld) || !Ld->isSimple())
+ break;
+
+ // TODO: Handle arbitrary vector extract for isMask
+ if (DemandedElts.popcount() != 1)
+ break;
+
+ EVT VT = Ld->getValueType(0);
+ if (TLO.LegalOperations() &&
+ !isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT,
+ VT /*, IsAfterLegalization*/))
+ break;
+
+ EVT EltVT = VT.getVectorElementType();
+ SDLoc DL(Ld);
+
+ unsigned Idx = DemandedElts.countTrailingZeros();
+
+ SDValue IdxVal = TLO.DAG.getVectorIdxConstant(Idx, DL);
+ SDValue Scalarized =
+ scalarizeExtractedVectorLoad(EltVT, DL, VT, IdxVal, Ld, TLO.DAG);
+ if (!Scalarized)
+ break;
+
+ TLO.DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Scalarized.getValue(1));
+
+ SDValue Insert = TLO.DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT,
+ TLO.DAG.getUNDEF(VT), Scalarized, IdxVal);
+ return TLO.CombineTo(Op, Insert);
+ }
case ISD::VECTOR_SHUFFLE: {
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
diff --git a/llvm/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll b/llvm/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll
index f5aa4c666a5681..e9a4a83a406838 100644
--- a/llvm/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll
@@ -30,7 +30,7 @@ define void @test_i64_v2f32(ptr %p, ptr %q) {
; CHECK: ld1 { v{{[0-9]+}}.2s }
; CHECK: rev64 v{{[0-9]+}}.2s
; CHECK: str
- %1 = load <2 x float>, ptr %p
+ %1 = load volatile <2 x float>, ptr %p
%2 = fadd <2 x float> %1, %1
%3 = bitcast <2 x float> %2 to i64
%4 = add i64 %3, %3
@@ -43,7 +43,7 @@ define void @test_i64_v2i32(ptr %p, ptr %q) {
; CHECK: ld1 { v{{[0-9]+}}.2s }
; CHECK: rev64 v{{[0-9]+}}.2s
; CHECK: str
- %1 = load <2 x i32>, ptr %p
+ %1 = load volatile <2 x i32>, ptr %p
%2 = add <2 x i32> %1, %1
%3 = bitcast <2 x i32> %2 to i64
%4 = add i64 %3, %3
@@ -121,7 +121,7 @@ define void @test_f64_v2f32(ptr %p, ptr %q) {
; CHECK: ld1 { v{{[0-9]+}}.2s }
; CHECK: rev64 v{{[0-9]+}}.2s
; CHECK: str
- %1 = load <2 x float>, ptr %p
+ %1 = load volatile <2 x float>, ptr %p
%2 = fadd <2 x float> %1, %1
%3 = bitcast <2 x float> %2 to double
%4 = fadd double %3, %3
@@ -134,7 +134,7 @@ define void @test_f64_v2i32(ptr %p, ptr %q) {
; CHECK: ld1 { v{{[0-9]+}}.2s }
; CHECK: rev64 v{{[0-9]+}}.2s
; CHECK: str
- %1 = load <2 x i32>, ptr %p
+ %1 = load volatile <2 x i32>, ptr %p
%2 = add <2 x i32> %1, %1
%3 = bitcast <2 x i32> %2 to double
%4 = fadd double %3, %3
@@ -213,7 +213,7 @@ define void @test_v1i64_v2f32(ptr %p, ptr %q) {
; CHECK: ld1 { v{{[0-9]+}}.2s }
; CHECK: rev64 v{{[0-9]+}}.2s
; CHECK: str
- %1 = load <2 x float>, ptr %p
+ %1 = load volatile <2 x float>, ptr %p
%2 = fadd <2 x float> %1, %1
%3 = bitcast <2 x float> %2 to <1 x i64>
%4 = add <1 x i64> %3, %3
@@ -226,7 +226,7 @@ define void @test_v1i64_v2i32(ptr %p, ptr %q) {
; CHECK: ld1 { v{{[0-9]+}}.2s }
; CHECK: rev64 v{{[0-9]+}}.2s
; CHECK: str
- %1 = load <2 x i32>, ptr %p
+ %1 = load volatile <2 x i32>, ptr %p
%2 = add <2 x i32> %1, %1
%3 = bitcast <2 x i32> %2 to <1 x i64>
%4 = add <1 x i64> %3, %3
@@ -318,7 +318,7 @@ define void @test_v2f32_v1i64(ptr %p, ptr %q) {
define void @test_v2f32_v2i32(ptr %p, ptr %q) {
; CHECK: ld1 { v{{[0-9]+}}.2s }
; CHECK: st1 { v{{[0-9]+}}.2s }
- %1 = load <2 x i32>, ptr %p
+ %1 = load volatile <2 x i32>, ptr %p
%2 = add <2 x i32> %1, %1
%3 = bitcast <2 x i32> %2 to <2 x float>
%4 = fadd <2 x float> %3, %3
@@ -410,7 +410,7 @@ define void @test_v2i32_v1i64(ptr %p, ptr %q) {
define void @test_v2i32_v2f32(ptr %p, ptr %q) {
; CHECK: ld1 { v{{[0-9]+}}.2s }
; CHECK: st1 { v{{[0-9]+}}.2s }
- %1 = load <2 x float>, ptr %p
+ %1 = load volatile <2 x float>, ptr %p
%2 = fadd <2 x float> %1, %1
%3 = bitcast <2 x float> %2 to <2 x i32>
%4 = add <2 x i32> %3, %3
@@ -488,7 +488,7 @@ define void @test_v4i16_v2f32(ptr %p, ptr %q) {
; CHECK: ld1 { v{{[0-9]+}}.2s }
; CHECK: rev32 v{{[0-9]+}}.4h
; CHECK: st1 { v{{[0-9]+}}.4h }
- %1 = load <2 x float>, ptr %p
+ %1 = load volatile <2 x float>, ptr %p
%2 = fadd <2 x float> %1, %1
%3 = bitcast <2 x float> %2 to <4 x i16>
%4 = add <4 x i16> %3, %3
@@ -501,7 +501,7 @@ define void @test_v4i16_v2i32(ptr %p, ptr %q) {
; CHECK: ld1 { v{{[0-9]+}}.2s }
; CHECK: rev32 v{{[0-9]+}}.4h
; CHECK: st1 { v{{[0-9]+}}.4h }
- %1 = load <2 x i32>, ptr %p
+ %1 = load volatile <2 x i32>, ptr %p
%2 = add <2 x i32> %1, %1
%3 = bitcast <2 x i32> %2 to <4 x i16>
%4 = add <4 x i16> %3, %3
@@ -587,7 +587,7 @@ define void @test_v4f16_v2f32(ptr %p, ptr %q) {
; CHECK: fadd
; CHECK-NOT: rev
; CHECK: st1 { v{{[0-9]+}}.4h }
- %1 = load <2 x float>, ptr %p
+ %1 = load volatile <2 x float>, ptr %p
%2 = fadd <2 x float> %1, %1
%3 = bitcast <2 x float> %2 to <4 x half>
%4 = fadd <4 x half> %3, %3
@@ -602,7 +602,7 @@ define void @test_v4f16_v2i32(ptr %p, ptr %q) {
; CHECK: fadd
; CHECK-NOT: rev
; CHECK: st1 { v{{[0-9]+}}.4h }
- %1 = load <2 x i32>, ptr %p
+ %1 = load volatile <2 x i32>, ptr %p
%2 = add <2 x i32> %1, %1
%3 = bitcast <2 x i32> %2 to <4 x half>
%4 = fadd <4 x half> %3, %3
@@ -682,7 +682,7 @@ define void @test_v8i8_v2f32(ptr %p, ptr %q) {
; CHECK: ld1 { v{{[0-9]+}}.2s }
; CHECK: rev32 v{{[0-9]+}}.8b
; CHECK: st1 { v{{[0-9]+}}.8b }
- %1 = load <2 x float>, ptr %p
+ %1 = load volatile <2 x float>, ptr %p
%2 = fadd <2 x float> %1, %1
%3 = bitcast <2 x float> %2 to <8 x i8>
%4 = add <8 x i8> %3, %3
@@ -695,7 +695,7 @@ define void @test_v8i8_v2i32(ptr %p, ptr %q) {
; CHECK: ld1 { v{{[0-9]+}}.2s }
; CHECK: rev32 v{{[0-9]+}}.8b
; CHECK: st1 { v{{[0-9]+}}.8b }
- %1 = load <2 x i32>, ptr %p
+ %1 = load volatile <2 x i32>, ptr %p
%2 = add <2 x i32> %1, %1
%3 = bitcast <2 x i32> %2 to <8 x i8>
%4 = add <8 x i8> %3, %3
@@ -721,7 +721,7 @@ define void @test_f128_v2f64(ptr %p, ptr %q) {
; CHECK: ld1 { v{{[0-9]+}}.2d }
; CHECK: ext
; CHECK: str
- %1 = load <2 x double>, ptr %p
+ %1 = load volatile <2 x double>, ptr %p
%2 = fadd <2 x double> %1, %1
%3 = bitcast <2 x double> %2 to fp128
%4 = fadd fp128 %3, %3
@@ -734,7 +734,7 @@ define void @test_f128_v2i64(ptr %p, ptr %q) {
; CHECK: ld1 { v{{[0-9]+}}.2d }
; CHECK: ext
; CHECK: str
- %1 = load <2 x i64>, ptr %p
+ %1 = load volatile <2 x i64>, ptr %p
%2 = add <2 x i64> %1, %1
%3 = bitcast <2 x i64> %2 to fp128
%4 = fadd fp128 %3, %3
@@ -816,7 +816,7 @@ define void @test_v2f64_f128(ptr %p, ptr %q) {
define void @test_v2f64_v2i64(ptr %p, ptr %q) {
; CHECK: ld1 { v{{[0-9]+}}.2d }
; CHECK: st1 { v{{[0-9]+}}.2d }
- %1 = load <2 x i64>, ptr %p
+ %1 = load volatile <2 x i64>, ptr %p
%2 = add <2 x i64> %1, %1
%3 = bitcast <2 x i64> %2 to <2 x double>
%4 = fadd <2 x double> %3, %3
@@ -895,7 +895,7 @@ define void @test_v2i64_f128(ptr %p, ptr %q) {
define void @test_v2i64_v2f64(ptr %p, ptr %q) {
; CHECK: ld1 { v{{[0-9]+}}.2d }
; CHECK: st1 { v{{[0-9]+}}.2d }
- %1 = load <2 x double>, ptr %p
+ %1 = load volatile <2 x double>, ptr %p
%2 = fadd <2 x double> %1, %1
%3 = bitcast <2 x double> %2 to <2 x i64>
%4 = add <2 x i64> %3, %3
@@ -979,7 +979,7 @@ define void @test_v4f32_v2f64(ptr %p, ptr %q) {
; CHECK: rev64 v{{[0-9]+}}.4s
; CHECK-NOT: rev
; CHECK: st1 { v{{[0-9]+}}.4s }
- %1 = load <2 x double>, ptr %p
+ %1 = load volatile <2 x double>, ptr %p
%2 = fadd <2 x double> %1, %1
%3 = bitcast <2 x double> %2 to <4 x float>
%4 = fadd <4 x float> %3, %3
@@ -994,7 +994,7 @@ define void @test_v4f32_v2i64(ptr %p, ptr %q) {
; CHECK: fadd
; CHECK-NOT: rev
; CHECK: st1 { v{{[0-9]+}}.4s }
- %1 = load <2 x i64>, ptr %p
+ %1 = load volatile <2 x i64>, ptr %p
%2 = add <2 x i64> %1, %1
%3 = bitcast <2 x i64> %2 to <4 x float>
%4 = fadd <4 x float> %3, %3
@@ -1062,7 +1062,7 @@ define void @test_v4i32_v2f64(ptr %p, ptr %q) {
; CHECK: ld1 { v{{[0-9]+}}.2d }
; CHECK: rev64 v{{[0-9]+}}.4s
; CHECK: st1 { v{{[0-9]+}}.4s }
- %1 = load <2 x double>, ptr %p
+ %1 = load volatile <2 x double>, ptr %p
%2 = fadd <2 x double> %1, %1
%3 = bitcast <2 x double> %2 to <4 x i32>
%4 = add <4 x i32> %3, %3
@@ -1075,7 +1075,7 @@ define void @test_v4i32_v2i64(ptr %p, ptr %q) {
; CHECK: ld1 { v{{[0-9]+}}.2d }
; CHECK: rev64 v{{[0-9]+}}.4s
; CHECK: st1 { v{{[0-9]+}}.4s }
- %1 = load <2 x i64>, ptr %p
+ %1 = load volatile <2 x i64>, ptr %p
%2 = add <2 x i64> %1, %1
%3 = bitcast <2 x i64> %2 to <4 x i32>
%4 = add <4 x i32> %3, %3
@@ -1141,7 +1141,7 @@ define void @test_v8i16_v2f64(ptr %p, ptr %q) {
; CHECK: ld1 { v{{[0-9]+}}.2d }
; CHECK: rev64 v{{[0-9]+}}.8h
; CHECK: st1 { v{{[0-9]+}}.8h }
- %1 = load <2 x double>, ptr %p
+ %1 = load volatile <2 x double>, ptr %p
%2 = fadd <2 x double> %1, %1
%3 = bitcast <2 x double> %2 to <8 x i16>
%4 = add <8 x i16> %3, %3
@@ -1154,7 +1154,7 @@ define void @test_v8i16_v2i64(ptr %p, ptr %q) {
; CHECK: ld1 { v{{[0-9]+}}.2d }
; CHECK: rev64 v{{[0-9]+}}.8h
; CHECK: st1 { v{{[0-9]+}}.8h }
- %1 = load <2 x i64>, ptr %p
+ %1 = load volatile <2 x i64>, ptr %p
%2 = add <2 x i64> %1, %1
%3 = bitcast <2 x i64> %2 to <8 x i16>
%4 = add <8 x i16> %3, %3
@@ -1234,7 +1234,7 @@ define void @test_v16i8_v2f64(ptr %p, ptr %q) {
; CHECK: ld1 { v{{[0-9]+}}.2d }
; CHECK: rev64 v{{[0-9]+}}.16b
; CHECK: st1 { v{{[0-9]+}}.16b }
- %1 = load <2 x double>, ptr %p
+ %1 = load volatile <2 x double>, ptr %p
%2 = fadd <2 x double> %1, %1
%3 = bitcast <2 x double> %2 to <16 x i8>
%4 = add <16 x i8> %3, %3
@@ -1247,7 +1247,7 @@ define void @test_v16i8_v2i64(ptr %p, ptr %q) {
; CHECK: ld1 { v{{[0-9]+}}.2d }
; CHECK: rev64 v{{[0-9]+}}.16b
; CHECK: st1 { v{{[0-9]+}}.16b }
- %1 = load <2 x i64>, ptr %p
+ %1 = load volatile <2 x i64>, ptr %p
%2 = add <2 x i64> %1, %1
%3 = bitcast <2 x i64> %2 to <16 x i8>
%4 = add <16 x i8> %3, %3
@@ -1315,7 +1315,7 @@ define %struct.struct1 @test_v4f16_struct(ptr %ret) {
entry:
; CHECK: ld1 { {{v[0-9]+}}.4h }
; CHECK-NOT: rev
- %0 = load <4 x half>, ptr %ret, align 2
+ %0 = load volatile <4 x half>, ptr %ret, align 2
%1 = extractelement <4 x half> %0, i32 0
%.fca.0.insert = insertvalue %struct.struct1 undef, half %1, 0
ret %struct.struct1 %.fca.0.insert
diff --git a/llvm/test/CodeGen/AArch64/dag-ReplaceAllUsesOfValuesWith.ll b/llvm/test/CodeGen/AArch64/dag-ReplaceAllUsesOfValuesWith.ll
index d76e817e62a495..ce657aa1f0b5bc 100644
--- a/llvm/test/CodeGen/AArch64/dag-ReplaceAllUsesOfValuesWith.ll
+++ b/llvm/test/CodeGen/AArch64/dag-ReplaceAllUsesOfValuesWith.ll
@@ -27,10 +27,7 @@
define i64 @g(ptr %p) {
; CHECK-LABEL: g:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr x8, [x0, #8]
-; CHECK-NEXT: add x9, x8, x8
-; CHECK-NEXT: add x8, x9, x8
-; CHECK-NEXT: sub x0, x8, x8
+; CHECK-NEXT: mov x0, xzr
; CHECK-NEXT: ret
%vec = load <2 x i64>, ptr %p, align 1
%elt = extractelement <2 x i64> %vec, i32 1
diff --git a/llvm/test/CodeGen/AArch64/fcmp.ll b/llvm/test/CodeGen/AArch64/fcmp.ll
index 66f26fc9d85973..d39e537edb7861 100644
--- a/llvm/test/CodeGen/AArch64/fcmp.ll
+++ b/llvm/test/CodeGen/AArch64/fcmp.ll
@@ -679,28 +679,27 @@ define <3 x double> @v3f128_double(<3 x fp128> %a, <3 x fp128> %b, <3 x double>
; CHECK-SD-NEXT: .cfi_def_cfa_offset 160
; CHECK-SD-NEXT: .cfi_offset w30, -16
; CHECK-SD-NEXT: stp q2, q5, [sp, #112] // 32-byte Folded Spill
+; CHECK-SD-NEXT: add x8, sp, #176
; CHECK-SD-NEXT: // kill: def $d6 killed $d6 def $q6
; CHECK-SD-NEXT: // kill: def $d7 killed $d7 def $q7
-; CHECK-SD-NEXT: ldr d5, [sp, #184]
-; CHECK-SD-NEXT: str q3, [sp, #64] // 16-byte Folded Spill
-; CHECK-SD-NEXT: ldp d3, d2, [sp, #168]
+; CHECK-SD-NEXT: str q3, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT: ldp d3, d2, [sp, #160]
; CHECK-SD-NEXT: mov v6.d[1], v7.d[0]
; CHECK-SD-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
; CHECK-SD-NEXT: mov v0.16b, v1.16b
; CHECK-SD-NEXT: mov v1.16b, v4.16b
-; CHECK-SD-NEXT: str q5, [sp, #96] // 16-byte Folded Spill
-; CHECK-SD-NEXT: ldr d5, [sp, #160]
-; CHECK-SD-NEXT: mov v3.d[1], v2.d[0]
-; CHECK-SD-NEXT: str q5, [sp, #80] // 16-byte Folded Spill
-; CHECK-SD-NEXT: stp q6, q3, [sp, #32] // 32-byte Folded Spill
+; CHECK-SD-NEXT: ld1 { v2.d }[1], [x8]
+; CHECK-SD-NEXT: stp q6, q3, [sp, #80] // 32-byte Folded Spill
+; CHECK-SD-NEXT: str q2, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-NEXT: ldr d2, [sp, #184]
+; CHECK-SD-NEXT: str q2, [sp, #64] // 16-byte Folded Spill
; CHECK-SD-NEXT: bl __lttf2
; CHECK-SD-NEXT: cmp w0, #0
-; CHECK-SD-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload
; CHECK-SD-NEXT: cset w8, lt
; CHECK-SD-NEXT: sbfx x8, x8, #0, #1
; CHECK-SD-NEXT: fmov d0, x8
; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill
-; CHECK-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT: ldp q0, q1, [sp, #16] // 32-byte Folded Reload
; CHECK-SD-NEXT: bl __lttf2
; CHECK-SD-NEXT: cmp w0, #0
; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload
@@ -708,19 +707,19 @@ define <3 x double> @v3f128_double(<3 x fp128> %a, <3 x fp128> %b, <3 x double>
; CHECK-SD-NEXT: sbfx x8, x8, #0, #1
; CHECK-SD-NEXT: fmov d1, x8
; CHECK-SD-NEXT: mov v1.d[1], v0.d[0]
-; CHECK-SD-NEXT: str q1, [sp, #64] // 16-byte Folded Spill
+; CHECK-SD-NEXT: str q1, [sp, #32] // 16-byte Folded Spill
; CHECK-SD-NEXT: ldp q0, q1, [sp, #112] // 32-byte Folded Reload
; CHECK-SD-NEXT: bl __lttf2
-; CHECK-SD-NEXT: ldp q1, q0, [sp, #32] // 32-byte Folded Reload
+; CHECK-SD-NEXT: ldp q0, q3, [sp, #80] // 32-byte Folded Reload
; CHECK-SD-NEXT: cmp w0, #0
-; CHECK-SD-NEXT: ldp q2, q4, [sp, #64] // 32-byte Folded Reload
+; CHECK-SD-NEXT: ldp q2, q1, [sp, #32] // 32-byte Folded Reload
; CHECK-SD-NEXT: cset w8, lt
; CHECK-SD-NEXT: sbfx x8, x8, #0, #1
-; CHECK-SD-NEXT: ldr q3, [sp, #96] // 16-byte Folded Reload
+; CHECK-SD-NEXT: ldr q4, [sp, #64] // 16-byte Folded Reload
; CHECK-SD-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload
-; CHECK-SD-NEXT: bit v0.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT: bif v0.16b, v1.16b, v2.16b
; CHECK-SD-NEXT: fmov d2, x8
-; CHECK-SD-NEXT: bsl v2.16b, v4.16b, v3.16b
+; CHECK-SD-NEXT: bsl v2.16b, v3.16b, v4.16b
; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q2
@@ -815,20 +814,20 @@ define <3 x double> @v3f64_double(<3 x double> %a, <3 x double> %b, <3 x double>
; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-SD-NEXT: // kill: def $d6 killed $d6 def $q6
; CHECK-SD-NEXT: // kill: def $d7 killed $d7 def $q7
+; CHECK-SD-NEXT: add x8, sp, #16
; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-SD-NEXT: // kill: def $d5 killed $d5 def $q5
-; CHECK-SD-NEXT: ldr d16, [sp, #24]
-; CHECK-SD-NEXT: ldr d17, [sp]
; CHECK-SD-NEXT: mov v3.d[1], v4.d[0]
; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
; CHECK-SD-NEXT: mov v6.d[1], v7.d[0]
-; CHECK-SD-NEXT: ldp d1, d4, [sp, #8]
; CHECK-SD-NEXT: fcmgt v2.2d, v5.2d, v2.2d
-; CHECK-SD-NEXT: mov v1.d[1], v4.d[0]
; CHECK-SD-NEXT: fcmgt v0.2d, v3.2d, v0.2d
-; CHECK-SD-NEXT: bsl v2.16b, v17.16b, v16.16b
-; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q2
+; CHECK-SD-NEXT: ldp d3, d1, [sp]
+; CHECK-SD-NEXT: ld1 { v1.d }[1], [x8]
; CHECK-SD-NEXT: bsl v0.16b, v6.16b, v1.16b
+; CHECK-SD-NEXT: ldr d1, [sp, #24]
+; CHECK-SD-NEXT: bsl v2.16b, v3.16b, v1.16b
+; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q2
; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1
diff --git a/llvm/test/CodeGen/AArch64/fmlal-loreg.ll b/llvm/test/CodeGen/AArch64/fmlal-loreg.ll
index 31ead890ba8ac7..ed22243eeef45f 100644
--- a/llvm/test/CodeGen/AArch64/fmlal-loreg.ll
+++ b/llvm/test/CodeGen/AArch64/fmlal-loreg.ll
@@ -45,11 +45,11 @@ define void @loop(ptr %out_tile, ptr %lhs_panel, ptr %rhs_panel, i32 noundef %K,
; CHECK-NEXT: mov w8, w3
; CHECK-NEXT: .LBB1_1: // %for.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldr q2, [x1], #2
+; CHECK-NEXT: ldr q2, [x2], #2
; CHECK-NEXT: subs x8, x8, #1
-; CHECK-NEXT: ldr q3, [x2], #2
-; CHECK-NEXT: fmlal v0.4s, v3.4h, v2.h[0]
-; CHECK-NEXT: fmlal2 v1.4s, v3.4h, v2.h[0]
+; CHECK-NEXT: ld1r { v3.8h }, [x1], #2
+; CHECK-NEXT: fmlal v0.4s, v2.4h, v3.4h
+; CHECK-NEXT: fmlal2 v1.4s, v2.4h, v3.4h
; CHECK-NEXT: b.ne .LBB1_1
; CHECK-NEXT: // %bb.2: // %for.cond.cleanup
; CHECK-NEXT: stp q0, q1, [x0]
diff --git a/llvm/test/CodeGen/AArch64/icmp.ll b/llvm/test/CodeGen/AArch64/icmp.ll
index e284795760c5ca..f586647439d255 100644
--- a/llvm/test/CodeGen/AArch64/icmp.ll
+++ b/llvm/test/CodeGen/AArch64/icmp.ll
@@ -1123,30 +1123,29 @@ entry:
define <3 x i64> @v3i64_i64(<3 x i64> %a, <3 x i64> %b, <3 x i64> %d, <3 x i64> %e) {
; CHECK-SD-LABEL: v3i64_i64:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: // kill: def $d4 killed $d4 def $q4
; CHECK-SD-NEXT: // kill: def $d3 killed $d3 def $q3
-; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: // kill: def $d4 killed $d4 def $q4
+; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-SD-NEXT: // kill: def $d6 killed $d6 def $q6
; CHECK-SD-NEXT: // kill: def $d7 killed $d7 def $q7
+; CHECK-SD-NEXT: add x8, sp, #16
; CHECK-SD-NEXT: // kill: def $d5 killed $d5 def $q5
; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2
-; CHECK-SD-NEXT: ldr d16, [sp, #24]
-; CHECK-SD-NEXT: ldr d17, [sp]
; CHECK-SD-NEXT: mov v3.d[1], v4.d[0]
; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
; CHECK-SD-NEXT: mov v6.d[1], v7.d[0]
-; CHECK-SD-NEXT: ldp d1, d4, [sp, #8]
-; CHECK-SD-NEXT: mov v1.d[1], v4.d[0]
+; CHECK-SD-NEXT: ldp d4, d1, [sp]
+; CHECK-SD-NEXT: ld1 { v1.d }[1], [x8]
; CHECK-SD-NEXT: cmgt v0.2d, v3.2d, v0.2d
; CHECK-SD-NEXT: bsl v0.16b, v6.16b, v1.16b
; CHECK-SD-NEXT: cmgt v1.2d, v5.2d, v2.2d
-; CHECK-SD-NEXT: mov v2.16b, v1.16b
+; CHECK-SD-NEXT: ldr d2, [sp, #24]
+; CHECK-SD-NEXT: bit v2.16b, v4.16b, v1.16b
; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1
-; CHECK-SD-NEXT: bsl v2.16b, v17.16b, v16.16b
; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q2
+; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v3i64_i64:
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll
index ad4efeaf39247a..1e6427c4cd4956 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll
@@ -33,10 +33,7 @@ define half @extractelement_v8f16(<8 x half> %op1) vscale_range(2,0) #0 {
define half @extractelement_v16f16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: extractelement_v16f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h, vl16
-; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
-; CHECK-NEXT: mov z0.h, z0.h[15]
-; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT: ldr h0, [x0, #30]
; CHECK-NEXT: ret
%op1 = load <16 x half>, ptr %a
...
[truncated]
X86TargetLowering::shouldReduceLoadWidth is a mess, resulting in a lot of duplicate aliased loads that make very little sense - we're seeing something similar on #122485, but it might take some time to unravel.
These test cases weren't trying to test load+extract. I believe they only used loads because fixed vector arguments weren't supported when they were written or they were copied from the structure of other tests that pre-date fixed vector argument support. Reduces diff from llvm#122671.
; CHECK-NEXT: vslidedown.vi v8, v8, 2
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
; RV32-LABEL: extractelt_v4i32:
I think most of these tests were only using loads because fixed vector arguments weren't supported when the test was written.
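For illustration, a minimal sketch of the kind of rewrite being described (hypothetical, not the actual updated test): taking the fixed vector as an argument removes the load entirely, so the test only exercises the extract lowering.

; Hypothetical test shape: the vector arrives as an argument, so no load is involved.
define i32 @extractelt_v4i32(<4 x i32> %v) {
  %e = extractelement <4 x i32> %v, i32 2
  ret i32 %e
}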
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
; RV32-LABEL: vreduce_add_v1i32:
Same story as fixed-vectors-extract.ll. This test wasn't interested in loads.
; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: vfmv.f.s fa5, v8
; CHECK-NEXT: flh fa5, 0(a0)
Same story as fixed-vectors-extract.ll. This test wasn't interested in loads.
…. NFC These tests weren't interested in the loads. Removing them reduces the diffs from llvm#122671.
These test cases weren't trying to test load+extract. I believe they only used loads because fixed vector arguments weren't supported when they were written or they weren't copied from the structure of other tests that pre-date fixed vector argument support. Reduces diff from #122671.
@arsenm Please can you rebase this and then I'll see what I can do to help with the x86 regressions.
This improves some AMDGPU cases and avoids future regressions. The combiner likes to form shuffles for cases where an extract_vector_elt would do perfectly well, and this recovers some of the regressions from losing load narrowing. AMDGPU, AArch64 and RISCV test changes look broadly better. Other targets have some improvements, but mostly regressions. In particular X86 looks much worse. I'm guessing this is because its shouldReduceLoadWidth is wrong. I mostly just regenerated the checks. I assume some set of them should switch to use volatile loads to defeat the optimization.
I'm still looking at the x86 mess - but something I've hit is that the hasOneUse() checks on the shouldReduceLoadWidth callback are often getting confused by extra uses of the load node's chain - is there anything we can do to clean that up? (See also #126764)
SDNode::hasOneUse checks are rarely the correct thing over SDValue::hasOneUse. We should probably rename the SDNode version; it's too easy to mix up N->hasOneUse vs. N.hasOneUse.
…ded value, not the chain etc. The hasOneUse check was failing in any case where the load was part of a chain - we should only be checking if the loaded value has one use, and any updates to the chain should be handled by the fold calling shouldReduceLoadWidth. I've updated the x86 implementation to match, although it has no effect here yet (I'm still looking at how to improve the x86 implementation) as the inner for loop was discarding chain uses anyway. By using hasNUsesOfValue instead this patch exposes a missing dependency on the LLVMSelectionDAG library in a lot of tools + unittests; we can either update the CMakeLists.txt dependencies or make SDNode::hasNUsesOfValue inline - no strong opinions on this tbh. Noticed while fighting the x86 regressions in llvm#122671
…ded value, not the chain etc. The hasOneUse check was failing in any case where the load was part of a chain - we should only be checking if the loaded value has one use, and any updates to the chain should be handled by the fold calling shouldReduceLoadWidth. I've updated the x86 implementation to match, although it has no effect here yet (I'm still looking at how to improve the x86 implementation) as the inner for loop was discarding chain uses anyway. By using SDValue::hasOneUse instead this patch exposes a missing dependency on the LLVMSelectionDAG library in a lot of tools + unittests, which resulted in having to make SDNode::hasNUsesOfValue inline. Noticed while fighting the x86 regressions in llvm#122671
…value - not the chain (#128167) The hasOneUse check was failing in any case where the load was part of a chain - we should only be checking if the loaded value has one use, and any updates to the chain should be handled by the fold calling shouldReduceLoadWidth. I've updated the x86 implementation to match, although it has no effect here yet (I'm still looking at how to improve the x86 implementation) as the inner for loop was discarding chain uses anyway. By using SDValue::hasOneUse instead this patch exposes a missing dependency on the LLVMSelectionDAG library in a lot of tools + unittests, which resulted in having to make SDNode::hasNUsesOfValue inline. Noticed while fighting the x86 regressions in #122671
…st(p0) if either load is oneuse This fold is currently limited to cases where the load_subv(p0) has oneuse, but it's beneficial if either load has oneuse and will be replaced. Yet another yak shave for llvm#122671
…st(p0) if either load is oneuse (llvm#128857) This fold is currently limited to cases where the load_subv(p0) has oneuse, but it's beneficial if either load has oneuse and will be replaced. Yet another yak shave for llvm#122671
This improves some AMDGPU cases and avoids future regressions.
The combiner likes to form shuffles for cases where an extract_vector_elt
would do perfectly well, and this recovers some of the regressions from
losing load narrowing.
AMDGPU, AArch64 and RISCV test changes look broadly better. Other targets have
some improvements, but mostly regressions. In particular X86 looks much
worse. I'm guessing this is because its shouldReduceLoadWidth is wrong.
I mostly just regenerated the checks. I assume some set of them should
switch to use volatile loads to defeat the optimization.
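For reference, a minimal IR sketch of the pattern this combine targets (an assumed example mirroring the dag-ReplaceAllUsesOfValuesWith.ll test above; the function name is illustrative): only one lane of the loaded vector is demanded, so the backend can emit a single scalar load instead of the full vector load.

; Only element 1 of the <2 x i64> load is demanded, so the patch lets the DAG
; replace the vector load with a scalar load of that lane (inserted back into
; an undef vector, which the extract then folds away).
define i64 @load_one_demanded_lane(ptr %p) {
  %vec = load <2 x i64>, ptr %p, align 1
  %elt = extractelement <2 x i64> %vec, i32 1
  ret i64 %elt
}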